Many open source projects have moved to using SPDX identifiers to specify license information, reducing the amount of boilerplate code in every source file. This patch replaces the bulk of SPDK .c, .cpp and Makefiles with the BSD-3-Clause identifier. Almost all of these files share the exact same license text, and this patch only modifies the files that contain the most common license text. There can be slight variations because the third clause contains company names - most say "Intel Corporation", but there are instances for Nvidia, Samsung, Eideticom and even "the copyright holder". Used a bash script to automate replacement of the license text with SPDX identifier which is checked into scripts/spdx.sh. Signed-off-by: Jim Harris <james.r.harris@intel.com> Change-Id: Iaa88ab5e92ea471691dc298cfe41ebfb5d169780 Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12904 Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com> Community-CI: Mellanox Build Bot Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com> Reviewed-by: Changpeng Liu <changpeng.liu@intel.com> Reviewed-by: Dong Yi <dongx.yi@intel.com> Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com> Reviewed-by: Paul Luse <paul.e.luse@intel.com> Reviewed-by: <qun.wan@intel.com>
298 lines
5.5 KiB
C
298 lines
5.5 KiB
C
/* SPDX-License-Identifier: BSD-3-Clause
|
|
* Copyright (c) Intel Corporation.
|
|
* All rights reserved.
|
|
*/
|
|
|
|
#ifndef SPDK_UTF_H_
|
|
#define SPDK_UTF_H_
|
|
|
|
#include "spdk/stdinc.h"
|
|
|
|
#include "spdk/endian.h"
|
|
#include "spdk/likely.h"
|
|
#include "spdk/string.h"
|
|
|
|
/*
 * Test whether a byte is a UTF-8 continuation ("tail") byte.
 *
 * Tail bytes lie in 0x80..0xBF, i.e. their two most significant bits are 10
 * (binary 10xxxxxx).
 *
 * \return true if c is a tail byte, false otherwise.
 */
static inline bool
utf8_tail(uint8_t c)
{
	/* Only the top two bits matter: they must be exactly 0b10. */
	return (c >> 6) == 0x2;
}
|
|
|
|
/*
 * Validate the UTF-8 encoding of the single codepoint that begins at start.
 *
 * Only bytes in [start, end) are examined; end itself is never dereferenced.
 * Overlong forms, encoded UTF-16 surrogates (U+D800..U+DFFF) and lead bytes
 * above F4 are rejected.
 *
 * \return Length in bytes (1..4) of the valid sequence, 0 if start == end,
 * or negative if the sequence is malformed or truncated.
 */
static inline int
utf8_valid(const uint8_t *start, const uint8_t *end)
{
	const uint8_t *cur = start;
	uint8_t lead, second, third, fourth;
	/* Allowed range of the second byte; narrowed below for E0, ED, F0 and F4. */
	uint8_t second_min = 0x80;
	uint8_t second_max = 0xBF;

	if (cur == end) {
		return 0;
	}

	lead = *cur;

	if (lead < 0x80) {
		/* 00..7F: single-byte (ASCII) */
		return 1;
	}

	if (lead < 0xC2) {
		/* 80..BF are tail bytes; C0 and C1 are never valid start bytes */
		return -1;
	}

	if (++cur == end) {
		/* Truncated: second byte missing */
		return -1;
	}
	second = *cur;

	if (lead < 0xE0) {
		/* C2..DF 80..BF */
		return utf8_tail(second) ? 2 : -1;
	}

	if (++cur == end) {
		/* Truncated: third byte missing */
		return -1;
	}
	third = *cur;

	if (lead < 0xF0) {
		if (lead == 0xE0) {
			/* E0 A0..BF 80..BF */
			second_min = 0xA0;
		} else if (lead == 0xED) {
			/*
			 * ED 80..9F 80..BF; ED A0..BF would encode the UTF-16 surrogates
			 * U+D800..U+DFFF, which are not allowed in UTF-8.
			 */
			second_max = 0x9F;
		}
		/* Everything else in E1..EF takes the full 80..BF range. */

		if (second < second_min || second > second_max || !utf8_tail(third)) {
			return -1;
		}
		return 3;
	}

	if (++cur == end) {
		/* Truncated: fourth byte missing */
		return -1;
	}
	fourth = *cur;

	if (lead > 0xF4) {
		/* F5..FF never start a valid sequence */
		return -1;
	}

	if (lead == 0xF0) {
		/* F0 90..BF 80..BF 80..BF */
		second_min = 0x90;
	} else if (lead == 0xF4) {
		/* F4 80..8F 80..BF 80..BF */
		second_max = 0x8F;
	}
	/* F1..F3 take the full 80..BF range. */

	if (second < second_min || second > second_max ||
	    !utf8_tail(third) || !utf8_tail(fourth)) {
		return -1;
	}

	return 4;
}
|
|
|
|
/*
 * Decode a 1-byte UTF-8 sequence.
 *
 * Performs no validation; data must point to at least 1 readable byte.
 *
 * \return The byte value widened to a codepoint.
 */
static inline uint32_t
utf8_decode_unsafe_1(const uint8_t *data)
{
	return (uint32_t)*data;
}
|
|
|
|
/*
 * Decode a 2-byte UTF-8 sequence.
 *
 * Performs no validation; data must point to at least 2 readable bytes.
 *
 * \return Codepoint assembled from the low 5 bits of data[0] and the low
 * 6 bits of data[1].
 */
static inline uint32_t
utf8_decode_unsafe_2(const uint8_t *data)
{
	const uint32_t lead_bits = data[0] & 0x1F;
	const uint32_t tail_bits = data[1] & 0x3F;

	return (lead_bits << 6) | tail_bits;
}
|
|
|
|
/*
 * Decode a 3-byte UTF-8 sequence.
 *
 * Performs no validation; data must point to at least 3 readable bytes.
 *
 * \return Codepoint assembled from the low 4 bits of data[0] and the low
 * 6 bits of each of data[1] and data[2].
 */
static inline uint32_t
utf8_decode_unsafe_3(const uint8_t *data)
{
	const uint32_t lead_bits = data[0] & 0x0F;
	const uint32_t mid_bits = data[1] & 0x3F;
	const uint32_t low_bits = data[2] & 0x3F;

	return (lead_bits << 12) | (mid_bits << 6) | low_bits;
}
|
|
|
|
/*
 * Decode a 4-byte UTF-8 sequence.
 *
 * Performs no validation; data must point to at least 4 readable bytes.
 *
 * \return Codepoint assembled from the low 3 bits of data[0] and the low
 * 6 bits of each of data[1], data[2] and data[3].
 */
static inline uint32_t
utf8_decode_unsafe_4(const uint8_t *data)
{
	const uint32_t lead_bits = data[0] & 0x07;
	const uint32_t hi_bits = data[1] & 0x3F;
	const uint32_t mid_bits = data[2] & 0x3F;
	const uint32_t low_bits = data[3] & 0x3F;

	return (lead_bits << 18) | (hi_bits << 12) | (mid_bits << 6) | low_bits;
}
|
|
|
|
/*
 * Write the UTF-8 encoding of one Unicode codepoint into buf.
 *
 * No bounds checking is done on buf (hence unsafe): the caller must provide
 * room for at least 4 bytes. buf is left untouched when encoding fails.
 *
 * \return Number of bytes written to buf (1..4), or negative if c is a UTF-16
 * surrogate (U+D800..U+DFFF) or lies above U+10FFFF.
 */
static inline int
utf8_encode_unsafe(uint8_t *buf, uint32_t c)
{
	/* Reject everything that has no UTF-8 representation up front. */
	if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
		return -1;
	}

	if (c < 0x80) {
		buf[0] = (uint8_t)c;
		return 1;
	}

	if (c < 0x800) {
		buf[0] = (uint8_t)(0xC0 | (c >> 6));
		buf[1] = (uint8_t)(0x80 | (c & 0x3F));
		return 2;
	}

	if (c < 0x10000) {
		buf[0] = (uint8_t)(0xE0 | (c >> 12));
		buf[1] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
		buf[2] = (uint8_t)(0x80 | (c & 0x3F));
		return 3;
	}

	buf[0] = (uint8_t)(0xF0 | (c >> 18));
	buf[1] = (uint8_t)(0x80 | ((c >> 12) & 0x3F));
	buf[2] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
	buf[3] = (uint8_t)(0x80 | (c & 0x3F));
	return 4;
}
|
|
|
|
/*
 * Compute how many bytes the UTF-8 encoding of a codepoint occupies.
 *
 * \return Encoded length in bytes (1..4), or negative if c is a UTF-16
 * surrogate (U+D800..U+DFFF) or lies above U+10FFFF.
 */
static inline int
utf8_codepoint_len(uint32_t c)
{
	/* Codepoints with no UTF-8 representation. */
	if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
		return -1;
	}

	if (c < 0x80) {
		return 1;
	}

	if (c < 0x800) {
		return 2;
	}

	return (c < 0x10000) ? 3 : 4;
}
|
|
|
|
/*
 * Test whether a value is a UTF-16 high (leading) surrogate, 0xD800..0xDBFF.
 *
 * \return true if val is in the high surrogate range, false otherwise.
 */
static inline bool
utf16_valid_surrogate_high(uint32_t val)
{
	/* The range is exactly the values whose bits above the low 10 equal 0xD800. */
	return (val & 0xFFFFFC00u) == 0xD800u;
}
|
|
|
|
/*
 * Test whether a value is a UTF-16 low (trailing) surrogate, 0xDC00..0xDFFF.
 *
 * \return true if val is in the low surrogate range, false otherwise.
 */
static inline bool
utf16_valid_surrogate_low(uint32_t val)
{
	/* The range is exactly the values whose bits above the low 10 equal 0xDC00. */
	return (val & 0xFFFFFC00u) == 0xDC00u;
}
|
|
|
|
/*
 * Validate the UTF-16LE encoding of the single codepoint that begins at start.
 *
 * Only code units in [start, end) are examined; each one is read through
 * from_le16().
 *
 * \return Length of the valid sequence in 16-bit code units (1 or 2), 0 if
 * start == end, or negative if the sequence is invalid or truncated.
 */
static inline int
utf16le_valid(const uint16_t *start, const uint16_t *end)
{
	const uint16_t *cur = start;
	uint16_t first, second;

	if (cur == end) {
		return 0;
	}

	first = from_le16(cur);

	if (first < 0xD800 || first > 0xDFFF) {
		/* Not a surrogate: a single code unit in the BMP */
		return 1;
	}

	if (first > 0xDBFF) {
		/* A sequence may not begin with a low surrogate */
		return -1;
	}

	/* Only D800..DBFF can reach this point. */
	assert(utf16_valid_surrogate_high(first));

	cur++;
	if (cur == end) {
		/* High surrogate with no code unit following it */
		return -1;
	}

	second = from_le16(cur);

	/* A high surrogate must be completed by a low surrogate. */
	return utf16_valid_surrogate_low(second) ? 2 : -1;
}
|
|
|
|
/*
 * Combine a UTF-16 surrogate pair into the codepoint it represents.
 *
 * high and low are expected to be a valid high and low surrogate; this is
 * checked with assert() only.
 *
 * \return 0x10000 plus the 20-bit value formed from the low 10 bits of high
 * (upper half) and the low 10 bits of low (lower half).
 */
static inline uint32_t
utf16_decode_surrogate_pair(uint32_t high, uint32_t low)
{
	assert(utf16_valid_surrogate_high(high));
	assert(utf16_valid_surrogate_low(low));

	return 0x10000 + (((high & 0x3FF) << 10) | (low & 0x3FF));
}
|
|
|
|
/*
 * Split a codepoint in U+10000..U+10FFFF into its UTF-16 surrogate pair.
 *
 * The codepoint range is checked with assert() only. The high surrogate is
 * stored in *high and the low surrogate in *low.
 */
static inline void
utf16_encode_surrogate_pair(uint32_t codepoint, uint16_t *high, uint16_t *low)
{
	uint32_t offset;

	assert(codepoint >= 0x10000);
	assert(codepoint <= 0x10FFFF);

	/* 20-bit offset into the supplementary planes: upper 10 bits go to high, lower 10 to low. */
	offset = codepoint - 0x10000;

	*high = (uint16_t)(0xD800 | (offset >> 10));
	*low = (uint16_t)(0xDC00 | (offset & 0x3FF));

	assert(utf16_valid_surrogate_high(*high));
	assert(utf16_valid_surrogate_low(*low));
}
|
|
|
|
#endif
|