per Intel policy to include file commit date using git cmd below. The policy does not apply to non-Intel (C) notices. git log --follow -C90% --format=%ad --date default <file> | tail -1 and then pull just the 4 digit year from the result. Intel copyrights were not added to files where Intel either had no contribution or the contribution lacked substance (i.e. license header updates, formatting changes, etc). Contribution date used "--follow -C95%" to get the most accurate date. Note that several files in this patch didn't end the license/(c) block with a blank comment line so these were added as the vast majority of files do have this last blank line. Simply there for consistency. Signed-off-by: paul luse <paul.e.luse@intel.com> Change-Id: Id5b7ce4f658fe87132f14139ead58d6e285c04d4 Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15192 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Jim Harris <james.r.harris@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Community-CI: Mellanox Build Bot
298 lines
5.5 KiB
C
298 lines
5.5 KiB
C
/* SPDX-License-Identifier: BSD-3-Clause
|
|
* Copyright (C) 2016 Intel Corporation.
|
|
* All rights reserved.
|
|
*/
|
|
|
|
#ifndef SPDK_UTF_H_
|
|
#define SPDK_UTF_H_
|
|
|
|
#include "spdk/stdinc.h"
|
|
|
|
#include "spdk/endian.h"
|
|
#include "spdk/likely.h"
|
|
#include "spdk/string.h"
|
|
|
|
/*
 * Report whether c is a UTF-8 tail (continuation) byte.
 *
 * Tail bytes lie in the range 0x80..0xBF, i.e. binary 10xxxxxx.
 */
static inline bool
utf8_tail(uint8_t c)
{
	return c >= 0x80 && c <= 0xBF;
}
|
|
|
|
/*
 * Check for a valid UTF-8 encoding of a single codepoint.
 *
 * Only the first sequence in [start, end) is examined.
 *
 * \param start First byte to examine.
 * \param end One past the last byte that may be read.
 *
 * \return Length in bytes of the valid UTF-8 sequence at start, 0 if start == end,
 * or negative if the sequence is invalid or truncated.
 */
static inline int
utf8_valid(const uint8_t *start, const uint8_t *end)
{
	uint8_t lead, t1, t2, t3;
	bool t1_ok;

	if (start == end) {
		return 0;
	}

	lead = start[0];

	/* 00..7F: single byte */
	if (lead < 0x80) {
		return 1;
	}

	/* 80..C1: invalid start byte */
	if (lead < 0xC2) {
		return -1;
	}

	/* Every remaining form needs at least a second byte. */
	if (start + 1 == end) {
		return -1;
	}
	t1 = start[1];

	/* C2..DF 80..BF */
	if (lead < 0xE0) {
		return utf8_tail(t1) ? 2 : -1;
	}

	/* Every remaining form needs at least a third byte. */
	if (start + 2 == end) {
		return -1;
	}
	t2 = start[2];

	if (lead < 0xF0) {
		switch (lead) {
		case 0xE0:
			/* E0 A0..BF 80..BF */
			t1_ok = (t1 >= 0xA0 && t1 <= 0xBF);
			break;
		case 0xED:
			/*
			 * ED A0..BF 80..BF would encode U+D800..U+DFFF, the UTF-16
			 * surrogates, which are not allowed in UTF-8; only
			 * ED 80..9F 80..BF is accepted.
			 */
			t1_ok = (t1 < 0xA0 && utf8_tail(t1));
			break;
		default:
			/* E1..EC, EE..EF 80..BF 80..BF */
			t1_ok = utf8_tail(t1);
			break;
		}

		return (t1_ok && utf8_tail(t2)) ? 3 : -1;
	}

	/* Lead bytes F0 and above need a fourth byte. */
	if (start + 3 == end) {
		return -1;
	}
	t3 = start[3];

	if (lead == 0xF0) {
		/* F0 90..BF 80..BF 80..BF */
		t1_ok = (t1 >= 0x90 && t1 <= 0xBF);
	} else if (lead <= 0xF3) {
		/* F1..F3 80..BF 80..BF 80..BF */
		t1_ok = utf8_tail(t1);
	} else if (lead == 0xF4) {
		/* F4 80..8F 80..BF 80..BF */
		t1_ok = (t1 >= 0x80 && t1 <= 0x8F);
	} else {
		/* F5..FF: invalid start byte */
		return -1;
	}

	return (t1_ok && utf8_tail(t2) && utf8_tail(t3)) ? 4 : -1;
}
|
|
|
|
/*
 * Decode a 1-byte UTF-8 sequence.
 *
 * No validation is performed; data[0] is returned as the codepoint.
 */
static inline uint32_t
utf8_decode_unsafe_1(const uint8_t *data)
{
	return (uint32_t)*data;
}
|
|
|
|
/*
 * Decode a 2-byte UTF-8 sequence.
 *
 * No validation is performed; data[0] and data[1] are read unconditionally.
 */
static inline uint32_t
utf8_decode_unsafe_2(const uint8_t *data)
{
	const uint32_t lead_bits = data[0] & 0x1F;	/* low 5 bits of the lead byte */
	const uint32_t tail_bits = data[1] & 0x3F;	/* low 6 bits of the tail byte */

	return (lead_bits << 6) | tail_bits;
}
|
|
|
|
/*
 * Decode a 3-byte UTF-8 sequence.
 *
 * No validation is performed; data[0] through data[2] are read unconditionally.
 */
static inline uint32_t
utf8_decode_unsafe_3(const uint8_t *data)
{
	const uint32_t lead_bits = data[0] & 0x0F;	/* low 4 bits of the lead byte */
	const uint32_t mid_bits = data[1] & 0x3F;	/* low 6 bits of each tail byte */
	const uint32_t last_bits = data[2] & 0x3F;

	return (lead_bits << 12) | (mid_bits << 6) | last_bits;
}
|
|
|
|
/*
 * Decode a 4-byte UTF-8 sequence.
 *
 * No validation is performed; data[0] through data[3] are read unconditionally.
 */
static inline uint32_t
utf8_decode_unsafe_4(const uint8_t *data)
{
	const uint32_t lead_bits = data[0] & 0x07;	/* low 3 bits of the lead byte */
	const uint32_t tail1_bits = data[1] & 0x3F;	/* low 6 bits of each tail byte */
	const uint32_t tail2_bits = data[2] & 0x3F;
	const uint32_t tail3_bits = data[3] & 0x3F;

	return (lead_bits << 18) | (tail1_bits << 12) | (tail2_bits << 6) | tail3_bits;
}
|
|
|
|
/*
 * Encode a single Unicode codepoint as UTF-8.
 *
 * buf must have at least 4 bytes of space available (hence unsafe).
 * Nothing is written to buf when encoding fails.
 *
 * \param buf Destination for the encoded bytes.
 * \param c Codepoint to encode.
 *
 * \return Number of bytes appended to buf, or negative if encoding failed.
 */
static inline int
utf8_encode_unsafe(uint8_t *buf, uint32_t c)
{
	/* U+0000..U+007F: one byte */
	if (c < 0x80) {
		buf[0] = (uint8_t)c;
		return 1;
	}

	/* U+0080..U+07FF: two bytes */
	if (c < 0x800) {
		buf[0] = (uint8_t)(0xC0 | (c >> 6));
		buf[1] = (uint8_t)(0x80 | (c & 0x3F));
		return 2;
	}

	/* UTF-16 surrogate pairs - invalid in UTF-8 */
	if (c >= 0xD800 && c <= 0xDFFF) {
		return -1;
	}

	/* Beyond the last codepoint that can be encoded */
	if (c > 0x10FFFF) {
		return -1;
	}

	/* U+0800..U+FFFF: three bytes */
	if (c < 0x10000) {
		buf[0] = (uint8_t)(0xE0 | (c >> 12));
		buf[1] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
		buf[2] = (uint8_t)(0x80 | (c & 0x3F));
		return 3;
	}

	/* U+10000..U+10FFFF: four bytes */
	buf[0] = (uint8_t)(0xF0 | (c >> 18));
	buf[1] = (uint8_t)(0x80 | ((c >> 12) & 0x3F));
	buf[2] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
	buf[3] = (uint8_t)(0x80 | (c & 0x3F));
	return 4;
}
|
|
|
|
/*
 * Compute how many bytes the UTF-8 encoding of a codepoint occupies.
 *
 * \param c Codepoint to measure.
 *
 * \return 1 to 4 for an encodable codepoint, or negative if c is a UTF-16
 * surrogate (U+D800..U+DFFF) or is greater than U+10FFFF.
 */
static inline int
utf8_codepoint_len(uint32_t c)
{
	if (c < 0x80) {
		return 1;
	}

	if (c < 0x800) {
		return 2;
	}

	/* UTF-16 surrogate pairs - invalid in UTF-8 */
	if (c >= 0xD800 && c <= 0xDFFF) {
		return -1;
	}

	if (c > 0x10FFFF) {
		return -1;
	}

	return (c < 0x10000) ? 3 : 4;
}
|
|
|
|
/*
 * Report whether val is a UTF-16 high surrogate (0xD800..0xDBFF).
 */
static inline bool
utf16_valid_surrogate_high(uint32_t val)
{
	/* All values in 0xD800..0xDBFF, and no others, share the bits above the low 10. */
	return (val >> 10) == (0xD800 >> 10);
}
|
|
|
|
/*
 * Report whether val is a UTF-16 low surrogate (0xDC00..0xDFFF).
 */
static inline bool
utf16_valid_surrogate_low(uint32_t val)
{
	/* All values in 0xDC00..0xDFFF, and no others, share the bits above the low 10. */
	return (val >> 10) == (0xDC00 >> 10);
}
|
|
|
|
/*
 * Check for a valid UTF-16LE encoding of a single codepoint.
 *
 * Only the first sequence in [start, end) is examined.
 *
 * \param start First 16-bit code unit to examine.
 * \param end One past the last code unit that may be read.
 *
 * \return Length of valid UTF-16LE sequence in 16-bit code units, 0 if start == end,
 * or negative if invalid.
 */
static inline int
utf16le_valid(const uint16_t *start, const uint16_t *end)
{
	uint16_t high, low;

	if (start == end) {
		return 0;
	}

	high = from_le16(start);

	/* Anything outside 0xD800..0xDFFF is a single code unit in the BMP */
	if (high < 0xD800 || high > 0xDFFF) {
		return 1;
	}

	/* A low surrogate (0xDC00..0xDFFF) cannot come first */
	if (high > 0xDBFF) {
		return -1;
	}

	assert(utf16_valid_surrogate_high(high));

	/* A high surrogate must be followed by another code unit */
	if (start + 1 == end) {
		return -1;
	}

	low = from_le16(start + 1);

	/* Two code units form a valid pair only if the second is a low surrogate */
	return utf16_valid_surrogate_low(low) ? 2 : -1;
}
|
|
|
|
/*
 * Combine a UTF-16 surrogate pair into the codepoint it represents.
 *
 * \param high High surrogate; asserted to be in 0xD800..0xDBFF.
 * \param low Low surrogate; asserted to be in 0xDC00..0xDFFF.
 *
 * \return Decoded codepoint.
 */
static inline uint32_t
utf16_decode_surrogate_pair(uint32_t high, uint32_t low)
{
	uint32_t upper_bits, lower_bits;

	assert(utf16_valid_surrogate_high(high));
	assert(utf16_valid_surrogate_low(low));

	/* Each surrogate carries 10 payload bits; the high one holds the upper half. */
	upper_bits = (high & 0x3FF) << 10;
	lower_bits = low & 0x3FF;

	return (upper_bits | lower_bits) + 0x10000;
}
|
|
|
|
/*
 * Split a codepoint into a UTF-16 surrogate pair.
 *
 * \param codepoint Codepoint to encode; asserted to be in 0x10000..0x10FFFF.
 * \param high Output for the high surrogate.
 * \param low Output for the low surrogate.
 */
static inline void
utf16_encode_surrogate_pair(uint32_t codepoint, uint16_t *high, uint16_t *low)
{
	uint32_t offset;

	assert(codepoint >= 0x10000);
	assert(codepoint <= 0x10FFFF);

	/* The pair encodes the offset from 0x10000, 10 bits per surrogate. */
	offset = codepoint - 0x10000;

	*high = (uint16_t)(0xD800 | (offset >> 10));
	*low = (uint16_t)(0xDC00 | (offset & 0x3FF));

	assert(utf16_valid_surrogate_high(*high));
	assert(utf16_valid_surrogate_low(*low));
}
|
|
|
|
#endif
|