Spdk/include/spdk_internal/utf.h
paul luse a6dbe3721e update Intel copyright notices
per Intel policy to include file commit date using git cmd
below.  The policy does not apply to non-Intel (C) notices.

git log --follow -C90% --format=%ad --date default <file> | tail -1

and then pull just the 4 digit year from the result.

Intel copyrights were not added to files where Intel either had
no contribution or the contribution lacked substance (i.e. license
header updates, formatting changes, etc).  Contribution date used
"--follow -C95%" to get the most accurate date.

Note that several files in this patch didn't end the license/(c)
block with a blank comment line so these were added as the vast
majority of files do have this last blank line.  Simply there for
consistency.

Signed-off-by: paul luse <paul.e.luse@intel.com>
Change-Id: Id5b7ce4f658fe87132f14139ead58d6e285c04d4
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15192
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
2022-11-10 08:28:53 +00:00

298 lines
5.5 KiB
C

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2016 Intel Corporation.
* All rights reserved.
*/
#ifndef SPDK_UTF_H_
#define SPDK_UTF_H_
#include "spdk/stdinc.h"
#include "spdk/endian.h"
#include "spdk/likely.h"
#include "spdk/string.h"
/*
 * Test whether a byte is a UTF-8 continuation (tail) byte.
 *
 * Continuation bytes have the bit pattern 10xxxxxx, i.e. they lie in 0x80..0xBF.
 *
 * \return true if c is in 0x80..0xBF, false otherwise.
 */
static inline bool
utf8_tail(uint8_t c)
{
	return c >= 0x80 && c <= 0xBF;
}
/*
 * Validate the UTF-8 encoding of the single codepoint that begins at start.
 *
 * Only bytes in [start, end) are examined. Overlong encodings, encoded UTF-16
 * surrogates (U+D800..U+DFFF) and values above U+10FFFF are rejected.
 *
 * \param start First byte of the sequence to check.
 * \param end One past the last byte that may be read.
 *
 * \return Number of bytes in the valid sequence (1 to 4), 0 if start == end,
 * or -1 if the sequence is invalid or truncated.
 */
static inline int
utf8_valid(const uint8_t *start, const uint8_t *end)
{
	uint8_t lead, c1, c2, c3;
	uint8_t c1_min, c1_max;
	bool c1_ok;

	if (start == end) {
		return 0;
	}

	lead = start[0];
	if (lead < 0x80) {
		/* 00..7F: single-byte (ASCII) sequence */
		return 1;
	}

	if (lead < 0xC2) {
		/* 80..BF are tail bytes; C0 and C1 could only start overlong encodings */
		return -1;
	}

	if (start + 1 == end) {
		/* Truncated: a second byte is required */
		return -1;
	}

	c1 = start[1];
	if (lead < 0xE0) {
		/* C2..DF 80..BF */
		return utf8_tail(c1) ? 2 : -1;
	}

	if (start + 2 == end) {
		/* Truncated: a third byte is required */
		return -1;
	}

	c2 = start[2];
	if (lead < 0xF0) {
		if (lead == 0xE0) {
			/* E0 A0..BF 80..BF - a smaller second byte would be overlong */
			c1_ok = (c1 >= 0xA0 && c1 <= 0xBF);
		} else if (lead == 0xED && c1 >= 0xA0) {
			/*
			 * ED A0..BF 80..BF would encode U+D800..U+DFFF, the UTF-16
			 * surrogate range, which is not allowed in UTF-8.
			 */
			c1_ok = false;
		} else {
			/* E1..EF 80..BF 80..BF */
			c1_ok = utf8_tail(c1);
		}

		return (c1_ok && utf8_tail(c2)) ? 3 : -1;
	}

	if (start + 3 == end) {
		/* Truncated: a fourth byte is required */
		return -1;
	}

	c3 = start[3];
	if (lead > 0xF4) {
		/* F5..FF never appear in valid UTF-8 */
		return -1;
	}

	/* F1..F3 accept any tail byte in the second position ... */
	c1_min = 0x80;
	c1_max = 0xBF;
	if (lead == 0xF0) {
		/* ... F0 requires 90..BF (smaller values would be overlong) ... */
		c1_min = 0x90;
	} else if (lead == 0xF4) {
		/* ... and F4 requires 80..8F (larger values would exceed U+10FFFF) */
		c1_max = 0x8F;
	}

	if (c1 < c1_min || c1 > c1_max || !utf8_tail(c2) || !utf8_tail(c3)) {
		return -1;
	}

	return 4;
}
/*
 * Decode a 1-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data must point to at least 1 readable byte.
 *
 * \return The value of data[0].
 */
static inline uint32_t
utf8_decode_unsafe_1(const uint8_t *data)
{
	return (uint32_t)*data;
}
/*
 * Decode a 2-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data must point to at least 2 readable bytes.
 *
 * \return Codepoint built from the low 5 bits of data[0] and the low 6 bits of data[1].
 */
static inline uint32_t
utf8_decode_unsafe_2(const uint8_t *data)
{
	uint32_t hi_bits = (uint32_t)(data[0] & 0x1F);
	uint32_t lo_bits = (uint32_t)(data[1] & 0x3F);

	return (hi_bits << 6) | lo_bits;
}
/*
 * Decode a 3-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data must point to at least 3 readable bytes.
 *
 * \return Codepoint built from the low 4 bits of data[0] and the low 6 bits
 * of each of data[1] and data[2].
 */
static inline uint32_t
utf8_decode_unsafe_3(const uint8_t *data)
{
	uint32_t top = (uint32_t)(data[0] & 0x0F);
	uint32_t mid = (uint32_t)(data[1] & 0x3F);
	uint32_t low = (uint32_t)(data[2] & 0x3F);

	return (top << 12) | (mid << 6) | low;
}
/*
 * Decode a 4-byte UTF-8 sequence.
 *
 * No validation is performed (hence unsafe); data must point to at least 4 readable bytes.
 *
 * \return Codepoint built from the low 3 bits of data[0] and the low 6 bits
 * of each of data[1], data[2] and data[3].
 */
static inline uint32_t
utf8_decode_unsafe_4(const uint8_t *data)
{
	uint32_t b0 = (uint32_t)(data[0] & 0x07);
	uint32_t b1 = (uint32_t)(data[1] & 0x3F);
	uint32_t b2 = (uint32_t)(data[2] & 0x3F);
	uint32_t b3 = (uint32_t)(data[3] & 0x3F);

	return (b0 << 18) | (b1 << 12) | (b2 << 6) | b3;
}
/*
 * Write the UTF-8 encoding of one Unicode codepoint into buf.
 *
 * The caller must guarantee that buf has room for at least 4 bytes; no
 * bounds checking is done here (hence unsafe). Nothing is written when the
 * codepoint is rejected.
 *
 * \param buf Destination buffer.
 * \param c Codepoint to encode.
 *
 * \return Number of bytes written to buf (1 to 4), or -1 if c is a UTF-16
 * surrogate (U+D800..U+DFFF) or is greater than U+10FFFF.
 */
static inline int
utf8_encode_unsafe(uint8_t *buf, uint32_t c)
{
	/* Reject everything that has no UTF-8 encoding before touching buf */
	if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
		return -1;
	}

	if (c < 0x80) {
		buf[0] = (uint8_t)c;
		return 1;
	}

	if (c < 0x800) {
		buf[0] = (uint8_t)(0xC0 | (c >> 6));
		buf[1] = (uint8_t)(0x80 | (c & 0x3F));
		return 2;
	}

	if (c < 0x10000) {
		buf[0] = (uint8_t)(0xE0 | (c >> 12));
		buf[1] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
		buf[2] = (uint8_t)(0x80 | (c & 0x3F));
		return 3;
	}

	buf[0] = (uint8_t)(0xF0 | (c >> 18));
	buf[1] = (uint8_t)(0x80 | ((c >> 12) & 0x3F));
	buf[2] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
	buf[3] = (uint8_t)(0x80 | (c & 0x3F));
	return 4;
}
/*
 * Compute how many bytes the UTF-8 encoding of a codepoint occupies.
 *
 * \param c Codepoint to measure.
 *
 * \return Encoded length in bytes (1 to 4), or -1 if c is a UTF-16 surrogate
 * (U+D800..U+DFFF) or is greater than U+10FFFF.
 */
static inline int
utf8_codepoint_len(uint32_t c)
{
	/* Surrogates and out-of-range values have no UTF-8 encoding */
	if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
		return -1;
	}

	if (c < 0x80) {
		return 1;
	}

	if (c < 0x800) {
		return 2;
	}

	return (c < 0x10000) ? 3 : 4;
}
/*
 * Test whether val is a UTF-16 high (leading) surrogate, i.e. in 0xD800..0xDBFF.
 */
static inline bool
utf16_valid_surrogate_high(uint32_t val)
{
	/*
	 * 0xD800..0xDBFF is a 1024-aligned block of 1024 values, so membership
	 * is decided entirely by the bits above the low 10.
	 */
	return (val >> 10) == (0xD800 >> 10);
}
/*
 * Test whether val is a UTF-16 low (trailing) surrogate, i.e. in 0xDC00..0xDFFF.
 */
static inline bool
utf16_valid_surrogate_low(uint32_t val)
{
	/*
	 * 0xDC00..0xDFFF is a 1024-aligned block of 1024 values, so membership
	 * is decided entirely by the bits above the low 10.
	 */
	return (val >> 10) == (0xDC00 >> 10);
}
/*
 * Validate the UTF-16LE encoding of the single codepoint that begins at start.
 *
 * Only code units in [start, end) are examined; each is read with from_le16().
 *
 * \param start First 16-bit code unit of the sequence to check.
 * \param end One past the last code unit that may be read.
 *
 * \return Number of 16-bit code units in the valid sequence (1 or 2), 0 if
 * start == end, or -1 if the sequence is invalid or truncated.
 */
static inline int
utf16le_valid(const uint16_t *start, const uint16_t *end)
{
	uint16_t lead, trail;

	if (start == end) {
		return 0;
	}

	lead = from_le16(start);
	if (lead < 0xD800 || lead > 0xDFFF) {
		/* Outside the surrogate range: one code unit is a whole codepoint */
		return 1;
	}

	if (lead > 0xDBFF) {
		/* A low surrogate may not come first */
		return -1;
	}

	assert(utf16_valid_surrogate_high(lead));

	if (start + 1 == end) {
		/* High surrogate with nothing following it */
		return -1;
	}

	trail = from_le16(start + 1);

	/* A high surrogate must be followed by a low surrogate */
	return utf16_valid_surrogate_low(trail) ? 2 : -1;
}
/*
 * Combine a UTF-16 surrogate pair into the codepoint it represents.
 *
 * Both inputs are checked only by assert(): high must be a high surrogate
 * and low must be a low surrogate.
 *
 * \return 0x10000 plus the 20-bit value formed from the low 10 bits of high
 * (upper half) and the low 10 bits of low (lower half).
 */
static inline uint32_t
utf16_decode_surrogate_pair(uint32_t high, uint32_t low)
{
	uint32_t upper, lower;

	assert(utf16_valid_surrogate_high(high));
	assert(utf16_valid_surrogate_low(low));

	upper = (high & 0x3FF) << 10;
	lower = low & 0x3FF;

	return (upper | lower) + 0x10000;
}
/*
 * Split a codepoint in 0x10000..0x10FFFF into its UTF-16 surrogate pair.
 *
 * The codepoint range is checked only by assert().
 *
 * \param codepoint Codepoint to encode.
 * \param high Output: receives the high (leading) surrogate.
 * \param low Output: receives the low (trailing) surrogate.
 */
static inline void
utf16_encode_surrogate_pair(uint32_t codepoint, uint16_t *high, uint16_t *low)
{
	uint32_t offset;

	assert(codepoint >= 0x10000);
	assert(codepoint <= 0x10FFFF);

	/* The pair carries the 20-bit offset above 0x10000, 10 bits per unit */
	offset = codepoint - 0x10000;
	*high = (uint16_t)(0xD800 | (offset >> 10));
	*low = (uint16_t)(0xDC00 | (offset & 0x3FF));

	assert(utf16_valid_surrogate_high(*high));
	assert(utf16_valid_surrogate_low(*low));
}
#endif