From 429672d31b8bf0946de6ca98a0c871abdf2657f1 Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Fri, 7 Jul 2017 15:01:12 -0700 Subject: [PATCH] json: add function to write UTF-16LE strings spdk_json_write_string_utf16le() writes a UTF-16LE string to a JSON write context. Change-Id: I413ffb8a3dee6e1b44ec96ce2415fd1b9c36320f Signed-off-by: Daniel Verkamp Reviewed-on: https://review.gerrithub.io/368625 Tested-by: SPDK Automated Test System Reviewed-by: Ben Walker Reviewed-by: Jim Harris --- include/spdk/json.h | 21 +++ lib/json/json_internal.h | 44 +++++++ lib/json/json_write.c | 124 +++++++++++++----- .../lib/json/json_write.c/json_write_ut.c | 44 +++++++ 4 files changed, 200 insertions(+), 33 deletions(-) diff --git a/include/spdk/json.h b/include/spdk/json.h index 9d11d677e..8ec7726d2 100644 --- a/include/spdk/json.h +++ b/include/spdk/json.h @@ -198,6 +198,27 @@ int spdk_json_write_int64(struct spdk_json_write_ctx *w, int64_t val); int spdk_json_write_uint64(struct spdk_json_write_ctx *w, uint64_t val); int spdk_json_write_string(struct spdk_json_write_ctx *w, const char *val); int spdk_json_write_string_raw(struct spdk_json_write_ctx *w, const char *val, size_t len); + +/** + * Write null-terminated UTF-16LE string. + * + * \param w JSON write context. + * \param val UTF-16LE string; must be null terminated. + * \return 0 on success or negative on failure. + */ +int spdk_json_write_string_utf16le(struct spdk_json_write_ctx *w, const uint16_t *val); + +/** + * Write UTF-16LE string. + * + * \param w JSON write context. + * \param val UTF-16LE string; may contain embedded null characters. + * \param len Length of val in 16-bit code units (i.e. size of string in bytes divided by 2). + * \return 0 on success or negative on failure. + */ +int spdk_json_write_string_utf16le_raw(struct spdk_json_write_ctx *w, const uint16_t *val, + size_t len); + int spdk_json_write_string_fmt(struct spdk_json_write_ctx *w, const char *fmt, ...) __attribute__((__format__(__printf__, 2, 3))); int spdk_json_write_array_begin(struct spdk_json_write_ctx *w); diff --git a/lib/json/json_internal.h b/lib/json/json_internal.h index fa2258e9f..01f700570 100644 --- a/lib/json/json_internal.h +++ b/lib/json/json_internal.h @@ -36,6 +36,7 @@ #include "spdk/stdinc.h" +#include "spdk/endian.h" #include "spdk/json.h" #include "spdk/likely.h" #include "spdk/string.h" @@ -251,6 +252,49 @@ utf16_valid_surrogate_low(uint32_t val) return val >= 0xDC00 && val <= 0xDFFF; } +/* + * Check for a valid UTF-16LE encoding of a single codepoint. + * + * \return Length of valid UTF-16LE sequence in 16-bit code units, or negative if invalid. + */ +static inline int +utf16le_valid(const uint16_t *start, const uint16_t *end) +{ + const uint16_t *p = start; + uint16_t high, low; + + if (p == end) { + return 0; + } + + high = from_le16(p); + + if (high <= 0xD7FF || high >= 0xE000) { + /* Single code unit in BMP */ + return 1; + } + + if (high >= 0xDC00) { + /* Low surrogate in first code unit - invalid */ + return -1; + } + + assert(utf16_valid_surrogate_high(high)); + + if (++p == end) { + /* Not enough code units left */ + return -1; + } + low = from_le16(p); + + if (!utf16_valid_surrogate_low(low)) { + return -1; + } + + /* Valid surrogate pair */ + return 2; +} + static inline uint32_t utf16_decode_surrogate_pair(uint32_t high, uint32_t low) { diff --git a/lib/json/json_write.c b/lib/json/json_write.c index af085f617..6be19572e 100644 --- a/lib/json/json_write.c +++ b/lib/json/json_write.c @@ -275,11 +275,9 @@ write_hex_4(void *dest, uint16_t val) p[3] = hex[val & 0xF]; } -static int -write_string_or_name(struct spdk_json_write_ctx *w, const char *val, size_t len) +static inline int +write_codepoint(struct spdk_json_write_ctx *w, uint32_t codepoint) { - const uint8_t *p = val; - const uint8_t *end = val + len; static const uint8_t escapes[] = { ['\b'] = 'b', ['\f'] = 'f', @@ -293,15 +291,51 @@ write_string_or_name(struct spdk_json_write_ctx *w, const char *val, size_t len) * (it is valid unescaped). */ }; + uint16_t high, low; + char out[13]; + size_t out_len; + + if (codepoint < sizeof(escapes) && escapes[codepoint]) { + out[0] = '\\'; + out[1] = escapes[codepoint]; + out_len = 2; + } else if (codepoint >= 0x20 && codepoint < 0x7F) { + /* + * Encode plain ASCII directly (except 0x7F, since it is really + * a control character, despite the JSON spec not considering it one). + */ + out[0] = (uint8_t)codepoint; + out_len = 1; + } else if (codepoint < 0x10000) { + out[0] = '\\'; + out[1] = 'u'; + write_hex_4(&out[2], (uint16_t)codepoint); + out_len = 6; + } else { + utf16_encode_surrogate_pair(codepoint, &high, &low); + out[0] = '\\'; + out[1] = 'u'; + write_hex_4(&out[2], high); + out[6] = '\\'; + out[7] = 'u'; + write_hex_4(&out[8], low); + out_len = 12; + } + + return emit(w, out, out_len); +} + +static int +write_string_or_name(struct spdk_json_write_ctx *w, const char *val, size_t len) +{ + const uint8_t *p = val; + const uint8_t *end = val + len; if (emit(w, "\"", 1)) return fail(w); while (p != end) { int codepoint_len; uint32_t codepoint; - uint16_t high, low; - char out[13]; - size_t out_len; codepoint_len = utf8_valid(p, end); switch (codepoint_len) { @@ -321,34 +355,38 @@ write_string_or_name(struct spdk_json_write_ctx *w, const char *val, size_t len) return fail(w); } - if (codepoint < sizeof(escapes) && escapes[codepoint]) { - out[0] = '\\'; - out[1] = escapes[codepoint]; - out_len = 2; - } else if (codepoint >= 0x20 && codepoint < 0x7F) { - /* - * Encode plain ASCII directly (except 0x7F, since it is really - * a control character, despite the JSON spec not considering it one). - */ - out[0] = (uint8_t)codepoint; - out_len = 1; - } else if (codepoint < 0x10000) { - out[0] = '\\'; - out[1] = 'u'; - write_hex_4(&out[2], (uint16_t)codepoint); - out_len = 6; - } else { - utf16_encode_surrogate_pair(codepoint, &high, &low); - out[0] = '\\'; - out[1] = 'u'; - write_hex_4(&out[2], high); - out[6] = '\\'; - out[7] = 'u'; - write_hex_4(&out[8], low); - out_len = 12; + if (write_codepoint(w, codepoint)) return fail(w); + p += codepoint_len; + } + + return emit(w, "\"", 1); +} + +static int +write_string_or_name_utf16le(struct spdk_json_write_ctx *w, const uint16_t *val, size_t len) +{ + const uint16_t *p = val; + const uint16_t *end = val + len; + + if (emit(w, "\"", 1)) return fail(w); + + while (p != end) { + int codepoint_len; + uint32_t codepoint; + + codepoint_len = utf16le_valid(p, end); + switch (codepoint_len) { + case 1: + codepoint = from_le16(&p[0]); + break; + case 2: + codepoint = utf16_decode_surrogate_pair(from_le16(&p[0]), from_le16(&p[1])); + break; + default: + return fail(w); } - if (emit(w, out, out_len)) return fail(w); + if (write_codepoint(w, codepoint)) return fail(w); p += codepoint_len; } @@ -368,6 +406,26 @@ spdk_json_write_string(struct spdk_json_write_ctx *w, const char *val) return spdk_json_write_string_raw(w, val, strlen(val)); } +int +spdk_json_write_string_utf16le_raw(struct spdk_json_write_ctx *w, const uint16_t *val, size_t len) +{ + if (begin_value(w)) return fail(w); + return write_string_or_name_utf16le(w, val, len); +} + +int +spdk_json_write_string_utf16le(struct spdk_json_write_ctx *w, const uint16_t *val) +{ + const uint16_t *p; + size_t len; + + for (len = 0, p = val; *p; p++) { + len++; + } + + return spdk_json_write_string_utf16le_raw(w, val, len); +} + int spdk_json_write_string_fmt(struct spdk_json_write_ctx *w, const char *fmt, ...) { diff --git a/test/unit/lib/json/json_write.c/json_write_ut.c b/test/unit/lib/json/json_write.c/json_write_ut.c index a00ff1456..2bb2a7104 100644 --- a/test/unit/lib/json/json_write.c/json_write_ut.c +++ b/test/unit/lib/json/json_write.c/json_write_ut.c @@ -86,6 +86,18 @@ write_cb(void *cb_ctx, const void *data, size_t size) #define STR_FAIL(in) \ BEGIN(); VAL_STRING_FAIL(in); END_FAIL() +#define VAL_STRING_UTF16LE(str) \ + CU_ASSERT(spdk_json_write_string_utf16le_raw(w, (const uint16_t *)str, sizeof(str) / sizeof(uint16_t) - 1) == 0) + +#define VAL_STRING_UTF16LE_FAIL(str) \ + CU_ASSERT(spdk_json_write_string_utf16le_raw(w, (const uint16_t *)str, sizeof(str) / sizeof(uint16_t) - 1) < 0) + +#define STR_UTF16LE_PASS(in, out) \ + BEGIN(); VAL_STRING_UTF16LE(in); END("\"" out "\"") + +#define STR_UTF16LE_FAIL(in) \ + BEGIN(); VAL_STRING_UTF16LE_FAIL(in); END_FAIL() + #define VAL_NAME(name) \ CU_ASSERT(spdk_json_write_name_raw(w, name, sizeof(name) - 1) == 0) @@ -248,6 +260,37 @@ test_write_string_escapes(void) STR_FAIL("\xED\xA1\x8C\xED\xBE\xB4"); /* U+233B4 (invalid surrogate pair encoding) */ } +static void +test_write_string_utf16le(void) +{ + struct spdk_json_write_ctx *w; + + /* All characters in BMP */ + STR_UTF16LE_PASS(((uint8_t[]) { + 'H', 0, 'e', 0, 'l', 0, 'l', 0, 'o', 0, 0x15, 0xFE, 0, 0 + }), "Hello\\uFE15"); + + /* Surrogate pair */ + STR_UTF16LE_PASS(((uint8_t[]) { + 'H', 0, 'i', 0, 0x34, 0xD8, 0x1E, 0xDD, '!', 0, 0, 0 + }), "Hi\\uD834\\uDD1E!"); + + /* Valid high surrogate, but no low surrogate */ + STR_UTF16LE_FAIL(((uint8_t[]) { + 0x00, 0xD8, 0, 0 /* U+D800 */ + })); + + /* Invalid leading low surrogate */ + STR_UTF16LE_FAIL(((uint8_t[]) { + 0x00, 0xDC, 0x00, 0xDC, 0, 0 /* U+DC00 U+DC00 */ + })); + + /* Valid high surrogate followed by another high surrogate (invalid) */ + STR_UTF16LE_FAIL(((uint8_t[]) { + 0x00, 0xD8, 0x00, 0xD8, 0, 0 /* U+D800 U+D800 */ + })); +} + static void test_write_number_int32(void) { @@ -618,6 +661,7 @@ int main(int argc, char **argv) CU_add_test(suite, "write_literal", test_write_literal) == NULL || CU_add_test(suite, "write_string_simple", test_write_string_simple) == NULL || CU_add_test(suite, "write_string_escapes", test_write_string_escapes) == NULL || + CU_add_test(suite, "write_string_utf16le", test_write_string_utf16le) == NULL || CU_add_test(suite, "write_number_int32", test_write_number_int32) == NULL || CU_add_test(suite, "write_number_uint32", test_write_number_uint32) == NULL || CU_add_test(suite, "write_array", test_write_array) == NULL ||