From c4121d2dba84e4f8f8f25bf048337bd8e6792469 Mon Sep 17 00:00:00 2001 From: Jordan Bancino Date: Sat, 15 Jul 2023 17:57:21 +0000 Subject: [PATCH] Fix Unicode handling in Json, don't sign-extend bytes in Stream. --- src/Json.c | 44 ++++++++++++++++++++++++++++++++++++---- src/Str.c | 51 +++++++++++++++++++++++++++++++++-------------- src/Stream.c | 5 +++-- src/include/Str.h | 11 ++++++++-- 4 files changed, 88 insertions(+), 23 deletions(-) diff --git a/src/Json.c b/src/Json.c index 15c5577..16f9a77 100644 --- a/src/Json.c +++ b/src/Json.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -401,7 +402,10 @@ JsonDecodeString(Stream * in) int c; char a[5]; - unsigned long utf8; + UInt32 codepoint; + UInt16 high; + UInt16 low; + char *utf8Ptr; len = 0; @@ -480,14 +484,46 @@ JsonDecodeString(Stream * in) return NULL; } /* Interpret characters as a hex number */ - if (sscanf(a, "%04lx", &utf8) != 1) + if (sscanf(a, "%04hx", &high) != 1) { /* Bad hex value */ Free(str); return NULL; } - if (utf8 == 0) + /* If this is the high half of a UTF-16 surrogate pair, + * grab the low surrogate from the next \u escape */ + if (high > 0xD7FF && high <= 0xDBFF) + { + if (StreamGetc(in) != '\\' || StreamGetc(in) != 'u') + { + Free(str); + return NULL; + } + + /* Read 4 characters into a */ + if (!StreamGets(in, a, sizeof(a))) + { + Free(str); + return NULL; + } + + /* Interpret characters as a hex number */ + if (sscanf(a, "%04hx", &low) != 1) + { + Free(str); + return NULL; + } + + codepoint = StrUtf16Decode(high, low); + } + else + { + codepoint = high; + } + + + if (codepoint == 0) { /* * We read in a 0000, null. 
There is no @@ -507,7 +543,7 @@ JsonDecodeString(Stream * in) /* Encode the 4-byte UTF-8 buffer into a series * of 1-byte characters */ - utf8Ptr = StrUtf8Encode(utf8); + utf8Ptr = StrUtf8Encode(codepoint); if (!utf8Ptr) { /* Mem error */ diff --git a/src/Str.c b/src/Str.c index f81f023..98dea0b 100644 --- a/src/Str.c +++ b/src/Str.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -34,8 +35,28 @@ #include #include +UInt32 +StrUtf16Decode(UInt16 high, UInt16 low) +{ + if (high <= 0xD7FF) + { + return high; + } + else if (high <= 0xDBFF) + { + UInt32 hS = ((UInt32) high - 0xD800) * 0x400; /* up to 0xFFC00, so 16 bits would truncate */ + UInt32 lS = (UInt32) low - 0xDC00; + + return (lS | hS) + 0x10000; + } + else + { + return 0; + } +} + char * -StrUtf8Encode(unsigned long utf8) +StrUtf8Encode(UInt32 codepoint) { char *str; @@ -45,30 +66,30 @@ StrUtf8Encode(unsigned long utf8) return NULL; } - if (utf8 <= 0x7F) /* Plain ASCII */ + if (codepoint <= 0x7F && codepoint != 0) /* Plain ASCII */ { - str[0] = (char) utf8; + str[0] = (char) codepoint; str[1] = '\0'; } - else if (utf8 <= 0x07FF) /* 2-byte */ + else if (codepoint <= 0x07FF) /* 2-byte */ { - str[0] = (char) (((utf8 >> 6) & 0x1F) | 0xC0); - str[1] = (char) (((utf8 >> 0) & 0x3F) | 0x80); + str[0] = (char) (((codepoint >> 6) & 0x1F) | 0xC0); + str[1] = (char) (((codepoint >> 0) & 0x3F) | 0x80); str[2] = '\0'; } - else if (utf8 <= 0xFFFF) /* 3-byte */ + else if (codepoint <= 0xFFFF) /* 3-byte */ { - str[0] = (char) (((utf8 >> 12) & 0x0F) | 0xE0); - str[1] = (char) (((utf8 >> 6) & 0x3F) | 0x80); - str[2] = (char) (((utf8 >> 0) & 0x3F) | 0x80); + str[0] = (char) (((codepoint >> 12) & 0x0F) | 0xE0); + str[1] = (char) (((codepoint >> 6) & 0x3F) | 0x80); + str[2] = (char) (((codepoint >> 0) & 0x3F) | 0x80); str[3] = '\0'; } - else if (utf8 <= 0x10FFFF) /* 4-byte */ + else if (codepoint <= 0x10FFFF)/* 4-byte */ { - str[0] = (char) (((utf8 >> 18) & 0x07) | 0xF0); - str[1] = (char) (((utf8 >> 12) & 0x3F) | 0x80); - str[2] = (char) (((utf8 >> 6) 
& 0x3F) | 0x80); - str[3] = (char) (((utf8 >> 0) & 0x3F) | 0x80); + str[0] = (char) (((codepoint >> 18) & 0x07) | 0xF0); + str[1] = (char) (((codepoint >> 12) & 0x3F) | 0x80); + str[2] = (char) (((codepoint >> 6) & 0x3F) | 0x80); + str[3] = (char) (((codepoint >> 0) & 0x3F) | 0x80); str[4] = '\0'; } else diff --git a/src/Stream.c b/src/Stream.c index 2cb299f..b0b787a 100644 --- a/src/Stream.c +++ b/src/Stream.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -49,11 +50,11 @@ struct Stream { Io *io; - char *rBuf; + UInt8 *rBuf; size_t rLen; size_t rOff; - char *wBuf; + UInt8 *wBuf; size_t wLen; char *ugBuf; diff --git a/src/include/Str.h b/src/include/Str.h index 1bc63cc..714b8d8 100644 --- a/src/include/Str.h +++ b/src/include/Str.h @@ -39,14 +39,21 @@ * is a standard library header. */ +#include + #include /** - * Take a UTF-8 codepoint and encode it into a string buffer containing + * Convert a UTF-16 code unit, or a high/low surrogate pair, into a Unicode codepoint (0 if high is above 0xDBFF). + */ +extern UInt32 StrUtf16Decode(UInt16, UInt16); + +/** + * Take a Unicode codepoint and encode it into a string buffer containing * between 1 and 4 bytes. The string buffer is allocated on the heap, * so it should be freed when it is no longer needed. */ -extern char * StrUtf8Encode(unsigned long); +extern char * StrUtf8Encode(UInt32); /** * Duplicate a null-terminated string, returning a new string on the