Fix Unicode handling in Json, don't sign-extend bytes in Stream.

2023-07-15 17:57:21 +00:00 · 2023-07-15 17:57:21 +00:00 · c96ac30f28
commit c96ac30f28
parent a4330123b9
7 changed files with 99 additions and 33 deletions
--- a/Cytoplasm/src/Json.c
+++ b/Cytoplasm/src/Json.c
@ -26,6 +26,7 @@
 #include <Memory.h>
 #include <Str.h>
 #include <Util.h>
+#include <Int.h>

 #include <stdio.h>
 #include <stddef.h>
@ -401,7 +402,10 @@ JsonDecodeString(Stream * in)
    int c;
    char a[5];

-    unsigned long utf8;
+    UInt32 codepoint;
+    UInt16 high;
+    UInt16 low;
+
    char *utf8Ptr;

    len = 0;
@ -480,14 +484,46 @@ JsonDecodeString(Stream * in)
                            return NULL;
                        }
                        /* Interpret characters as a hex number */
-                        if (sscanf(a, "%04lx", &utf8) != 1)
+                        if (sscanf(a, "%04hx", &high) != 1)
                        {
                            /* Bad hex value */
                            Free(str);
                            return NULL;
                        }

-                        if (utf8 == 0)
+                        /* If this is a UTF-16 high surrogate, a second
+                         * \uXXXX escape holding the low surrogate must
+                         * follow; read it to complete the pair */
+                        if (high > 0xD7FF && high <= 0xDBFF)
+                        {
+                            if (StreamGetc(in) != '\\' || StreamGetc(in) != 'u')
+                            {
+                                Free(str);
+                                return NULL;
+                            }
+
+                            /* Read 4 characters into a */
+                            if (!StreamGets(in, a, sizeof(a)))
+                            {
+                                Free(str);
+                                return NULL;
+                            }
+
+                            /* Interpret characters as a hex number */
+                            if (sscanf(a, "%04hx", &low) != 1)
+                            {
+                                Free(str);
+                                return NULL;
+                            }
+
+                            codepoint = StrUtf16Decode(high, low);
+                        }
+                        else
+                        {
+                            codepoint = high;
+                        }
+
+
+                        if (codepoint == 0)
                        {
                            /*
                             * We read in a 0000, null. There is no
@ -507,7 +543,7 @@ JsonDecodeString(Stream * in)

                        /* Encode the 4-byte UTF-8 buffer into a series
                         * of 1-byte characters */
-                        utf8Ptr = StrUtf8Encode(utf8);
+                        utf8Ptr = StrUtf8Encode(codepoint);
                        if (!utf8Ptr)
                        {
                            /* Mem error */
--- a/Cytoplasm/src/Str.c
+++ b/Cytoplasm/src/Str.c
@ -26,6 +26,7 @@
 #include <Memory.h>
 #include <Util.h>
 #include <Rand.h>
+#include <Int.h>

 #include <stdlib.h>
 #include <string.h>
@ -34,8 +35,28 @@
 #include <pthread.h>
 #include <unistd.h>

+/*
+ * Decode a UTF-16 sequence, given as two 16-bit code units, into a
+ * Unicode codepoint.
+ *
+ * If high is not a surrogate (outside 0xD800-0xDFFF), it already is
+ * the codepoint and low is ignored. If high is a high surrogate
+ * (0xD800-0xDBFF), low must be a low surrogate (0xDC00-0xDFFF) and
+ * the pair is combined into a codepoint in the range
+ * 0x10000-0x10FFFF.
+ *
+ * Returns 0 if the input is not valid UTF-16: a lone low surrogate in
+ * high, or a high surrogate that is not followed by a low surrogate.
+ */
+UInt32
+StrUtf16Decode(UInt16 high, UInt16 low)
+{
+    if (high < 0xD800 || high > 0xDFFF)
+    {
+        /* Not a surrogate; the code unit is the codepoint itself */
+        return high;
+    }
+
+    if (high > 0xDBFF || low < 0xDC00 || low > 0xDFFF)
+    {
+        /* Lone low surrogate, or high surrogate without a valid
+         * low surrogate */
+        return 0;
+    }
+
+    /*
+     * The high surrogate contributes the upper 10 bits and the low
+     * surrogate the lower 10 bits of a 20-bit offset from 0x10000.
+     * Widen to 32 bits before shifting; a 16-bit intermediate would
+     * drop the top bits for every codepoint at or above 0x20000
+     * (for example D840 DC00 must decode to 0x20000).
+     */
+    return 0x10000 +
+            ((((UInt32) high - 0xD800) << 10) |
+             ((UInt32) low - 0xDC00));
+}
+
 char *
-StrUtf8Encode(unsigned long utf8)
+StrUtf8Encode(UInt32 codepoint)
 {
    char *str;

@ -45,30 +66,30 @@ StrUtf8Encode(unsigned long utf8)
        return NULL;
    }

-    if (utf8 <= 0x7F)              /* Plain ASCII */
+    if (codepoint <= 0x7F && codepoint != 0)    /* Plain ASCII */
    {
-        str[0] = (char) utf8;
+        str[0] = (char) codepoint;
        str[1] = '\0';
    }
-    else if (utf8 <= 0x07FF)       /* 2-byte */
+    else if (codepoint <= 0x07FF)  /* 2-byte */
    {
-        str[0] = (char) (((utf8 >> 6) & 0x1F) | 0xC0);
-        str[1] = (char) (((utf8 >> 0) & 0x3F) | 0x80);
+        str[0] = (char) (((codepoint >> 6) & 0x1F) | 0xC0);
+        str[1] = (char) (((codepoint >> 0) & 0x3F) | 0x80);
        str[2] = '\0';
    }
-    else if (utf8 <= 0xFFFF)       /* 3-byte */
+    else if (codepoint <= 0xFFFF)  /* 3-byte */
    {
-        str[0] = (char) (((utf8 >> 12) & 0x0F) | 0xE0);
-        str[1] = (char) (((utf8 >> 6) & 0x3F) | 0x80);
-        str[2] = (char) (((utf8 >> 0) & 0x3F) | 0x80);
+        str[0] = (char) (((codepoint >> 12) & 0x0F) | 0xE0);
+        str[1] = (char) (((codepoint >> 6) & 0x3F) | 0x80);
+        str[2] = (char) (((codepoint >> 0) & 0x3F) | 0x80);
        str[3] = '\0';
    }
-    else if (utf8 <= 0x10FFFF)     /* 4-byte */
+    else if (codepoint <= 0x10FFFF)/* 4-byte */
    {
-        str[0] = (char) (((utf8 >> 18) & 0x07) | 0xF0);
-        str[1] = (char) (((utf8 >> 12) & 0x3F) | 0x80);
-        str[2] = (char) (((utf8 >> 6) & 0x3F) | 0x80);
-        str[3] = (char) (((utf8 >> 0) & 0x3F) | 0x80);
+        str[0] = (char) (((codepoint >> 18) & 0x07) | 0xF0);
+        str[1] = (char) (((codepoint >> 12) & 0x3F) | 0x80);
+        str[2] = (char) (((codepoint >> 6) & 0x3F) | 0x80);
+        str[3] = (char) (((codepoint >> 0) & 0x3F) | 0x80);
        str[4] = '\0';
    }
    else
--- a/Cytoplasm/src/Stream.c
+++ b/Cytoplasm/src/Stream.c
@ -26,6 +26,7 @@
 #include <Io.h>
 #include <Memory.h>
 #include <Util.h>
+#include <Int.h>

 #include <stdio.h>
 #include <stdlib.h>
@ -49,11 +50,11 @@ struct Stream
 {
    Io *io;

-    char *rBuf;
+    UInt8 *rBuf;
    size_t rLen;
    size_t rOff;

-    char *wBuf;
+    UInt8 *wBuf;
    size_t wLen;

    char *ugBuf;
--- a/Cytoplasm/src/include/Str.h
+++ b/Cytoplasm/src/include/Str.h
@ -39,14 +39,21 @@
 * is a standard library header.
 */

+#include <Int.h>
+
 #include <stddef.h>

 /**
- * Take a UTF-8 codepoint and encode it into a string buffer containing
+ * Convert a UTF-16 sequence, passed as its high (first) and low
+ * (second) 16-bit code units, into a Unicode codepoint.
+ */
+extern UInt32 StrUtf16Decode(UInt16, UInt16);
+
+/**
+ * Take a Unicode codepoint and encode it into a string buffer containing
 * between 1 and 4 bytes. The string buffer is allocated on the heap,
 * so it should be freed when it is no longer needed.
 */
-extern char * StrUtf8Encode(unsigned long);
+extern char * StrUtf8Encode(UInt32);

 /**
 * Duplicate a null-terminated string, returning a new string on the
--- a/tools/src/json.c
+++ b/tools/src/json.c
@ -256,7 +256,8 @@ Main(Array * args)
    switch (flag)
    {
        case FLAG_SELECT:
-            query(input, json, canonical);    /* This will implicitly free json */
+            query(input, json, canonical);      /* This will implicitly
+                                                 * free json */
            break;
        case FLAG_ENCODE:
            encode(input, canonical);