Fix Unicode handling in Json, don't sign-extend bytes in Stream.

This commit is contained in:
Jordan Bancino 2023-07-15 17:57:21 +00:00
parent a4330123b9
commit c96ac30f28
7 changed files with 99 additions and 33 deletions

View file

@ -26,6 +26,7 @@
#include <Memory.h> #include <Memory.h>
#include <Str.h> #include <Str.h>
#include <Util.h> #include <Util.h>
#include <Int.h>
#include <stdio.h> #include <stdio.h>
#include <stddef.h> #include <stddef.h>
@ -401,7 +402,10 @@ JsonDecodeString(Stream * in)
int c; int c;
char a[5]; char a[5];
unsigned long utf8; UInt32 codepoint;
UInt16 high;
UInt16 low;
char *utf8Ptr; char *utf8Ptr;
len = 0; len = 0;
@ -480,14 +484,46 @@ JsonDecodeString(Stream * in)
return NULL; return NULL;
} }
/* Interpret characters as a hex number */ /* Interpret characters as a hex number */
if (sscanf(a, "%04lx", &utf8) != 1) if (sscanf(a, "%04hx", &high) != 1)
{ {
/* Bad hex value */ /* Bad hex value */
Free(str); Free(str);
return NULL; return NULL;
} }
if (utf8 == 0) /* If this is a two-byte UTF-16 codepoint, grab
* the second byte */
if (high > 0xD7FF && high <= 0xDBFF)
{
if (StreamGetc(in) != '\\' || StreamGetc(in) != 'u')
{
Free(str);
return NULL;
}
/* Read 4 characters into a */
if (!StreamGets(in, a, sizeof(a)))
{
Free(str);
return NULL;
}
/* Interpret characters as a hex number */
if (sscanf(a, "%04hx", &low) != 1)
{
Free(str);
return NULL;
}
codepoint = StrUtf16Decode(high, low);
}
else
{
codepoint = high;
}
if (codepoint == 0)
{ {
/* /*
* We read in a 0000, null. There is no * We read in a 0000, null. There is no
@ -507,7 +543,7 @@ JsonDecodeString(Stream * in)
/* Encode the 4-byte UTF-8 buffer into a series /* Encode the 4-byte UTF-8 buffer into a series
* of 1-byte characters */ * of 1-byte characters */
utf8Ptr = StrUtf8Encode(utf8); utf8Ptr = StrUtf8Encode(codepoint);
if (!utf8Ptr) if (!utf8Ptr)
{ {
/* Mem error */ /* Mem error */

View file

@ -26,6 +26,7 @@
#include <Memory.h> #include <Memory.h>
#include <Util.h> #include <Util.h>
#include <Rand.h> #include <Rand.h>
#include <Int.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
@ -34,8 +35,28 @@
#include <pthread.h> #include <pthread.h>
#include <unistd.h> #include <unistd.h>
/*
 * Decode UTF-16 code units into a single Unicode code point.
 *
 * high: the first (or only) UTF-16 code unit.
 * low:  the second code unit. It is only consulted when high is a
 *       high surrogate (0xD800 - 0xDBFF), in which case it must be a
 *       low surrogate (0xDC00 - 0xDFFF).
 *
 * Returns the decoded code point. A code unit outside the surrogate
 * range stands for itself and is returned unchanged. 0 is returned
 * when the input is not well-formed UTF-16: a lone low surrogate in
 * high, or a high surrogate that is not followed by a low surrogate.
 */
UInt32
StrUtf16Decode(UInt16 high, UInt16 low)
{
    UInt32 hS;
    UInt32 lS;

    if (high <= 0xD7FF || high >= 0xE000)
    {
        /* Not a surrogate: the code unit is the code point. */
        return high;
    }

    if (high > 0xDBFF || low < 0xDC00 || low > 0xDFFF)
    {
        /* Unpaired or mismatched surrogate. */
        return 0;
    }

    /*
     * Each surrogate carries 10 bits of payload. The shifted high
     * half needs up to 20 bits, so the arithmetic must be done in a
     * 32-bit type; a 16-bit intermediate would drop the upper bits.
     */
    hS = ((UInt32) high - 0xD800) << 10;
    lS = (UInt32) low - 0xDC00;

    return (hS | lS) + 0x10000;
}
char * char *
StrUtf8Encode(unsigned long utf8) StrUtf8Encode(UInt32 codepoint)
{ {
char *str; char *str;
@ -45,30 +66,30 @@ StrUtf8Encode(unsigned long utf8)
return NULL; return NULL;
} }
if (utf8 <= 0x7F) /* Plain ASCII */ if (codepoint <= 0x7F && codepoint != 0) /* Plain ASCII */
{ {
str[0] = (char) utf8; str[0] = (char) codepoint;
str[1] = '\0'; str[1] = '\0';
} }
else if (utf8 <= 0x07FF) /* 2-byte */ else if (codepoint <= 0x07FF) /* 2-byte */
{ {
str[0] = (char) (((utf8 >> 6) & 0x1F) | 0xC0); str[0] = (char) (((codepoint >> 6) & 0x1F) | 0xC0);
str[1] = (char) (((utf8 >> 0) & 0x3F) | 0x80); str[1] = (char) (((codepoint >> 0) & 0x3F) | 0x80);
str[2] = '\0'; str[2] = '\0';
} }
else if (utf8 <= 0xFFFF) /* 3-byte */ else if (codepoint <= 0xFFFF) /* 3-byte */
{ {
str[0] = (char) (((utf8 >> 12) & 0x0F) | 0xE0); str[0] = (char) (((codepoint >> 12) & 0x0F) | 0xE0);
str[1] = (char) (((utf8 >> 6) & 0x3F) | 0x80); str[1] = (char) (((codepoint >> 6) & 0x3F) | 0x80);
str[2] = (char) (((utf8 >> 0) & 0x3F) | 0x80); str[2] = (char) (((codepoint >> 0) & 0x3F) | 0x80);
str[3] = '\0'; str[3] = '\0';
} }
else if (utf8 <= 0x10FFFF) /* 4-byte */ else if (codepoint <= 0x10FFFF)/* 4-byte */
{ {
str[0] = (char) (((utf8 >> 18) & 0x07) | 0xF0); str[0] = (char) (((codepoint >> 18) & 0x07) | 0xF0);
str[1] = (char) (((utf8 >> 12) & 0x3F) | 0x80); str[1] = (char) (((codepoint >> 12) & 0x3F) | 0x80);
str[2] = (char) (((utf8 >> 6) & 0x3F) | 0x80); str[2] = (char) (((codepoint >> 6) & 0x3F) | 0x80);
str[3] = (char) (((utf8 >> 0) & 0x3F) | 0x80); str[3] = (char) (((codepoint >> 0) & 0x3F) | 0x80);
str[4] = '\0'; str[4] = '\0';
} }
else else

View file

@ -26,6 +26,7 @@
#include <Io.h> #include <Io.h>
#include <Memory.h> #include <Memory.h>
#include <Util.h> #include <Util.h>
#include <Int.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
@ -49,11 +50,11 @@ struct Stream
{ {
Io *io; Io *io;
char *rBuf; UInt8 *rBuf;
size_t rLen; size_t rLen;
size_t rOff; size_t rOff;
char *wBuf; UInt8 *wBuf;
size_t wLen; size_t wLen;
char *ugBuf; char *ugBuf;

View file

@ -39,14 +39,21 @@
* is a standard library header. * is a standard library header.
*/ */
#include <Int.h>
#include <stddef.h> #include <stddef.h>
/** /**
* Take a UTF-8 codepoint and encode it into a string buffer containing * Convert UTF-16 into a Unicode codepoint.
*/
extern UInt32 StrUtf16Decode(UInt16, UInt16);
/**
* Take a Unicode codepoint and encode it into a string buffer containing
* between 1 and 4 bytes. The string buffer is allocated on the heap, * between 1 and 4 bytes. The string buffer is allocated on the heap,
* so it should be freed when it is no longer needed. * so it should be freed when it is no longer needed.
*/ */
extern char * StrUtf8Encode(unsigned long); extern char * StrUtf8Encode(UInt32);
/** /**
* Duplicate a null-terminated string, returning a new string on the * Duplicate a null-terminated string, returning a new string on the

View file

@ -256,7 +256,8 @@ Main(Array * args)
switch (flag) switch (flag)
{ {
case FLAG_SELECT: case FLAG_SELECT:
query(input, json, canonical); /* This will implicitly free json */ query(input, json, canonical); /* This will implicitly
* free json */
break; break;
case FLAG_ENCODE: case FLAG_ENCODE:
encode(input, canonical); encode(input, canonical);