forked from Telodendria/Telodendria
Fix Unicode handling in Json, don't sign-extend bytes in Stream.
This commit is contained in:
parent
a4330123b9
commit
c96ac30f28
7 changed files with 99 additions and 33 deletions
|
@ -26,6 +26,7 @@
|
||||||
#include <Memory.h>
|
#include <Memory.h>
|
||||||
#include <Str.h>
|
#include <Str.h>
|
||||||
#include <Util.h>
|
#include <Util.h>
|
||||||
|
#include <Int.h>
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
|
@ -401,7 +402,10 @@ JsonDecodeString(Stream * in)
|
||||||
int c;
|
int c;
|
||||||
char a[5];
|
char a[5];
|
||||||
|
|
||||||
unsigned long utf8;
|
UInt32 codepoint;
|
||||||
|
UInt16 high;
|
||||||
|
UInt16 low;
|
||||||
|
|
||||||
char *utf8Ptr;
|
char *utf8Ptr;
|
||||||
|
|
||||||
len = 0;
|
len = 0;
|
||||||
|
@ -480,14 +484,46 @@ JsonDecodeString(Stream * in)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
/* Interpret characters as a hex number */
|
/* Interpret characters as a hex number */
|
||||||
if (sscanf(a, "%04lx", &utf8) != 1)
|
if (sscanf(a, "%04hx", &high) != 1)
|
||||||
{
|
{
|
||||||
/* Bad hex value */
|
/* Bad hex value */
|
||||||
Free(str);
|
Free(str);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (utf8 == 0)
|
/* If this is a high surrogate (the first 16-bit unit of
|
||||||
|
* a UTF-16 surrogate pair), read the low surrogate too */
|
||||||
|
if (high > 0xD7FF && high <= 0xDBFF)
|
||||||
|
{
|
||||||
|
if (StreamGetc(in) != '\\' || StreamGetc(in) != 'u')
|
||||||
|
{
|
||||||
|
Free(str);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Read 4 characters into a */
|
||||||
|
if (!StreamGets(in, a, sizeof(a)))
|
||||||
|
{
|
||||||
|
Free(str);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Interpret characters as a hex number */
|
||||||
|
if (sscanf(a, "%04hx", &low) != 1)
|
||||||
|
{
|
||||||
|
Free(str);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
codepoint = StrUtf16Decode(high, low);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
codepoint = high;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (codepoint == 0)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* We read in a 0000, null. There is no
|
* We read in a 0000, null. There is no
|
||||||
|
@ -507,7 +543,7 @@ JsonDecodeString(Stream * in)
|
||||||
|
|
||||||
/* Encode the 4-byte UTF-8 buffer into a series
|
/* Encode the 4-byte UTF-8 buffer into a series
|
||||||
* of 1-byte characters */
|
* of 1-byte characters */
|
||||||
utf8Ptr = StrUtf8Encode(utf8);
|
utf8Ptr = StrUtf8Encode(codepoint);
|
||||||
if (!utf8Ptr)
|
if (!utf8Ptr)
|
||||||
{
|
{
|
||||||
/* Mem error */
|
/* Mem error */
|
||||||
|
|
|
@ -26,6 +26,7 @@
|
||||||
#include <Memory.h>
|
#include <Memory.h>
|
||||||
#include <Util.h>
|
#include <Util.h>
|
||||||
#include <Rand.h>
|
#include <Rand.h>
|
||||||
|
#include <Int.h>
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
@ -34,8 +35,28 @@
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
|
/*
 * Decode one UTF-16 code unit, or one surrogate pair, into a Unicode
 * codepoint.
 *
 * high: the first (or only) 16-bit code unit.
 * low:  the second code unit; only consulted when high is a high
 *       surrogate (0xD800..0xDBFF).
 *
 * Returns the decoded codepoint. Returns 0 when the input is not valid
 * UTF-16: a lone low surrogate passed as high, or a high surrogate
 * that is not followed by a low surrogate (0xDC00..0xDFFF).
 */
UInt32
StrUtf16Decode(UInt16 high, UInt16 low)
{
    if (high < 0xD800 || high > 0xDFFF)
    {
        /* Not a surrogate, so the code unit is the codepoint itself.
         * This covers both sides of the surrogate range in the Basic
         * Multilingual Plane. */
        return high;
    }

    if (high > 0xDBFF)
    {
        /* A low surrogate cannot start a pair. */
        return 0;
    }

    if (low < 0xDC00 || low > 0xDFFF)
    {
        /* A high surrogate must be followed by a low surrogate. */
        return 0;
    }

    /*
     * Each surrogate carries 10 bits of the codepoint's offset from
     * 0x10000. The arithmetic is done in 32 bits because the high half
     * shifted left by 10 does not fit in a 16-bit integer.
     */
    return ((UInt32) 0x10000)
        + (((UInt32) (high - 0xD800)) << 10)
        + ((UInt32) (low - 0xDC00));
}
|
||||||
|
|
||||||
char *
|
char *
|
||||||
StrUtf8Encode(unsigned long utf8)
|
StrUtf8Encode(UInt32 codepoint)
|
||||||
{
|
{
|
||||||
char *str;
|
char *str;
|
||||||
|
|
||||||
|
@ -45,30 +66,30 @@ StrUtf8Encode(unsigned long utf8)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (utf8 <= 0x7F) /* Plain ASCII */
|
if (codepoint <= 0x7F && codepoint != 0) /* Plain ASCII */
|
||||||
{
|
{
|
||||||
str[0] = (char) utf8;
|
str[0] = (char) codepoint;
|
||||||
str[1] = '\0';
|
str[1] = '\0';
|
||||||
}
|
}
|
||||||
else if (utf8 <= 0x07FF) /* 2-byte */
|
else if (codepoint <= 0x07FF) /* 2-byte */
|
||||||
{
|
{
|
||||||
str[0] = (char) (((utf8 >> 6) & 0x1F) | 0xC0);
|
str[0] = (char) (((codepoint >> 6) & 0x1F) | 0xC0);
|
||||||
str[1] = (char) (((utf8 >> 0) & 0x3F) | 0x80);
|
str[1] = (char) (((codepoint >> 0) & 0x3F) | 0x80);
|
||||||
str[2] = '\0';
|
str[2] = '\0';
|
||||||
}
|
}
|
||||||
else if (utf8 <= 0xFFFF) /* 3-byte */
|
else if (codepoint <= 0xFFFF) /* 3-byte */
|
||||||
{
|
{
|
||||||
str[0] = (char) (((utf8 >> 12) & 0x0F) | 0xE0);
|
str[0] = (char) (((codepoint >> 12) & 0x0F) | 0xE0);
|
||||||
str[1] = (char) (((utf8 >> 6) & 0x3F) | 0x80);
|
str[1] = (char) (((codepoint >> 6) & 0x3F) | 0x80);
|
||||||
str[2] = (char) (((utf8 >> 0) & 0x3F) | 0x80);
|
str[2] = (char) (((codepoint >> 0) & 0x3F) | 0x80);
|
||||||
str[3] = '\0';
|
str[3] = '\0';
|
||||||
}
|
}
|
||||||
else if (utf8 <= 0x10FFFF) /* 4-byte */
|
else if (codepoint <= 0x10FFFF)/* 4-byte */
|
||||||
{
|
{
|
||||||
str[0] = (char) (((utf8 >> 18) & 0x07) | 0xF0);
|
str[0] = (char) (((codepoint >> 18) & 0x07) | 0xF0);
|
||||||
str[1] = (char) (((utf8 >> 12) & 0x3F) | 0x80);
|
str[1] = (char) (((codepoint >> 12) & 0x3F) | 0x80);
|
||||||
str[2] = (char) (((utf8 >> 6) & 0x3F) | 0x80);
|
str[2] = (char) (((codepoint >> 6) & 0x3F) | 0x80);
|
||||||
str[3] = (char) (((utf8 >> 0) & 0x3F) | 0x80);
|
str[3] = (char) (((codepoint >> 0) & 0x3F) | 0x80);
|
||||||
str[4] = '\0';
|
str[4] = '\0';
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
|
|
@ -26,6 +26,7 @@
|
||||||
#include <Io.h>
|
#include <Io.h>
|
||||||
#include <Memory.h>
|
#include <Memory.h>
|
||||||
#include <Util.h>
|
#include <Util.h>
|
||||||
|
#include <Int.h>
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
@ -49,11 +50,11 @@ struct Stream
|
||||||
{
|
{
|
||||||
Io *io;
|
Io *io;
|
||||||
|
|
||||||
char *rBuf;
|
UInt8 *rBuf;
|
||||||
size_t rLen;
|
size_t rLen;
|
||||||
size_t rOff;
|
size_t rOff;
|
||||||
|
|
||||||
char *wBuf;
|
UInt8 *wBuf;
|
||||||
size_t wLen;
|
size_t wLen;
|
||||||
|
|
||||||
char *ugBuf;
|
char *ugBuf;
|
||||||
|
|
|
@ -39,14 +39,21 @@
|
||||||
* is a standard library header.
|
* is a standard library header.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <Int.h>
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Take a UTF-8 codepoint and encode it into a string buffer containing
|
* Convert a UTF-16 code unit, or a high/low surrogate pair of code units, into a Unicode codepoint.
|
||||||
|
*/
|
||||||
|
extern UInt32 StrUtf16Decode(UInt16, UInt16);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Take a Unicode codepoint and encode it into a string buffer containing
|
||||||
* between 1 and 4 bytes. The string buffer is allocated on the heap,
|
* between 1 and 4 bytes. The string buffer is allocated on the heap,
|
||||||
* so it should be freed when it is no longer needed.
|
* so it should be freed when it is no longer needed.
|
||||||
*/
|
*/
|
||||||
extern char * StrUtf8Encode(unsigned long);
|
extern char * StrUtf8Encode(UInt32);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Duplicate a null-terminated string, returning a new string on the
|
* Duplicate a null-terminated string, returning a new string on the
|
||||||
|
|
|
@ -256,7 +256,8 @@ Main(Array * args)
|
||||||
switch (flag)
|
switch (flag)
|
||||||
{
|
{
|
||||||
case FLAG_SELECT:
|
case FLAG_SELECT:
|
||||||
query(input, json, canonical); /* This will implicitly free json */
|
query(input, json, canonical); /* This will implicitly
|
||||||
|
* free json */
|
||||||
break;
|
break;
|
||||||
case FLAG_ENCODE:
|
case FLAG_ENCODE:
|
||||||
encode(input, canonical);
|
encode(input, canonical);
|
||||||
|
|
Loading…
Reference in a new issue