YAJL support for JSON5 \xXX hex escapes in strings, with tests

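Teach the lexer and string decoder to recognize and decode \xXX escapes (exactly two hex digits) in JSON5 mode.
Teach the encoder to emit \xXX instead of \u00XX for control characters in JSON5 mode, and to use the short \0 and \v escapes there.
Add a separate lexer error message for a non-hex character after '\x'; the existing '\u' error is renamed to yajl_lex_string_invalid_hex_u_char.
Add test cases showing that \xXX escapes decode correctly and that the bad-digit check fires.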
This commit is contained in:
Andrew Johnson
2020-08-05 00:06:53 -05:00
parent 55f4e55383
commit 549d6f67e3
6 changed files with 91 additions and 13 deletions

View File

@@ -33,13 +33,22 @@ yajl_string_encode(const yajl_print_t print,
void * ctx,
const unsigned char * str,
size_t len,
int escape_solidus)
int escape_solidus,
int output_json5)
{
size_t beg = 0;
size_t end = 0;
char hexBuf[7];
hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0';
hexBuf[6] = 0;
char *hexAt;
if (output_json5) {
hexBuf[0] = '\\'; hexBuf[1] = 'x';
hexBuf[4] = 0;
hexAt = &hexBuf[2];
} else {
hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0';
hexBuf[6] = 0;
hexAt = &hexBuf[4];
}
while (end < len) {
const char * escaped = NULL;
@@ -57,9 +66,20 @@ yajl_string_encode(const yajl_print_t print,
case '\f': escaped = "\\f"; break;
case '\b': escaped = "\\b"; break;
case '\t': escaped = "\\t"; break;
case '\0':
if (output_json5) {
escaped = "\\0"; break;
}
goto ashex;
case '\v':
if (output_json5) {
escaped = "\\v"; break;
}
goto ashex;
default:
if ((unsigned char) str[end] < 32) {
CharToHex(str[end], hexBuf + 4);
ashex:
CharToHex(str[end], hexAt);
escaped = hexBuf;
}
break;
@@ -75,10 +95,10 @@ yajl_string_encode(const yajl_print_t print,
print(ctx, (const char *) (str + beg), end - beg);
}
static void hexToDigit(unsigned int * val, const unsigned char * hex)
static void hexToDigit(unsigned int * val, unsigned int len, const unsigned char * hex)
{
unsigned int i;
for (i=0;i<4;i++) {
for (i=0;i<len;i++) {
unsigned char c = hex[i];
if (c >= 'A') c = (c & ~0x20) - 7;
c -= '0';
@@ -133,14 +153,14 @@ void yajl_string_decode(yajl_buf buf, const unsigned char * str,
case 't': unescaped = "\t"; break;
case 'u': {
unsigned int codepoint = 0;
hexToDigit(&codepoint, str + ++end);
hexToDigit(&codepoint, 4, str + ++end);
end+=3;
/* check if this is a surrogate */
if ((codepoint & 0xFC00) == 0xD800) {
end++;
if (str[end] == '\\' && str[end + 1] == 'u') {
unsigned int surrogate = 0;
hexToDigit(&surrogate, str + end + 2);
hexToDigit(&surrogate, 4, str + end + 2);
codepoint =
(((codepoint & 0x3F) << 10) |
((((codepoint >> 6) & 0xF) + 1) << 16) |
@@ -177,6 +197,15 @@ void yajl_string_decode(yajl_buf buf, const unsigned char * str,
beg = ++end;
continue;
case 'v': unescaped = "\v"; break;
case 'x': {
unsigned int codepoint = 0;
hexToDigit(&codepoint, 2, str + ++end);
end++;
utf8Buf[0] = (char) codepoint;
yajl_buf_append(buf, utf8Buf, 1);
beg = ++end;
continue;
}
default:
utf8Buf[0] = str[end];
utf8Buf[1] = 0;

View File

@@ -28,7 +28,8 @@ void yajl_string_encode(const yajl_print_t printer,
void * ctx,
const unsigned char * str,
size_t length,
int escape_solidus);
int escape_solidus,
int output_json5);
void yajl_string_decode(yajl_buf buf, const unsigned char * str,
size_t length);

View File

@@ -270,7 +270,8 @@ yajl_gen_string(yajl_gen g, const unsigned char * str,
}
else {
g->print(g->ctx, "\"", 1);
yajl_string_encode(g->print, g->ctx, str, len, g->flags & yajl_gen_escape_solidus);
yajl_string_encode(g->print, g->ctx, str, len, g->flags & yajl_gen_escape_solidus,
g->flags & yajl_gen_json5);
g->print(g->ctx, "\"", 1);
}
APPENDED_ATOM;

View File

@@ -332,7 +332,21 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
if (!(charLookupTable[curChar] & VHC)) {
/* back up to offending char */
unreadChar(lexer, offset);
lexer->error = yajl_lex_string_invalid_hex_char;
lexer->error = yajl_lex_string_invalid_hex_u_char;
goto finish_string_lex;
}
}
}
else if (lexer->allowJson5 && curChar == 'x') {
unsigned int i = 0;
for (i=0;i<2;i++) {
STR_CHECK_EOF;
curChar = readChar(lexer, jsonText, offset);
if (!(charLookupTable[curChar] & VHC)) {
/* back up to offending char */
unreadChar(lexer, offset);
lexer->error = yajl_lex_string_invalid_hex_x_char;
goto finish_string_lex;
}
}
@@ -905,9 +919,12 @@ yajl_lex_error_to_string(yajl_lex_error error)
"which it may not.";
case yajl_lex_string_invalid_json_char:
return "invalid character inside string.";
case yajl_lex_string_invalid_hex_char:
case yajl_lex_string_invalid_hex_u_char:
return "invalid (non-hex) character occurs after '\\u' inside "
"string.";
case yajl_lex_string_invalid_hex_x_char:
return "invalid (non-hex) character occurs after '\\x' inside "
"string.";
case yajl_lex_invalid_char:
return "invalid char in json text.";
case yajl_lex_invalid_string:

View File

@@ -107,7 +107,8 @@ typedef enum {
yajl_lex_string_invalid_utf8,
yajl_lex_string_invalid_escaped_char,
yajl_lex_string_invalid_json_char,
yajl_lex_string_invalid_hex_char,
yajl_lex_string_invalid_hex_u_char,
yajl_lex_string_invalid_hex_x_char,
yajl_lex_invalid_char,
yajl_lex_invalid_string,
yajl_lex_missing_integer_after_decimal,

View File

@@ -7,6 +7,21 @@
sub cases {
my $VAR1 = [
{
name => "codepoints_from_hex",
opts => [
-5
],
input => [
"\"\\x0a\\x07\\x21\\x40\\x7c\"",
""
],
gives => [
"string: '",
"\a!\@|'",
"memory leaks:\t0"
]
},
{
name => "doubles",
opts => [
@@ -94,6 +109,20 @@ sub cases {
"memory leaks:\t0"
]
},
{
name => "invalid_hex_char",
opts => [
-5
],
input => [
"\"yabba dabba do \\x1g !!\"",
""
],
gives => [
"lexical error: invalid (non-hex) character occurs after '\\x' inside string.",
"memory leaks:\t0"
]
},
{
name => "map_identifiers",
opts => [