From 549d6f67e32e16705343882df04aa7a3f314efca Mon Sep 17 00:00:00 2001 From: Andrew Johnson Date: Wed, 5 Aug 2020 00:06:53 -0500 Subject: [PATCH] YAJL support for JSON5 \xXX hex escapes in strings, with tests Teach the lexer/parser to recognize and decode them in JSON5 mode. Teach the encoder to use them in JSON5 mode. Add another error message for bad hex digits. Test cases to show they work, and that the bad-digit check fires. --- modules/libcom/src/yajl/yajl_encode.c | 45 ++++++++++++++++++++++----- modules/libcom/src/yajl/yajl_encode.h | 3 +- modules/libcom/src/yajl/yajl_gen.c | 3 +- modules/libcom/src/yajl/yajl_lex.c | 21 +++++++++++-- modules/libcom/src/yajl/yajl_lex.h | 3 +- modules/libcom/test/yajlTestCases.pm | 29 +++++++++++++++++ 6 files changed, 91 insertions(+), 13 deletions(-) diff --git a/modules/libcom/src/yajl/yajl_encode.c b/modules/libcom/src/yajl/yajl_encode.c index 947dce1d7..732f451f9 100644 --- a/modules/libcom/src/yajl/yajl_encode.c +++ b/modules/libcom/src/yajl/yajl_encode.c @@ -33,13 +33,22 @@ yajl_string_encode(const yajl_print_t print, void * ctx, const unsigned char * str, size_t len, - int escape_solidus) + int escape_solidus, + int output_json5) { size_t beg = 0; size_t end = 0; char hexBuf[7]; - hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0'; - hexBuf[6] = 0; + char *hexAt; + if (output_json5) { + hexBuf[0] = '\\'; hexBuf[1] = 'x'; + hexBuf[4] = 0; + hexAt = &hexBuf[2]; + } else { + hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0'; + hexBuf[6] = 0; + hexAt = &hexBuf[4]; + } while (end < len) { const char * escaped = NULL; @@ -57,9 +66,20 @@ yajl_string_encode(const yajl_print_t print, case '\f': escaped = "\\f"; break; case '\b': escaped = "\\b"; break; case '\t': escaped = "\\t"; break; + case '\0': + if (output_json5) { + escaped = "\\0"; break; + } + goto ashex; + case '\v': + if (output_json5) { + escaped = "\\v"; break; + } + goto ashex; default: if ((unsigned char) str[end] < 32) { - 
CharToHex(str[end], hexBuf + 4); + ashex: + CharToHex(str[end], hexAt); escaped = hexBuf; } break; @@ -75,10 +95,10 @@ yajl_string_encode(const yajl_print_t print, print(ctx, (const char *) (str + beg), end - beg); } -static void hexToDigit(unsigned int * val, const unsigned char * hex) +static void hexToDigit(unsigned int * val, unsigned int len, const unsigned char * hex) { unsigned int i; - for (i=0;i<4;i++) { + for (i=0;i<len;i++) { unsigned char c = hex[i]; if (c >= 'A') c = (c & ~0x20) - 7; c -= '0'; @@ -133,14 +153,14 @@ void yajl_string_decode(yajl_buf buf, const unsigned char * str, case 't': unescaped = "\t"; break; case 'u': { unsigned int codepoint = 0; - hexToDigit(&codepoint, str + ++end); + hexToDigit(&codepoint, 4, str + ++end); end+=3; /* check if this is a surrogate */ if ((codepoint & 0xFC00) == 0xD800) { end++; if (str[end] == '\\' && str[end + 1] == 'u') { unsigned int surrogate = 0; - hexToDigit(&surrogate, str + end + 2); + hexToDigit(&surrogate, 4, str + end + 2); codepoint = (((codepoint & 0x3F) << 10) | ((((codepoint >> 6) & 0xF) + 1) << 16) | @@ -177,6 +197,15 @@ void yajl_string_decode(yajl_buf buf, const unsigned char * str, beg = ++end; continue; case 'v': unescaped = "\v"; break; + case 'x': { + unsigned int codepoint = 0; + hexToDigit(&codepoint, 2, str + ++end); + end++; + utf8Buf[0] = (char) codepoint; + yajl_buf_append(buf, utf8Buf, 1); + beg = ++end; + continue; + } default: utf8Buf[0] = str[end]; utf8Buf[1] = 0; diff --git a/modules/libcom/src/yajl/yajl_encode.h b/modules/libcom/src/yajl/yajl_encode.h index cb3895f9a..fd58dec9c 100644 --- a/modules/libcom/src/yajl/yajl_encode.h +++ b/modules/libcom/src/yajl/yajl_encode.h @@ -28,7 +28,8 @@ void yajl_string_encode(const yajl_print_t printer, void * ctx, const unsigned char * str, size_t length, - int escape_solidus); + int escape_solidus, + int output_json5); void yajl_string_decode(yajl_buf buf, const unsigned char * str, size_t length); diff --git a/modules/libcom/src/yajl/yajl_gen.c b/modules/libcom/src/yajl/yajl_gen.c 
index d5f5fdcd7..7d86ec8c7 100644 --- a/modules/libcom/src/yajl/yajl_gen.c +++ b/modules/libcom/src/yajl/yajl_gen.c @@ -270,7 +270,8 @@ yajl_gen_string(yajl_gen g, const unsigned char * str, } else { g->print(g->ctx, "\"", 1); - yajl_string_encode(g->print, g->ctx, str, len, g->flags & yajl_gen_escape_solidus); + yajl_string_encode(g->print, g->ctx, str, len, g->flags & yajl_gen_escape_solidus, + g->flags & yajl_gen_json5); g->print(g->ctx, "\"", 1); } APPENDED_ATOM; diff --git a/modules/libcom/src/yajl/yajl_lex.c b/modules/libcom/src/yajl/yajl_lex.c index dca39a55a..f780a3b0a 100644 --- a/modules/libcom/src/yajl/yajl_lex.c +++ b/modules/libcom/src/yajl/yajl_lex.c @@ -332,7 +332,21 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, if (!(charLookupTable[curChar] & VHC)) { /* back up to offending char */ unreadChar(lexer, offset); - lexer->error = yajl_lex_string_invalid_hex_char; + lexer->error = yajl_lex_string_invalid_hex_u_char; + goto finish_string_lex; + } + } + } + else if (lexer->allowJson5 && curChar == 'x') { + unsigned int i = 0; + + for (i=0;i<2;i++) { + STR_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if (!(charLookupTable[curChar] & VHC)) { + /* back up to offending char */ + unreadChar(lexer, offset); + lexer->error = yajl_lex_string_invalid_hex_x_char; goto finish_string_lex; } } @@ -905,9 +919,12 @@ yajl_lex_error_to_string(yajl_lex_error error) "which it may not."; case yajl_lex_string_invalid_json_char: return "invalid character inside string."; - case yajl_lex_string_invalid_hex_char: + case yajl_lex_string_invalid_hex_u_char: return "invalid (non-hex) character occurs after '\\u' inside " "string."; + case yajl_lex_string_invalid_hex_x_char: + return "invalid (non-hex) character occurs after '\\x' inside " + "string."; case yajl_lex_invalid_char: return "invalid char in json text."; case yajl_lex_invalid_string: diff --git a/modules/libcom/src/yajl/yajl_lex.h b/modules/libcom/src/yajl/yajl_lex.h index 
ebe647bd2..7c2a6b9d8 100644 --- a/modules/libcom/src/yajl/yajl_lex.h +++ b/modules/libcom/src/yajl/yajl_lex.h @@ -107,7 +107,8 @@ typedef enum { yajl_lex_string_invalid_utf8, yajl_lex_string_invalid_escaped_char, yajl_lex_string_invalid_json_char, - yajl_lex_string_invalid_hex_char, + yajl_lex_string_invalid_hex_u_char, + yajl_lex_string_invalid_hex_x_char, yajl_lex_invalid_char, yajl_lex_invalid_string, yajl_lex_missing_integer_after_decimal, diff --git a/modules/libcom/test/yajlTestCases.pm b/modules/libcom/test/yajlTestCases.pm index 2eef474b1..f2e7a8f93 100644 --- a/modules/libcom/test/yajlTestCases.pm +++ b/modules/libcom/test/yajlTestCases.pm @@ -7,6 +7,21 @@ sub cases { my $VAR1 = [ + { + name => "codepoints_from_hex", + opts => [ + -5 + ], + input => [ + "\"\\x0a\\x07\\x21\\x40\\x7c\"", + "" + ], + gives => [ + "string: '", + "\a!\@|'", + "memory leaks:\t0" + ] + }, { name => "doubles", opts => [ @@ -94,6 +109,20 @@ sub cases { "memory leaks:\t0" ] }, + { + name => "invalid_hex_char", + opts => [ + -5 + ], + input => [ + "\"yabba dabba do \\x1g !!\"", + "" + ], + gives => [ + "lexical error: invalid (non-hex) character occurs after '\\x' inside string.", + "memory leaks:\t0" + ] + }, { name => "map_identifiers", opts => [