# HG changeset patch # User jbe # Date 1406853073 -7200 # Node ID 84497222db4e4580c995f7042950871a49d6039c # Parent 7885d1ae35ffbaf7adab6b14efdda0da8f4c0aa2 Decoding of unicode escapes in JSON strings diff -r 7885d1ae35ff -r 84497222db4e libraries/json/json.c --- a/libraries/json/json.c Thu Jul 31 23:33:28 2014 +0200 +++ b/libraries/json/json.c Fri Aug 01 02:31:13 2014 +0200 @@ -100,6 +100,10 @@ return json_mark(L, json_regpointer(arraymt)); } +#define json_utf16_surrogate(x) ((x) >= 0xD800 && (x) <= 0xDFFF) +#define json_utf16_lead(x) ((x) >= 0xD800 && (x) <= 0xDBFF) +#define json_utf16_tail(x) ((x) >= 0xDC00 && (x) <= 0xDFFF) + // internal states of JSON parser: #define JSON_STATE_VALUE 0 #define JSON_STATE_OBJECT_KEY 1 @@ -115,8 +119,24 @@ #define json_import_arraymt_idx 3 #define json_import_shadowtbl_idx 4 +// macro for hex decoding: +#define json_import_readhex(x) \ + do { \ + x = 0; \ + for (i=0; i<4; i++) { \ + x <<= 4; \ + c = str[pos++]; \ + if (c >= '0' && c <= '9') x += c - '0'; \ + else if (c >= 'A' && c <= 'F') x += c - 'A' + 10; \ + else if (c >= 'a' && c <= 'f') x += c - 'a' + 10; \ + else if (c == 0) goto json_import_unexpected_eof; \ + else goto json_import_unexpected_escape; \ + } \ + } while (0) + // decodes a JSON document: static int json_import(lua_State *L) { + int i; // loop variable const char *str; // string to parse size_t total; // total length of string to parse size_t pos = 0; // current position in string to parse @@ -126,6 +146,8 @@ luaL_Buffer luabuf; // Lua buffer to decode JSON string values char *cbuf; // C buffer to decode JSON string values size_t outlen; // maximum length or write position of C buffer + long codepoint; // decoded UTF-16 character or higher codepoint + long utf16tail; // second decoded UTF-16 character (surrogate tail) size_t arraylen; // variable to temporarily store the array length // require string as argument and convert to C string with length information: str = luaL_checklstring(L, 1, &total); @@ -157,10 +179,14 @@ // if end of JSON document was expected, then return top element of stack as result: if (mode == JSON_STATE_END) return 1; // otherwise, the JSON document was malformed: - json_import_unexpected_eof: - lua_pushnil(L); - if (level == 0) lua_pushliteral(L, "Empty string"); - else lua_pushliteral(L, "Unexpected end of JSON document"); + if (level == 0) { + lua_pushnil(L); + lua_pushliteral(L, "Empty string"); + } else { + json_import_unexpected_eof: + lua_pushnil(L); + lua_pushliteral(L, "Unexpected end of JSON document"); + } return 2; // new JSON object: case '{': @@ -351,11 +377,51 @@ case 't': cbuf[outlen++] = '\t'; break; // unescaping of UTF-16 characters case 'u': - lua_pushnil(L); - lua_pushliteral(L, "JSON unicode escape sequences are not implemented yet"); // TODO - return 2; + // decode 4 hex nibbles: + json_import_readhex(codepoint); + // handle surrogate character: + if (json_utf16_surrogate(codepoint)) { + // check if first surrogate is in valid range: + if (json_utf16_lead(codepoint)) { + // require second surrogate: + if ((c = str[pos++]) != '\\' || (c = str[pos++]) != 'u') { + if (c == 0) goto json_import_unexpected_eof; + else goto json_import_wrong_surrogate; + } + // read 4 hex nibbles of second surrogate character: + json_import_readhex(utf16tail); + // check if second surrogate is in valid range: + if (!json_utf16_tail(utf16tail)) goto json_import_wrong_surrogate; + // calculate codepoint: + codepoint = 0x10000 + (utf16tail - 0xDC00) + (codepoint - 0xD800) * 0x400; + } else { + // throw error for wrong surrogates: + json_import_wrong_surrogate: + lua_pushnil(L); + lua_pushliteral(L, "Illegal UTF-16 surrogate in JSON string escape sequence"); + return 2; + } + } + // encode as UTF-8: + if (codepoint < 0x80) { + cbuf[outlen++] = (char)codepoint; + } else if (codepoint < 0x800) { + cbuf[outlen++] = (char)(0xc0 | (codepoint >> 6)); + cbuf[outlen++] = (char)(0x80 | (codepoint & 0x3f)); + } else if (codepoint < 0x10000) { + cbuf[outlen++] = (char)(0xe0 | (codepoint >> 12)); + cbuf[outlen++] = (char)(0x80 | ((codepoint >> 6) & 0x3f)); + cbuf[outlen++] = (char)(0x80 | (codepoint & 0x3f)); + } else { + cbuf[outlen++] = (char)(0xf0 | (codepoint >> 18)); + cbuf[outlen++] = (char)(0x80 | ((codepoint >> 12) & 0x3f)); + cbuf[outlen++] = (char)(0x80 | ((codepoint >> 6) & 0x3f)); + cbuf[outlen++] = (char)(0x80 | (codepoint & 0x3f)); + } + break; // unexpected escape sequence: default: + json_import_unexpected_escape: lua_pushnil(L); lua_pushliteral(L, "Unexpected string escape sequence in JSON document"); return 2; @@ -371,6 +437,8 @@ // if JSON string is empty, // push empty Lua string: lua_pushliteral(L, ""); + // consume closing quote: + pos++; } // continue with processing of decoded string: goto json_import_process_value; @@ -379,8 +447,8 @@ if ((c >= '0' && c <= '9') || c == '-' || c == '+') { // for numbers, // use strtod() call to parse a (double precision) floating point number: + double numval; char *endptr; - double numval; numval = strtod(str+pos, &endptr); // catch parsing errors: if (endptr == str+pos) goto json_import_syntax_error;