webmcp
diff libraries/json/json.c @ 167:84497222db4e
Decoding of Unicode escapes in JSON strings
author | jbe |
---|---|
date | Fri Aug 01 02:31:13 2014 +0200 (2014-08-01) |
parents | 7885d1ae35ff |
children | e618ccd017a3 |
line diff
1.1 --- a/libraries/json/json.c Thu Jul 31 23:33:28 2014 +0200 1.2 +++ b/libraries/json/json.c Fri Aug 01 02:31:13 2014 +0200 1.3 @@ -100,6 +100,10 @@ 1.4 return json_mark(L, json_regpointer(arraymt)); 1.5 } 1.6 1.7 +#define json_utf16_surrogate(x) ((x) >= 0xD800 && (x) <= 0xDFFF) 1.8 +#define json_utf16_lead(x) ((x) >= 0xD800 && (x) <= 0xDBFF) 1.9 +#define json_utf16_tail(x) ((x) >= 0xDC00 && (x) <= 0xDFFF) 1.10 + 1.11 // internal states of JSON parser: 1.12 #define JSON_STATE_VALUE 0 1.13 #define JSON_STATE_OBJECT_KEY 1 1.14 @@ -115,8 +119,24 @@ 1.15 #define json_import_arraymt_idx 3 1.16 #define json_import_shadowtbl_idx 4 1.17 1.18 +// macro for hex decoding: 1.19 +#define json_import_readhex(x) \ 1.20 + do { \ 1.21 + x = 0; \ 1.22 + for (i=0; i<4; i++) { \ 1.23 + x <<= 4; \ 1.24 + c = str[pos++]; \ 1.25 + if (c >= '0' && c <= '9') x += c - '0'; \ 1.26 + else if (c >= 'A' && c <= 'F') x += c - 'A' + 10; \ 1.27 + else if (c >= 'a' && c <= 'f') x += c - 'a' + 10; \ 1.28 + else if (c == 0) goto json_import_unexpected_eof; \ 1.29 + else goto json_import_unexpected_escape; \ 1.30 + } \ 1.31 + } while (0) 1.32 + 1.33 // decodes a JSON document: 1.34 static int json_import(lua_State *L) { 1.35 + int i; // loop variable 1.36 const char *str; // string to parse 1.37 size_t total; // total length of string to parse 1.38 size_t pos = 0; // current position in string to parse 1.39 @@ -126,6 +146,8 @@ 1.40 luaL_Buffer luabuf; // Lua buffer to decode JSON string values 1.41 char *cbuf; // C buffer to decode JSON string values 1.42 size_t outlen; // maximum length or write position of C buffer 1.43 + long codepoint; // decoded UTF-16 character or higher codepoint 1.44 + long utf16tail; // second decoded UTF-16 character (surrogate tail) 1.45 size_t arraylen; // variable to temporarily store the array length 1.46 // require string as argument and convert to C string with length information: 1.47 str = luaL_checklstring(L, 1, &total); 1.48 @@ -157,10 +179,14 @@ 1.49 // if end 
of JSON document was expected, then return top element of stack as result: 1.50 if (mode == JSON_STATE_END) return 1; 1.51 // otherwise, the JSON document was malformed: 1.52 - json_import_unexpected_eof: 1.53 - lua_pushnil(L); 1.54 - if (level == 0) lua_pushliteral(L, "Empty string"); 1.55 - else lua_pushliteral(L, "Unexpected end of JSON document"); 1.56 + if (level == 0) { 1.57 + lua_pushnil(L); 1.58 + lua_pushliteral(L, "Empty string"); 1.59 + } else { 1.60 + json_import_unexpected_eof: 1.61 + lua_pushnil(L); 1.62 + lua_pushliteral(L, "Unexpected end of JSON document"); 1.63 + } 1.64 return 2; 1.65 // new JSON object: 1.66 case '{': 1.67 @@ -351,11 +377,51 @@ 1.68 case 't': cbuf[outlen++] = '\t'; break; 1.69 // unescaping of UTF-16 characters 1.70 case 'u': 1.71 - lua_pushnil(L); 1.72 - lua_pushliteral(L, "JSON unicode escape sequences are not implemented yet"); // TODO 1.73 - return 2; 1.74 + // decode 4 hex nibbles: 1.75 + json_import_readhex(codepoint); 1.76 + // handle surrogate character: 1.77 + if (json_utf16_surrogate(codepoint)) { 1.78 + // check if first surrogate is in valid range: 1.79 + if (json_utf16_lead(codepoint)) { 1.80 + // require second surrogate: 1.81 + if ((c = str[pos++]) != '\\' || (c = str[pos++]) != 'u') { 1.82 + if (c == 0) goto json_import_unexpected_eof; 1.83 + else goto json_import_wrong_surrogate; 1.84 + } 1.85 + // read 4 hex nibbles of second surrogate character: 1.86 + json_import_readhex(utf16tail); 1.87 + // check if second surrogate is in valid range: 1.88 + if (!json_utf16_tail(utf16tail)) goto json_import_wrong_surrogate; 1.89 + // calculate codepoint: 1.90 + codepoint = 0x10000 + (utf16tail - 0xDC00) + (codepoint - 0xD800) * 0x400; 1.91 + } else { 1.92 + // throw error for wrong surrogates: 1.93 + json_import_wrong_surrogate: 1.94 + lua_pushnil(L); 1.95 + lua_pushliteral(L, "Illegal UTF-16 surrogate in JSON string escape sequence"); 1.96 + return 2; 1.97 + } 1.98 + } 1.99 + // encode as UTF-8: 1.100 + if (codepoint < 
0x80) { 1.101 + cbuf[outlen++] = (char)codepoint; 1.102 + } else if (codepoint < 0x800) { 1.103 + cbuf[outlen++] = (char)(0xc0 | (codepoint >> 6)); 1.104 + cbuf[outlen++] = (char)(0x80 | (codepoint & 0x3f)); 1.105 + } else if (codepoint < 0x10000) { 1.106 + cbuf[outlen++] = (char)(0xe0 | (codepoint >> 12)); 1.107 + cbuf[outlen++] = (char)(0x80 | ((codepoint >> 6) & 0x3f)); 1.108 + cbuf[outlen++] = (char)(0x80 | (codepoint & 0x3f)); 1.109 + } else { 1.110 + cbuf[outlen++] = (char)(0xf0 | (codepoint >> 18)); 1.111 + cbuf[outlen++] = (char)(0x80 | ((codepoint >> 12) & 0x3f)); 1.112 + cbuf[outlen++] = (char)(0x80 | ((codepoint >> 6) & 0x3f)); 1.113 + cbuf[outlen++] = (char)(0x80 | (codepoint & 0x3f)); 1.114 + } 1.115 + break; 1.116 // unexpected escape sequence: 1.117 default: 1.118 + json_import_unexpected_escape: 1.119 lua_pushnil(L); 1.120 lua_pushliteral(L, "Unexpected string escape sequence in JSON document"); 1.121 return 2; 1.122 @@ -371,6 +437,8 @@ 1.123 // if JSON string is empty, 1.124 // push empty Lua string: 1.125 lua_pushliteral(L, ""); 1.126 + // consume closing quote: 1.127 + pos++; 1.128 } 1.129 // continue with processing of decoded string: 1.130 goto json_import_process_value; 1.131 @@ -379,8 +447,8 @@ 1.132 if ((c >= '0' && c <= '9') || c == '-' || c == '+') { 1.133 // for numbers, 1.134 // use strtod() call to parse a (double precision) floating point number: 1.135 + double numval; 1.136 char *endptr; 1.137 - double numval; 1.138 numval = strtod(str+pos, &endptr); 1.139 // catch parsing errors: 1.140 if (endptr == str+pos) goto json_import_syntax_error;