webmcp

changeset 167:84497222db4e

Decoding of Unicode escapes in JSON strings
author jbe
date Fri Aug 01 02:31:13 2014 +0200 (2014-08-01)
parents 7885d1ae35ff
children e618ccd017a3
files libraries/json/json.c
line diff
     1.1 --- a/libraries/json/json.c	Thu Jul 31 23:33:28 2014 +0200
     1.2 +++ b/libraries/json/json.c	Fri Aug 01 02:31:13 2014 +0200
     1.3 @@ -100,6 +100,10 @@
     1.4    return json_mark(L, json_regpointer(arraymt));
     1.5  }
     1.6  
     1.7 +#define json_utf16_surrogate(x) ((x) >= 0xD800 && (x) <= 0xDFFF)
     1.8 +#define json_utf16_lead(x) ((x) >= 0xD800 && (x) <= 0xDBFF)
     1.9 +#define json_utf16_tail(x) ((x) >= 0xDC00 && (x) <= 0xDFFF)
    1.10 +
    1.11  // internal states of JSON parser:
    1.12  #define JSON_STATE_VALUE 0
    1.13  #define JSON_STATE_OBJECT_KEY 1
    1.14 @@ -115,8 +119,24 @@
    1.15  #define json_import_arraymt_idx 3
    1.16  #define json_import_shadowtbl_idx 4
    1.17  
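    1.18 +// macro for hex decoding: reads 4 hex digits from str at pos into x (uses i, c, str, pos and the error labels of json_import):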
    1.19 +#define json_import_readhex(x) \
    1.20 +  do { \
    1.21 +    x = 0; \
    1.22 +    for (i=0; i<4; i++) { \
    1.23 +      x <<= 4; \
    1.24 +      c = str[pos++]; \
    1.25 +      if (c >= '0' && c <= '9') x += c - '0'; \
    1.26 +      else if (c >= 'A' && c <= 'F') x += c - 'A' + 10; \
    1.27 +      else if (c >= 'a' && c <= 'f') x += c - 'a' + 10; \
    1.28 +      else if (c == 0) goto json_import_unexpected_eof; \
    1.29 +      else goto json_import_unexpected_escape; \
    1.30 +    } \
    1.31 +  } while (0)
    1.32 +
    1.33  // decodes a JSON document:
    1.34  static int json_import(lua_State *L) {
    1.35 +  int i;             // loop variable
    1.36    const char *str;   // string to parse
    1.37    size_t total;      // total length of string to parse
    1.38    size_t pos = 0;    // current position in string to parse
    1.39 @@ -126,6 +146,8 @@
    1.40    luaL_Buffer luabuf;  // Lua buffer to decode JSON string values
    1.41    char *cbuf;          // C buffer to decode JSON string values
    1.42    size_t outlen;       // maximum length or write position of C buffer
    1.43 +  long codepoint;      // decoded UTF-16 code unit, or full code point after combining a surrogate pair
    1.44 +  long utf16tail;      // second decoded UTF-16 code unit (tail surrogate of a pair)
    1.45    size_t arraylen;     // variable to temporarily store the array length
    1.46    // require string as argument and convert to C string with length information:
    1.47    str = luaL_checklstring(L, 1, &total);
    1.48 @@ -157,10 +179,14 @@
    1.49      // if end of JSON document was expected, then return top element of stack as result:
    1.50      if (mode == JSON_STATE_END) return 1;
    1.51      // otherwise, the JSON document was malformed:
    1.52 -    json_import_unexpected_eof:
    1.53 -    lua_pushnil(L);
    1.54 -    if (level == 0) lua_pushliteral(L, "Empty string");
    1.55 -    else lua_pushliteral(L, "Unexpected end of JSON document");
    1.56 +    if (level == 0) {
    1.57 +      lua_pushnil(L);
    1.58 +      lua_pushliteral(L, "Empty string");
    1.59 +    } else {
    1.60 +      json_import_unexpected_eof:
    1.61 +      lua_pushnil(L);
    1.62 +      lua_pushliteral(L, "Unexpected end of JSON document");
    1.63 +    }
    1.64      return 2;
    1.65    // new JSON object:
    1.66    case '{':
    1.67 @@ -351,11 +377,51 @@
    1.68            case 't': cbuf[outlen++] = '\t'; break;
    1.69            // unescaping of UTF-16 characters
    1.70            case 'u':
    1.71 -            lua_pushnil(L);
    1.72 -            lua_pushliteral(L, "JSON unicode escape sequences are not implemented yet");  // TODO
    1.73 -            return 2;
    1.74 +            // decode 4 hex nibbles:
    1.75 +            json_import_readhex(codepoint);
    1.76 +            // handle surrogate character:
    1.77 +            if (json_utf16_surrogate(codepoint)) {
    1.78 +              // check if first surrogate is in valid range:
    1.79 +              if (json_utf16_lead(codepoint)) {
    1.80 +                // require second surrogate:
    1.81 +                if ((c = str[pos++]) != '\\' || (c = str[pos++]) != 'u') {
    1.82 +                  if (c == 0) goto json_import_unexpected_eof;
    1.83 +                  else goto json_import_wrong_surrogate;
    1.84 +                }
    1.85 +                // read 4 hex nibbles of second surrogate character:
    1.86 +                json_import_readhex(utf16tail);
    1.87 +                // check if second surrogate is in valid range:
    1.88 +                if (!json_utf16_tail(utf16tail)) goto json_import_wrong_surrogate;
    1.89 +                // calculate codepoint:
    1.90 +                codepoint = 0x10000 + (utf16tail - 0xDC00) + (codepoint - 0xD800) * 0x400;
    1.91 +              } else {
    1.92 +                // throw error for wrong surrogates:
    1.93 +                json_import_wrong_surrogate:
    1.94 +                lua_pushnil(L);
    1.95 +                lua_pushliteral(L, "Illegal UTF-16 surrogate in JSON string escape sequence");
    1.96 +                return 2;
    1.97 +              }
    1.98 +            }
    1.99 +            // encode as UTF-8:
   1.100 +            if (codepoint < 0x80) {
   1.101 +              cbuf[outlen++] = (char)codepoint;
   1.102 +            } else if (codepoint < 0x800) {
   1.103 +              cbuf[outlen++] = (char)(0xc0 | (codepoint >> 6));
   1.104 +              cbuf[outlen++] = (char)(0x80 | (codepoint & 0x3f));
   1.105 +            } else if (codepoint < 0x10000) {
   1.106 +              cbuf[outlen++] = (char)(0xe0 | (codepoint >> 12));
   1.107 +              cbuf[outlen++] = (char)(0x80 | ((codepoint >> 6) & 0x3f));
   1.108 +              cbuf[outlen++] = (char)(0x80 | (codepoint & 0x3f));
   1.109 +            } else {
   1.110 +              cbuf[outlen++] = (char)(0xf0 | (codepoint >> 18));
   1.111 +              cbuf[outlen++] = (char)(0x80 | ((codepoint >> 12) & 0x3f));
   1.112 +              cbuf[outlen++] = (char)(0x80 | ((codepoint >> 6) & 0x3f));
   1.113 +              cbuf[outlen++] = (char)(0x80 | (codepoint & 0x3f));
   1.114 +            }
   1.115 +            break;
   1.116            // unexpected escape sequence:
   1.117            default:
   1.118 +            json_import_unexpected_escape:
   1.119              lua_pushnil(L);
   1.120              lua_pushliteral(L, "Unexpected string escape sequence in JSON document");
   1.121              return 2;
   1.122 @@ -371,6 +437,8 @@
   1.123        // if JSON string is empty,
   1.124        // push empty Lua string:
   1.125        lua_pushliteral(L, "");
   1.126 +      // consume closing quote:
   1.127 +      pos++;
   1.128      }
   1.129      // continue with processing of decoded string:
   1.130      goto json_import_process_value;
   1.131 @@ -379,8 +447,8 @@
   1.132    if ((c >= '0' && c <= '9') || c == '-' || c == '+') {
   1.133      // for numbers,
   1.134      // use strtod() call to parse a (double precision) floating point number:
   1.135 +    double numval;
   1.136      char *endptr;
   1.137 -    double numval;
   1.138      numval = strtod(str+pos, &endptr);
   1.139      // catch parsing errors:
   1.140      if (endptr == str+pos) goto json_import_syntax_error;

Impressum / About Us