utf8proc
diff utf8proc.c @ 2:aaad485d5335
Version 0.3
- changed normalization from NFC to NFKC for postgresql unifold function
- added support to mark the beginning of a grapheme cluster with 0xFF (option: CHARBOUND)
- added the Ruby method String#chars, which returns an array of UTF-8 encoded grapheme clusters
- added NLF2LF transformation in postgresql unifold function
- added the DECOMPOSE option; if you use neither COMPOSE nor DECOMPOSE, no normalization will be performed (different from previous versions)
- using integer constants rather than C-strings for character properties
- fixed (hopefully) a problem with the Ruby library on Mac OS X, which occurred when compiler optimization was switched on
- changed normalization from NFC to NFKC for postgresql unifold function
- added support to mark the beginning of a grapheme cluster with 0xFF (option: CHARBOUND)
- added the Ruby method String#chars, which returns an array of UTF-8 encoded grapheme clusters
- added NLF2LF transformation in postgresql unifold function
- added the DECOMPOSE option; if you use neither COMPOSE nor DECOMPOSE, no normalization will be performed (different from previous versions)
- using integer constants rather than C-strings for character properties
- fixed (hopefully) a problem with the Ruby library on Mac OS X, which occurred when compiler optimization was switched on
author | jbe |
---|---|
date | Fri Aug 04 12:00:00 2006 +0200 (2006-08-04) |
parents | 61a89ecc2fb9 |
children | 4ee0d5f54af1 |
line diff
1.1 --- a/utf8proc.c Tue Jun 20 12:00:00 2006 +0200 1.2 +++ b/utf8proc.c Fri Aug 04 12:00:00 2006 +0200 1.3 @@ -42,8 +42,8 @@ 1.4 1.5 /* 1.6 * File name: utf8proc.c 1.7 - * Version: 0.2 1.8 - * Last changed: 2006-05-31 1.9 + * Version: 0.3 1.10 + * Last changed: 2006-08-04 1.11 * 1.12 * Description: 1.13 * Implementation of libutf8proc. 1.14 @@ -81,6 +81,29 @@ 1.15 #define UTF8PROC_HANGUL_TCOUNT 28 1.16 #define UTF8PROC_HANGUL_NCOUNT 588 1.17 #define UTF8PROC_HANGUL_SCOUNT 11172 1.18 +// END is exclusive 1.19 +#define UTF8PROC_HANGUL_L_START 0x1100 1.20 +#define UTF8PROC_HANGUL_L_END 0x115A 1.21 +#define UTF8PROC_HANGUL_L_FILLER 0x115F 1.22 +#define UTF8PROC_HANGUL_V_START 0x1160 1.23 +#define UTF8PROC_HANGUL_V_END 0x11A3 1.24 +#define UTF8PROC_HANGUL_T_START 0x11A8 1.25 +#define UTF8PROC_HANGUL_T_END 0x11FA 1.26 +#define UTF8PROC_HANGUL_S_START 0xAC00 1.27 +#define UTF8PROC_HANGUL_S_END 0xD7A4 1.28 + 1.29 + 1.30 +#define UTF8PROC_BOUNDCLASS_START 0 1.31 +#define UTF8PROC_BOUNDCLASS_OTHER 1 1.32 +#define UTF8PROC_BOUNDCLASS_CR 2 1.33 +#define UTF8PROC_BOUNDCLASS_LF 3 1.34 +#define UTF8PROC_BOUNDCLASS_CONTROL 4 1.35 +#define UTF8PROC_BOUNDCLASS_EXTEND 5 1.36 +#define UTF8PROC_BOUNDCLASS_L 6 1.37 +#define UTF8PROC_BOUNDCLASS_V 7 1.38 +#define UTF8PROC_BOUNDCLASS_T 8 1.39 +#define UTF8PROC_BOUNDCLASS_LV 9 1.40 +#define UTF8PROC_BOUNDCLASS_LVT 10 1.41 1.42 1.43 const char *utf8proc_errmsg(ssize_t errcode) { 1.44 @@ -145,6 +168,12 @@ 1.45 dst[0] = 0xC0 + (uc >> 6); 1.46 dst[1] = 0x80 + (uc & 0x3F); 1.47 return 2; 1.48 + } else if (uc == 0xFFFF) { 1.49 + dst[0] = 0xFF; 1.50 + return 1; 1.51 + } else if (uc == 0xFFFE) { 1.52 + dst[0] = 0xFE; 1.53 + return 1; 1.54 } else if (uc < 0x10000) { 1.55 dst[0] = 0xE0 + (uc >> 12); 1.56 dst[1] = 0x80 + ((uc >> 6) & 0x3F); 1.57 @@ -169,13 +198,14 @@ 1.58 } 1.59 1.60 ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, 1.61 - int options) { 1.62 + int options, int *last_boundclass) { 1.63 // ASSERT: uc >= 0 && 
uc < 0x110000 1.64 const utf8proc_property_t *property; 1.65 int32_t hangul_sindex; 1.66 property = utf8proc_get_property(uc); 1.67 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; 1.68 - if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { 1.69 + if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && 1.70 + hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { 1.71 int32_t hangul_tindex; 1.72 if (bufsize >= 1) { 1.73 dst[0] = UTF8PROC_HANGUL_LBASE + 1.74 @@ -197,25 +227,79 @@ 1.75 for (casefold_entry = property->casefold_mapping; 1.76 *casefold_entry >= 0; casefold_entry++) { 1.77 written += utf8proc_decompose_char(*casefold_entry, dst+written, 1.78 - (bufsize > written) ? (bufsize - written) : 0, options); 1.79 + (bufsize > written) ? (bufsize - written) : 0, options, 1.80 + last_boundclass); 1.81 if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 1.82 } 1.83 return written; 1.84 - } else if (property->decomp_mapping && 1.85 + } else if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && 1.86 + property->decomp_mapping && 1.87 (!property->decomp_type || (options & UTF8PROC_COMPAT))) { 1.88 const int32_t *decomp_entry; 1.89 ssize_t written = 0; 1.90 for (decomp_entry = property->decomp_mapping; 1.91 *decomp_entry >= 0; decomp_entry++) { 1.92 written += utf8proc_decompose_char(*decomp_entry, dst+written, 1.93 - (bufsize > written) ? (bufsize - written) : 0, options); 1.94 + (bufsize > written) ? (bufsize - written) : 0, options, 1.95 + last_boundclass); 1.96 if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 1.97 } 1.98 return written; 1.99 - } else { 1.100 - if (bufsize >= 1) *dst = uc; 1.101 - return 1; 1.102 + } else if (options & UTF8PROC_CHARBOUND) { 1.103 + bool boundary; 1.104 + int tbc, lbc; 1.105 + int category; 1.106 + category = property->category; 1.107 + tbc = 1.108 + (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR : 1.109 + (uc == 0x000A) ? 
UTF8PROC_BOUNDCLASS_LF : 1.110 + ((category == UTF8PROC_CATEGORY_ZL || 1.111 + category == UTF8PROC_CATEGORY_ZP || 1.112 + category == UTF8PROC_CATEGORY_CC || 1.113 + category == UTF8PROC_CATEGORY_CF) && 1.114 + !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL : 1.115 + property->extend ? UTF8PROC_BOUNDCLASS_EXTEND : 1.116 + ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) || 1.117 + uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L : 1.118 + (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ? 1.119 + UTF8PROC_BOUNDCLASS_V : 1.120 + (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ? 1.121 + UTF8PROC_BOUNDCLASS_T : 1.122 + (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? ( 1.123 + ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ? 1.124 + UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT 1.125 + ) : 1.126 + UTF8PROC_BOUNDCLASS_OTHER; 1.127 + lbc = *last_boundclass; 1.128 + boundary = 1.129 + (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false : 1.130 + (lbc == UTF8PROC_BOUNDCLASS_START) ? true : 1.131 + (lbc == UTF8PROC_BOUNDCLASS_CR && 1.132 + tbc == UTF8PROC_BOUNDCLASS_LF) ? false : 1.133 + (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true : 1.134 + (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true : 1.135 + (lbc == UTF8PROC_BOUNDCLASS_L && 1.136 + (tbc == UTF8PROC_BOUNDCLASS_L || 1.137 + tbc == UTF8PROC_BOUNDCLASS_V || 1.138 + tbc == UTF8PROC_BOUNDCLASS_LV || 1.139 + tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : 1.140 + ((lbc == UTF8PROC_BOUNDCLASS_LV || 1.141 + lbc == UTF8PROC_BOUNDCLASS_V) && 1.142 + (tbc == UTF8PROC_BOUNDCLASS_V || 1.143 + tbc == UTF8PROC_BOUNDCLASS_T)) ? false : 1.144 + ((lbc == UTF8PROC_BOUNDCLASS_LVT || 1.145 + lbc == UTF8PROC_BOUNDCLASS_T) && 1.146 + tbc == UTF8PROC_BOUNDCLASS_T) ? 
false : 1.147 + true; 1.148 + *last_boundclass = tbc; 1.149 + if (boundary) { 1.150 + if (bufsize >= 1) dst[0] = 0xFFFF; 1.151 + if (bufsize >= 2) dst[1] = uc; 1.152 + return 2; 1.153 + } 1.154 } 1.155 + if (bufsize >= 1) *dst = uc; 1.156 + return 1; 1.157 } 1.158 1.159 ssize_t utf8proc_decompose(uint8_t *str, ssize_t strlen, 1.160 @@ -226,6 +310,7 @@ 1.161 int32_t uc; 1.162 ssize_t rpos = 0; 1.163 ssize_t decomp_result; 1.164 + int boundclass = UTF8PROC_BOUNDCLASS_START; 1.165 while (1) { 1.166 if (options & UTF8PROC_NULLTERM) { 1.167 rpos += utf8proc_iterate(str + rpos, -1, &uc); 1.168 @@ -240,7 +325,8 @@ 1.169 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 1.170 } 1.171 decomp_result = utf8proc_decompose_char( 1.172 - uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options 1.173 + uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, 1.174 + &boundclass 1.175 ); 1.176 if (decomp_result < 0) return decomp_result; 1.177 wpos += decomp_result; 1.178 @@ -249,7 +335,7 @@ 1.179 return UTF8PROC_ERROR_OVERFLOW; 1.180 } 1.181 } 1.182 - if (bufsize >= wpos) { 1.183 + if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) { 1.184 ssize_t pos = 0; 1.185 while (pos < wpos-1) { 1.186 int32_t uc1, uc2; 1.187 @@ -416,7 +502,8 @@ 1.188 1.189 uint8_t *utf8proc_NFD(uint8_t *str) { 1.190 uint8_t *retval; 1.191 - utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE); 1.192 + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 1.193 + UTF8PROC_DECOMPOSE); 1.194 return retval; 1.195 } 1.196 1.197 @@ -430,7 +517,7 @@ 1.198 uint8_t *utf8proc_NFKD(uint8_t *str) { 1.199 uint8_t *retval; 1.200 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 1.201 - UTF8PROC_COMPAT); 1.202 + UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); 1.203 return retval; 1.204 } 1.205