utf8proc
diff utf8proc.c @ 3:4ee0d5f54af1
Version 1.0
- added the LUMP option, which lumps certain similar characters together (see lump.txt); this option is also used for the PostgreSQL "unifold" function
- added the STRIPMARK option, which strips mark characters (Unicode categories Mn, Mc and Me), including the marks of composed characters
- deprecated the Ruby method String#char_ary in favour of String#utf8chars
- added the LUMP option, which lumps certain similar characters together (see lump.txt); this option is also used for the PostgreSQL "unifold" function
- added the STRIPMARK option, which strips mark characters (Unicode categories Mn, Mc and Me), including the marks of composed characters
- deprecated the Ruby method String#char_ary in favour of String#utf8chars
| author | jbe |
|---|---|
| date | Sun Sep 17 12:00:00 2006 +0200 (2006-09-17) |
| parents | aaad485d5335 |
| children | fcfd8c836c64 |
line diff
1.1 --- a/utf8proc.c Fri Aug 04 12:00:00 2006 +0200 1.2 +++ b/utf8proc.c Sun Sep 17 12:00:00 2006 +0200 1.3 @@ -42,8 +42,8 @@ 1.4 1.5 /* 1.6 * File name: utf8proc.c 1.7 - * Version: 0.3 1.8 - * Last changed: 2006-08-04 1.9 + * Version: 1.0 1.10 + * Last changed: 2006-09-17 1.11 * 1.12 * Description: 1.13 * Implementation of libutf8proc. 1.14 @@ -116,6 +116,8 @@ 1.15 return "Invalid UTF-8 string"; 1.16 case UTF8PROC_ERROR_NOTASSIGNED: 1.17 return "Unassigned Unicode code point found in UTF-8 string."; 1.18 + case UTF8PROC_ERROR_INVALIDOPTS: 1.19 + return "Invalid options for UTF-8 processing chosen."; 1.20 default: 1.21 return "An unknown error occured while processing UTF-8 data."; 1.22 } 1.23 @@ -197,59 +199,103 @@ 1.24 ); 1.25 } 1.26 1.27 +#define utf8proc_decompose_lump(replacement_uc) \ 1.28 + return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ 1.29 + options & ~UTF8PROC_LUMP, last_boundclass) 1.30 + 1.31 ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, 1.32 int options, int *last_boundclass) { 1.33 // ASSERT: uc >= 0 && uc < 0x110000 1.34 const utf8proc_property_t *property; 1.35 + utf8proc_propval_t category; 1.36 int32_t hangul_sindex; 1.37 property = utf8proc_get_property(uc); 1.38 + category = property->category; 1.39 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; 1.40 - if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && 1.41 - hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { 1.42 - int32_t hangul_tindex; 1.43 - if (bufsize >= 1) { 1.44 - dst[0] = UTF8PROC_HANGUL_LBASE + 1.45 - hangul_sindex / UTF8PROC_HANGUL_NCOUNT; 1.46 - if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + 1.47 - (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; 1.48 + if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { 1.49 + if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { 1.50 + int32_t hangul_tindex; 1.51 + if (bufsize >= 1) { 1.52 + dst[0] = UTF8PROC_HANGUL_LBASE + 1.53 + hangul_sindex / 
UTF8PROC_HANGUL_NCOUNT; 1.54 + if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + 1.55 + (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; 1.56 + } 1.57 + hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; 1.58 + if (!hangul_tindex) return 2; 1.59 + if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; 1.60 + return 3; 1.61 } 1.62 - hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; 1.63 - if (!hangul_tindex) return 2; 1.64 - if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; 1.65 - return 3; 1.66 - } else if ((options & UTF8PROC_REJECTNA) && !property->category) { 1.67 - return UTF8PROC_ERROR_NOTASSIGNED; 1.68 - } else if ((options & UTF8PROC_IGNORE) && property->ignorable) { 1.69 - return 0; 1.70 - } else if ((options & UTF8PROC_CASEFOLD) && property->casefold_mapping) { 1.71 - const int32_t *casefold_entry; 1.72 - ssize_t written = 0; 1.73 - for (casefold_entry = property->casefold_mapping; 1.74 - *casefold_entry >= 0; casefold_entry++) { 1.75 - written += utf8proc_decompose_char(*casefold_entry, dst+written, 1.76 - (bufsize > written) ? 
(bufsize - written) : 0, options, 1.77 + } 1.78 + if (options & UTF8PROC_REJECTNA) { 1.79 + if (!category) return UTF8PROC_ERROR_NOTASSIGNED; 1.80 + } 1.81 + if (options & UTF8PROC_IGNORE) { 1.82 + if (property->ignorable) return 0; 1.83 + } 1.84 + if (options & UTF8PROC_LUMP) { 1.85 + if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); 1.86 + if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) 1.87 + utf8proc_decompose_lump(0x0027); 1.88 + if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212) 1.89 + utf8proc_decompose_lump(0x002D); 1.90 + if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F); 1.91 + if (uc == 0x2236) utf8proc_decompose_lump(0x003A); 1.92 + if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008) 1.93 + utf8proc_decompose_lump(0x003C); 1.94 + if (uc == 0x203A || uc == 0x232A || uc == 0x3009) 1.95 + utf8proc_decompose_lump(0x003E); 1.96 + if (uc == 0x2216) utf8proc_decompose_lump(0x005C); 1.97 + if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303) 1.98 + utf8proc_decompose_lump(0x005E); 1.99 + if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD) 1.100 + utf8proc_decompose_lump(0x005F); 1.101 + if (uc == 0x02CB) utf8proc_decompose_lump(0x0060); 1.102 + if (uc == 0x2223) utf8proc_decompose_lump(0x007C); 1.103 + if (uc == 0x223C) utf8proc_decompose_lump(0x007E); 1.104 + if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) { 1.105 + if (category == UTF8PROC_CATEGORY_ZL || 1.106 + category == UTF8PROC_CATEGORY_ZP) 1.107 + utf8proc_decompose_lump(0x000A); 1.108 + } 1.109 + } 1.110 + if (options & UTF8PROC_STRIPMARK) { 1.111 + if (category == UTF8PROC_CATEGORY_MN || 1.112 + category == UTF8PROC_CATEGORY_MC || 1.113 + category == UTF8PROC_CATEGORY_ME) return 0; 1.114 + } 1.115 + if (options & UTF8PROC_CASEFOLD) { 1.116 + if (property->casefold_mapping) { 1.117 + const int32_t *casefold_entry; 1.118 + ssize_t written = 0; 1.119 + for (casefold_entry = property->casefold_mapping; 1.120 + 
*casefold_entry >= 0; casefold_entry++) { 1.121 + written += utf8proc_decompose_char(*casefold_entry, dst+written, 1.122 + (bufsize > written) ? (bufsize - written) : 0, options, 1.123 + last_boundclass); 1.124 + if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 1.125 + } 1.126 + return written; 1.127 + } 1.128 + } 1.129 + if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { 1.130 + if (property->decomp_mapping && 1.131 + (!property->decomp_type || (options & UTF8PROC_COMPAT))) { 1.132 + const int32_t *decomp_entry; 1.133 + ssize_t written = 0; 1.134 + for (decomp_entry = property->decomp_mapping; 1.135 + *decomp_entry >= 0; decomp_entry++) { 1.136 + written += utf8proc_decompose_char(*decomp_entry, dst+written, 1.137 + (bufsize > written) ? (bufsize - written) : 0, options, 1.138 last_boundclass); 1.139 - if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 1.140 + if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 1.141 + } 1.142 + return written; 1.143 } 1.144 - return written; 1.145 - } else if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && 1.146 - property->decomp_mapping && 1.147 - (!property->decomp_type || (options & UTF8PROC_COMPAT))) { 1.148 - const int32_t *decomp_entry; 1.149 - ssize_t written = 0; 1.150 - for (decomp_entry = property->decomp_mapping; 1.151 - *decomp_entry >= 0; decomp_entry++) { 1.152 - written += utf8proc_decompose_char(*decomp_entry, dst+written, 1.153 - (bufsize > written) ? (bufsize - written) : 0, options, 1.154 - last_boundclass); 1.155 - if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 1.156 - } 1.157 - return written; 1.158 - } else if (options & UTF8PROC_CHARBOUND) { 1.159 + } 1.160 + if (options & UTF8PROC_CHARBOUND) { 1.161 bool boundary; 1.162 int tbc, lbc; 1.163 - int category; 1.164 - category = property->category; 1.165 tbc = 1.166 (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR : 1.167 (uc == 0x000A) ? 
UTF8PROC_BOUNDCLASS_LF : 1.168 @@ -306,6 +352,11 @@ 1.169 int32_t *buffer, ssize_t bufsize, int options) { 1.170 // strlen will be ignored, if UTF8PROC_NULLTERM is set in options 1.171 ssize_t wpos = 0; 1.172 + if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) 1.173 + return UTF8PROC_ERROR_INVALIDOPTS; 1.174 + if ((options & UTF8PROC_STRIPMARK) && 1.175 + !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) 1.176 + return UTF8PROC_ERROR_INVALIDOPTS; 1.177 { 1.178 int32_t uc; 1.179 ssize_t rpos = 0; 1.180 @@ -395,7 +446,7 @@ 1.181 int32_t *starter = NULL; 1.182 int32_t current_char; 1.183 const utf8proc_property_t *starter_property = NULL, *current_property; 1.184 - int16_t max_combining_class = -1; 1.185 + utf8proc_propval_t max_combining_class = -1; 1.186 ssize_t rpos; 1.187 ssize_t wpos = 0; 1.188 int32_t composition;