rev |
line source |
jbe@0
|
1 /*
|
jbe@10
|
2 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
jbe@0
|
3 *
|
jbe@7
|
4 * Permission is hereby granted, free of charge, to any person obtaining a
|
jbe@7
|
5 * copy of this software and associated documentation files (the "Software"),
|
jbe@7
|
6 * to deal in the Software without restriction, including without limitation
|
jbe@7
|
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
jbe@7
|
8 * and/or sell copies of the Software, and to permit persons to whom the
|
jbe@7
|
9 * Software is furnished to do so, subject to the following conditions:
|
jbe@0
|
10 *
|
jbe@7
|
11 * The above copyright notice and this permission notice shall be included in
|
jbe@7
|
12 * all copies or substantial portions of the Software.
|
jbe@0
|
13 *
|
jbe@7
|
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
jbe@7
|
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
jbe@7
|
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
jbe@7
|
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
jbe@7
|
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
jbe@7
|
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
jbe@7
|
20 * DEALINGS IN THE SOFTWARE.
|
jbe@7
|
21 */
|
jbe@7
|
22
|
jbe@7
|
23 /*
|
jbe@0
|
24 * This library contains derived data from a modified version of the
|
jbe@0
|
25 * Unicode data files.
|
jbe@0
|
26 *
|
jbe@0
|
27 * The original data files are available at
|
jbe@0
|
28 * http://www.unicode.org/Public/UNIDATA/
|
jbe@0
|
29 *
|
jbe@0
|
30 * Please notice the copyright statement in the file "utf8proc_data.c".
|
jbe@0
|
31 */
|
jbe@0
|
32
|
jbe@0
|
33
|
jbe@0
|
34 /*
|
jbe@0
|
35 * File name: utf8proc.c
|
jbe@0
|
36 *
|
jbe@0
|
37 * Description:
|
jbe@0
|
38 * Implementation of libutf8proc.
|
jbe@0
|
39 */
|
jbe@0
|
40
|
jbe@0
|
41
|
jbe@0
|
42 #include "utf8proc.h"
|
jbe@0
|
43 #include "utf8proc_data.c"
|
jbe@0
|
44
|
jbe@0
|
45
|
Jiahao@15
|
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence.
 *
 * Each entry is the total length in bytes (1 to 4) of the sequence that the
 * byte introduces, or 0 if the byte can not start a sequence (the
 * continuation bytes 0x80..0xBF and the bytes 0xF8..0xFF).
 */
DLLEXPORT const int8_t utf8proc_utf8class[256] = {
  /* 0x00..0x7F: single byte sequences */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80..0xBF: continuation bytes, never a valid first byte */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0..0xDF: two byte sequences */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0..0xEF: three byte sequences */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0..0xF7: four byte sequences, 0xF8..0xFF: invalid */
  4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
|
jbe@0
|
63
|
jbe@0
|
/*
 * Constants for the arithmetic (de)composition of Hangul syllables.
 * The *BASE values are the first code points of the syllable block and of
 * the leading (L), vowel (V) and trailing (T) jamo used in that arithmetic;
 * the *COUNT values are the sizes of the corresponding index ranges
 * (NCOUNT = VCOUNT * TCOUNT, SCOUNT = LCOUNT * NCOUNT).
 */
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT 588
#define UTF8PROC_HANGUL_SCOUNT 11172
/*
 * Code point ranges used to classify Hangul characters when character
 * boundaries are determined (UTF8PROC_CHARBOUND).
 * END is exclusive
 */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
|
jbe@2
|
83
|
jbe@2
|
84
|
jbe@2
|
/*
 * Character classes used to decide whether a boundary lies between two
 * consecutive code points (UTF8PROC_CHARBOUND). UTF8PROC_BOUNDCLASS_START
 * is the state before the first code point has been seen.
 */
#define UTF8PROC_BOUNDCLASS_START    0
#define UTF8PROC_BOUNDCLASS_OTHER    1
#define UTF8PROC_BOUNDCLASS_CR       2
#define UTF8PROC_BOUNDCLASS_LF       3
#define UTF8PROC_BOUNDCLASS_CONTROL  4
#define UTF8PROC_BOUNDCLASS_EXTEND   5
#define UTF8PROC_BOUNDCLASS_L        6
#define UTF8PROC_BOUNDCLASS_V        7
#define UTF8PROC_BOUNDCLASS_T        8
#define UTF8PROC_BOUNDCLASS_LV       9
#define UTF8PROC_BOUNDCLASS_LVT     10
|
jbe@0
|
96
|
jbe@0
|
97
|
Jiahao@15
|
/*
 * Returns the version of the library as a static string; the caller must
 * not modify or free it.
 */
DLLEXPORT const char *utf8proc_version(void) {
  return "1.1.6";
}
|
jbe@9
|
101
|
Jiahao@15
|
102 DLLEXPORT const char *utf8proc_errmsg(ssize_t errcode) {
|
jbe@0
|
103 switch (errcode) {
|
jbe@0
|
104 case UTF8PROC_ERROR_NOMEM:
|
jbe@0
|
105 return "Memory for processing UTF-8 data could not be allocated.";
|
jbe@0
|
106 case UTF8PROC_ERROR_OVERFLOW:
|
jbe@0
|
107 return "UTF-8 string is too long to be processed.";
|
jbe@0
|
108 case UTF8PROC_ERROR_INVALIDUTF8:
|
jbe@0
|
109 return "Invalid UTF-8 string";
|
jbe@0
|
110 case UTF8PROC_ERROR_NOTASSIGNED:
|
jbe@0
|
111 return "Unassigned Unicode code point found in UTF-8 string.";
|
jbe@3
|
112 case UTF8PROC_ERROR_INVALIDOPTS:
|
jbe@3
|
113 return "Invalid options for UTF-8 processing chosen.";
|
jbe@0
|
114 default:
|
Jiahao@15
|
115 return "An unknown error occurred while processing UTF-8 data.";
|
jbe@0
|
116 }
|
jbe@0
|
117 }
|
jbe@0
|
118
|
Jiahao@15
|
119 DLLEXPORT ssize_t utf8proc_iterate(
|
jbe@7
|
120 const uint8_t *str, ssize_t strlen, int32_t *dst
|
jbe@7
|
121 ) {
|
jbe@0
|
122 int length;
|
jbe@0
|
123 int i;
|
jbe@0
|
124 int32_t uc = -1;
|
jbe@0
|
125 *dst = -1;
|
jbe@0
|
126 if (!strlen) return 0;
|
jbe@0
|
127 length = utf8proc_utf8class[str[0]];
|
jbe@0
|
128 if (!length) return UTF8PROC_ERROR_INVALIDUTF8;
|
jbe@0
|
129 if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8;
|
jbe@0
|
130 for (i=1; i<length; i++) {
|
jbe@0
|
131 if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;
|
jbe@0
|
132 }
|
jbe@0
|
133 switch (length) {
|
jbe@0
|
134 case 1:
|
jbe@0
|
135 uc = str[0];
|
jbe@0
|
136 break;
|
jbe@0
|
137 case 2:
|
jbe@0
|
138 uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
|
jbe@0
|
139 if (uc < 0x80) uc = -1;
|
jbe@0
|
140 break;
|
jbe@0
|
141 case 3:
|
jbe@0
|
142 uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
|
jbe@0
|
143 + (str[2] & 0x3F);
|
jbe@0
|
144 if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
|
jbe@0
|
145 (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
|
jbe@0
|
146 break;
|
jbe@0
|
147 case 4:
|
jbe@0
|
148 uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
|
jbe@0
|
149 + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
|
jbe@0
|
150 if (uc < 0x10000 || uc >= 0x110000) uc = -1;
|
jbe@0
|
151 break;
|
jbe@0
|
152 }
|
jbe@7
|
153 if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
|
jbe@7
|
154 return UTF8PROC_ERROR_INVALIDUTF8;
|
jbe@0
|
155 *dst = uc;
|
jbe@0
|
156 return length;
|
jbe@0
|
157 }
|
jbe@0
|
158
|
Jiahao@15
|
/*
 * Checks whether 'uc' is a code point which the library accepts.
 *
 * Returns false for negative values, values of 0x110000 and above, the
 * surrogate range 0xD800..0xDFFF, the range 0xFDD0..0xFDEF and every code
 * point whose lower 16 bits are 0xFFFE or 0xFFFF; true otherwise.
 */
DLLEXPORT bool utf8proc_codepoint_valid(int32_t uc) {
  return !(uc < 0 || uc >= 0x110000 ||
    ((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) ||
    (uc >= 0xFDD0 && uc < 0xFDF0));
}
|
jbe@7
|
165
|
Jiahao@15
|
/*
 * Encodes the code point 'uc' as UTF-8 and stores the bytes at 'dst',
 * which must provide room for up to 4 bytes.
 *
 * Returns the number of bytes written (1 to 4), or 0 if 'uc' is negative
 * or not below 0x110000; nothing is written in that case.
 *
 * The values 0xFFFE and 0xFFFF are not encoded as UTF-8 but written as the
 * single bytes 0xFE and 0xFF. 0xFFFF is the marker which
 * utf8proc_decompose_char() emits in front of a character boundary when
 * UTF8PROC_CHARBOUND is used. No further validation takes place; use
 * utf8proc_codepoint_valid() beforehand if that is needed.
 */
DLLEXPORT ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
  if (uc < 0x00) {
    return 0;
  } else if (uc < 0x80) {
    dst[0] = (uint8_t)uc;
    return 1;
  } else if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 + (uc >> 6));
    dst[1] = (uint8_t)(0x80 + (uc & 0x3F));
    return 2;
  } else if (uc == 0xFFFF) {
    /* special marker, see above */
    dst[0] = 0xFF;
    return 1;
  } else if (uc == 0xFFFE) {
    /* special marker, see above */
    dst[0] = 0xFE;
    return 1;
  } else if (uc < 0x10000) {
    dst[0] = (uint8_t)(0xE0 + (uc >> 12));
    dst[1] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
    dst[2] = (uint8_t)(0x80 + (uc & 0x3F));
    return 3;
  } else if (uc < 0x110000) {
    dst[0] = (uint8_t)(0xF0 + (uc >> 18));
    dst[1] = (uint8_t)(0x80 + ((uc >> 12) & 0x3F));
    dst[2] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
    dst[3] = (uint8_t)(0x80 + (uc & 0x3F));
    return 4;
  } else return 0;
}
|
jbe@0
|
195
|
Jiahao@15
|
196 DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
|
jbe@10
|
197 /* ASSERT: uc >= 0 && uc < 0x110000 */
|
jbe@0
|
198 return utf8proc_properties + (
|
jbe@0
|
199 utf8proc_stage2table[
|
jbe@0
|
200 utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
|
jbe@0
|
201 ]
|
jbe@0
|
202 );
|
jbe@0
|
203 }
|
jbe@0
|
204
|
jbe@3
|
/*
 * Helper for utf8proc_decompose_char() only: replaces the current code
 * point by 'replacement_uc' and processes the replacement with the
 * UTF8PROC_LUMP option cleared, so it is not lumped a second time.
 * Note that the macro expands to a 'return' statement and relies on the
 * variables 'dst', 'bufsize', 'options' and 'last_boundclass' of the
 * function it is used in.
 */
#define utf8proc_decompose_lump(replacement_uc) \
  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
  options & ~UTF8PROC_LUMP, last_boundclass)
|
jbe@3
|
208
|
Jiahao@15
|
209 DLLEXPORT ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
|
jbe@2
|
210 int options, int *last_boundclass) {
|
jbe@10
|
211 /* ASSERT: uc >= 0 && uc < 0x110000 */
|
jbe@0
|
212 const utf8proc_property_t *property;
|
jbe@3
|
213 utf8proc_propval_t category;
|
jbe@0
|
214 int32_t hangul_sindex;
|
jbe@0
|
215 property = utf8proc_get_property(uc);
|
jbe@3
|
216 category = property->category;
|
jbe@0
|
217 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
|
jbe@3
|
218 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
jbe@3
|
219 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
|
jbe@3
|
220 int32_t hangul_tindex;
|
jbe@3
|
221 if (bufsize >= 1) {
|
jbe@3
|
222 dst[0] = UTF8PROC_HANGUL_LBASE +
|
jbe@3
|
223 hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
|
jbe@3
|
224 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
|
jbe@3
|
225 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
|
jbe@3
|
226 }
|
jbe@3
|
227 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
|
jbe@3
|
228 if (!hangul_tindex) return 2;
|
jbe@3
|
229 if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
|
jbe@3
|
230 return 3;
|
jbe@0
|
231 }
|
jbe@3
|
232 }
|
jbe@3
|
233 if (options & UTF8PROC_REJECTNA) {
|
jbe@3
|
234 if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
|
jbe@3
|
235 }
|
jbe@3
|
236 if (options & UTF8PROC_IGNORE) {
|
jbe@3
|
237 if (property->ignorable) return 0;
|
jbe@3
|
238 }
|
jbe@3
|
239 if (options & UTF8PROC_LUMP) {
|
jbe@3
|
240 if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
|
jbe@3
|
241 if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
|
jbe@3
|
242 utf8proc_decompose_lump(0x0027);
|
jbe@3
|
243 if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
|
jbe@3
|
244 utf8proc_decompose_lump(0x002D);
|
jbe@3
|
245 if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
|
jbe@3
|
246 if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
|
jbe@3
|
247 if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
|
jbe@3
|
248 utf8proc_decompose_lump(0x003C);
|
jbe@3
|
249 if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
|
jbe@3
|
250 utf8proc_decompose_lump(0x003E);
|
jbe@3
|
251 if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
|
jbe@3
|
252 if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
|
jbe@3
|
253 utf8proc_decompose_lump(0x005E);
|
jbe@3
|
254 if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
|
jbe@3
|
255 utf8proc_decompose_lump(0x005F);
|
jbe@3
|
256 if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
|
jbe@3
|
257 if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
|
jbe@3
|
258 if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
|
jbe@3
|
259 if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
|
jbe@3
|
260 if (category == UTF8PROC_CATEGORY_ZL ||
|
jbe@3
|
261 category == UTF8PROC_CATEGORY_ZP)
|
jbe@3
|
262 utf8proc_decompose_lump(0x000A);
|
jbe@3
|
263 }
|
jbe@3
|
264 }
|
jbe@3
|
265 if (options & UTF8PROC_STRIPMARK) {
|
jbe@3
|
266 if (category == UTF8PROC_CATEGORY_MN ||
|
jbe@3
|
267 category == UTF8PROC_CATEGORY_MC ||
|
jbe@3
|
268 category == UTF8PROC_CATEGORY_ME) return 0;
|
jbe@3
|
269 }
|
jbe@3
|
270 if (options & UTF8PROC_CASEFOLD) {
|
jbe@3
|
271 if (property->casefold_mapping) {
|
jbe@3
|
272 const int32_t *casefold_entry;
|
jbe@3
|
273 ssize_t written = 0;
|
jbe@3
|
274 for (casefold_entry = property->casefold_mapping;
|
jbe@3
|
275 *casefold_entry >= 0; casefold_entry++) {
|
jbe@3
|
276 written += utf8proc_decompose_char(*casefold_entry, dst+written,
|
jbe@3
|
277 (bufsize > written) ? (bufsize - written) : 0, options,
|
jbe@3
|
278 last_boundclass);
|
jbe@3
|
279 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
|
jbe@3
|
280 }
|
jbe@3
|
281 return written;
|
jbe@3
|
282 }
|
jbe@3
|
283 }
|
jbe@3
|
284 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
jbe@3
|
285 if (property->decomp_mapping &&
|
jbe@3
|
286 (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
|
jbe@3
|
287 const int32_t *decomp_entry;
|
jbe@3
|
288 ssize_t written = 0;
|
jbe@3
|
289 for (decomp_entry = property->decomp_mapping;
|
jbe@3
|
290 *decomp_entry >= 0; decomp_entry++) {
|
jbe@3
|
291 written += utf8proc_decompose_char(*decomp_entry, dst+written,
|
jbe@3
|
292 (bufsize > written) ? (bufsize - written) : 0, options,
|
jbe@2
|
293 last_boundclass);
|
jbe@3
|
294 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
|
jbe@3
|
295 }
|
jbe@3
|
296 return written;
|
jbe@0
|
297 }
|
jbe@3
|
298 }
|
jbe@3
|
299 if (options & UTF8PROC_CHARBOUND) {
|
jbe@2
|
300 bool boundary;
|
jbe@2
|
301 int tbc, lbc;
|
jbe@2
|
302 tbc =
|
jbe@2
|
303 (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
|
jbe@2
|
304 (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
|
jbe@2
|
305 ((category == UTF8PROC_CATEGORY_ZL ||
|
jbe@2
|
306 category == UTF8PROC_CATEGORY_ZP ||
|
jbe@2
|
307 category == UTF8PROC_CATEGORY_CC ||
|
jbe@2
|
308 category == UTF8PROC_CATEGORY_CF) &&
|
jbe@2
|
309 !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
|
jbe@2
|
310 property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
|
jbe@2
|
311 ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
|
jbe@2
|
312 uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
|
jbe@2
|
313 (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
|
jbe@2
|
314 UTF8PROC_BOUNDCLASS_V :
|
jbe@2
|
315 (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
|
jbe@2
|
316 UTF8PROC_BOUNDCLASS_T :
|
jbe@2
|
317 (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
|
jbe@2
|
318 ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
|
jbe@2
|
319 UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
|
jbe@2
|
320 ) :
|
jbe@2
|
321 UTF8PROC_BOUNDCLASS_OTHER;
|
jbe@2
|
322 lbc = *last_boundclass;
|
jbe@2
|
323 boundary =
|
jbe@2
|
324 (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
|
jbe@2
|
325 (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
|
jbe@2
|
326 (lbc == UTF8PROC_BOUNDCLASS_CR &&
|
jbe@2
|
327 tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
|
jbe@2
|
328 (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
jbe@2
|
329 (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
jbe@2
|
330 (lbc == UTF8PROC_BOUNDCLASS_L &&
|
jbe@2
|
331 (tbc == UTF8PROC_BOUNDCLASS_L ||
|
jbe@2
|
332 tbc == UTF8PROC_BOUNDCLASS_V ||
|
jbe@2
|
333 tbc == UTF8PROC_BOUNDCLASS_LV ||
|
jbe@2
|
334 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
|
jbe@2
|
335 ((lbc == UTF8PROC_BOUNDCLASS_LV ||
|
jbe@2
|
336 lbc == UTF8PROC_BOUNDCLASS_V) &&
|
jbe@2
|
337 (tbc == UTF8PROC_BOUNDCLASS_V ||
|
jbe@2
|
338 tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
|
jbe@2
|
339 ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
|
jbe@2
|
340 lbc == UTF8PROC_BOUNDCLASS_T) &&
|
jbe@2
|
341 tbc == UTF8PROC_BOUNDCLASS_T) ? false :
|
jbe@2
|
342 true;
|
jbe@2
|
343 *last_boundclass = tbc;
|
jbe@2
|
344 if (boundary) {
|
jbe@2
|
345 if (bufsize >= 1) dst[0] = 0xFFFF;
|
jbe@2
|
346 if (bufsize >= 2) dst[1] = uc;
|
jbe@2
|
347 return 2;
|
jbe@2
|
348 }
|
jbe@0
|
349 }
|
jbe@2
|
350 if (bufsize >= 1) *dst = uc;
|
jbe@2
|
351 return 1;
|
jbe@0
|
352 }
|
jbe@0
|
353
|
Jiahao@15
|
354 DLLEXPORT ssize_t utf8proc_decompose(
|
jbe@7
|
355 const uint8_t *str, ssize_t strlen,
|
jbe@7
|
356 int32_t *buffer, ssize_t bufsize, int options
|
jbe@7
|
357 ) {
|
jbe@10
|
358 /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
|
jbe@0
|
359 ssize_t wpos = 0;
|
jbe@3
|
360 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
|
jbe@3
|
361 return UTF8PROC_ERROR_INVALIDOPTS;
|
jbe@3
|
362 if ((options & UTF8PROC_STRIPMARK) &&
|
jbe@3
|
363 !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
|
jbe@3
|
364 return UTF8PROC_ERROR_INVALIDOPTS;
|
jbe@0
|
365 {
|
jbe@0
|
366 int32_t uc;
|
jbe@0
|
367 ssize_t rpos = 0;
|
jbe@0
|
368 ssize_t decomp_result;
|
jbe@2
|
369 int boundclass = UTF8PROC_BOUNDCLASS_START;
|
jbe@0
|
370 while (1) {
|
jbe@0
|
371 if (options & UTF8PROC_NULLTERM) {
|
jbe@0
|
372 rpos += utf8proc_iterate(str + rpos, -1, &uc);
|
Jiahao@15
|
373 /* checking of return value is not necessary,
|
jbe@10
|
374 as 'uc' is < 0 in case of error */
|
jbe@0
|
375 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
|
jbe@0
|
376 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
|
jbe@0
|
377 if (uc == 0) break;
|
jbe@0
|
378 } else {
|
jbe@0
|
379 if (rpos >= strlen) break;
|
jbe@0
|
380 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
|
jbe@0
|
381 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
|
jbe@0
|
382 }
|
jbe@0
|
383 decomp_result = utf8proc_decompose_char(
|
jbe@2
|
384 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
|
jbe@2
|
385 &boundclass
|
jbe@0
|
386 );
|
jbe@0
|
387 if (decomp_result < 0) return decomp_result;
|
jbe@0
|
388 wpos += decomp_result;
|
jbe@10
|
389 /* prohibiting integer overflows due to too long strings: */
|
jbe@0
|
390 if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)
|
jbe@0
|
391 return UTF8PROC_ERROR_OVERFLOW;
|
jbe@0
|
392 }
|
jbe@0
|
393 }
|
jbe@2
|
394 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
|
jbe@0
|
395 ssize_t pos = 0;
|
jbe@0
|
396 while (pos < wpos-1) {
|
jbe@0
|
397 int32_t uc1, uc2;
|
jbe@0
|
398 const utf8proc_property_t *property1, *property2;
|
jbe@0
|
399 uc1 = buffer[pos];
|
jbe@0
|
400 uc2 = buffer[pos+1];
|
jbe@0
|
401 property1 = utf8proc_get_property(uc1);
|
jbe@0
|
402 property2 = utf8proc_get_property(uc2);
|
jbe@0
|
403 if (property1->combining_class > property2->combining_class &&
|
jbe@0
|
404 property2->combining_class > 0) {
|
jbe@0
|
405 buffer[pos] = uc2;
|
jbe@0
|
406 buffer[pos+1] = uc1;
|
jbe@0
|
407 if (pos > 0) pos--; else pos++;
|
jbe@0
|
408 } else {
|
jbe@0
|
409 pos++;
|
jbe@0
|
410 }
|
jbe@0
|
411 }
|
jbe@0
|
412 }
|
jbe@0
|
413 return wpos;
|
jbe@0
|
414 }
|
jbe@0
|
415
|
Jiahao@15
|
416 DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
|
jbe@10
|
417 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
|
jbe@10
|
418 ASSERT: 'buffer' has one spare byte of free space at the end! */
|
jbe@0
|
419 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
|
jbe@0
|
420 ssize_t rpos;
|
jbe@0
|
421 ssize_t wpos = 0;
|
jbe@0
|
422 int32_t uc;
|
jbe@0
|
423 for (rpos = 0; rpos < length; rpos++) {
|
jbe@0
|
424 uc = buffer[rpos];
|
jbe@0
|
425 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
|
jbe@0
|
426 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
|
jbe@0
|
427 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
|
jbe@0
|
428 if (options & UTF8PROC_NLF2LS) {
|
jbe@0
|
429 if (options & UTF8PROC_NLF2PS) {
|
jbe@0
|
430 buffer[wpos++] = 0x000A;
|
jbe@0
|
431 } else {
|
jbe@0
|
432 buffer[wpos++] = 0x2028;
|
jbe@0
|
433 }
|
jbe@0
|
434 } else {
|
jbe@0
|
435 if (options & UTF8PROC_NLF2PS) {
|
jbe@0
|
436 buffer[wpos++] = 0x2029;
|
jbe@0
|
437 } else {
|
jbe@0
|
438 buffer[wpos++] = 0x0020;
|
jbe@0
|
439 }
|
jbe@0
|
440 }
|
jbe@0
|
441 } else if ((options & UTF8PROC_STRIPCC) &&
|
jbe@0
|
442 (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
|
jbe@0
|
443 if (uc == 0x0009) buffer[wpos++] = 0x0020;
|
jbe@0
|
444 } else {
|
jbe@0
|
445 buffer[wpos++] = uc;
|
jbe@0
|
446 }
|
jbe@0
|
447 }
|
jbe@0
|
448 length = wpos;
|
jbe@0
|
449 }
|
jbe@0
|
450 if (options & UTF8PROC_COMPOSE) {
|
jbe@0
|
451 int32_t *starter = NULL;
|
jbe@0
|
452 int32_t current_char;
|
jbe@0
|
453 const utf8proc_property_t *starter_property = NULL, *current_property;
|
jbe@3
|
454 utf8proc_propval_t max_combining_class = -1;
|
jbe@0
|
455 ssize_t rpos;
|
jbe@0
|
456 ssize_t wpos = 0;
|
jbe@0
|
457 int32_t composition;
|
jbe@0
|
458 for (rpos = 0; rpos < length; rpos++) {
|
jbe@0
|
459 current_char = buffer[rpos];
|
jbe@0
|
460 current_property = utf8proc_get_property(current_char);
|
jbe@0
|
461 if (starter && current_property->combining_class > max_combining_class) {
|
jbe@10
|
462 /* combination perhaps possible */
|
jbe@0
|
463 int32_t hangul_lindex;
|
jbe@0
|
464 int32_t hangul_sindex;
|
jbe@0
|
465 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
|
jbe@0
|
466 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
|
jbe@0
|
467 int32_t hangul_vindex;
|
jbe@0
|
468 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
|
jbe@0
|
469 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
|
jbe@0
|
470 *starter = UTF8PROC_HANGUL_SBASE +
|
jbe@0
|
471 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
|
jbe@0
|
472 UTF8PROC_HANGUL_TCOUNT;
|
jbe@0
|
473 starter_property = NULL;
|
jbe@0
|
474 continue;
|
jbe@0
|
475 }
|
jbe@0
|
476 }
|
jbe@0
|
477 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
|
jbe@0
|
478 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
|
jbe@0
|
479 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
|
jbe@0
|
480 int32_t hangul_tindex;
|
jbe@0
|
481 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
|
jbe@0
|
482 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
|
jbe@0
|
483 *starter += hangul_tindex;
|
jbe@0
|
484 starter_property = NULL;
|
jbe@0
|
485 continue;
|
jbe@0
|
486 }
|
jbe@0
|
487 }
|
jbe@0
|
488 if (!starter_property) {
|
jbe@0
|
489 starter_property = utf8proc_get_property(*starter);
|
jbe@0
|
490 }
|
jbe@0
|
491 if (starter_property->comb1st_index >= 0 &&
|
jbe@0
|
492 current_property->comb2nd_index >= 0) {
|
jbe@0
|
493 composition = utf8proc_combinations[
|
jbe@0
|
494 starter_property->comb1st_index +
|
jbe@0
|
495 current_property->comb2nd_index
|
jbe@0
|
496 ];
|
jbe@0
|
497 if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
|
jbe@0
|
498 !(utf8proc_get_property(composition)->comp_exclusion))) {
|
jbe@0
|
499 *starter = composition;
|
jbe@0
|
500 starter_property = NULL;
|
jbe@0
|
501 continue;
|
jbe@0
|
502 }
|
jbe@0
|
503 }
|
jbe@0
|
504 }
|
jbe@0
|
505 buffer[wpos] = current_char;
|
jbe@0
|
506 if (current_property->combining_class) {
|
jbe@0
|
507 if (current_property->combining_class > max_combining_class) {
|
jbe@0
|
508 max_combining_class = current_property->combining_class;
|
jbe@0
|
509 }
|
jbe@0
|
510 } else {
|
jbe@0
|
511 starter = buffer + wpos;
|
jbe@0
|
512 starter_property = NULL;
|
jbe@0
|
513 max_combining_class = -1;
|
jbe@0
|
514 }
|
jbe@0
|
515 wpos++;
|
jbe@0
|
516 }
|
jbe@0
|
517 length = wpos;
|
jbe@0
|
518 }
|
jbe@0
|
519 {
|
jbe@0
|
520 ssize_t rpos, wpos = 0;
|
jbe@0
|
521 int32_t uc;
|
jbe@0
|
522 for (rpos = 0; rpos < length; rpos++) {
|
jbe@0
|
523 uc = buffer[rpos];
|
jbe@0
|
524 wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);
|
jbe@0
|
525 }
|
jbe@0
|
526 ((uint8_t *)buffer)[wpos] = 0;
|
jbe@0
|
527 return wpos;
|
jbe@0
|
528 }
|
jbe@0
|
529 }
|
jbe@0
|
530
|
Jiahao@15
|
531 DLLEXPORT ssize_t utf8proc_map(
|
jbe@7
|
532 const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
|
jbe@7
|
533 ) {
|
jbe@0
|
534 int32_t *buffer;
|
jbe@0
|
535 ssize_t result;
|
jbe@0
|
536 *dstptr = NULL;
|
jbe@0
|
537 result = utf8proc_decompose(str, strlen, NULL, 0, options);
|
jbe@0
|
538 if (result < 0) return result;
|
Jiahao@15
|
539 buffer = (int32_t *) malloc(result * sizeof(int32_t) + 1);
|
jbe@0
|
540 if (!buffer) return UTF8PROC_ERROR_NOMEM;
|
jbe@0
|
541 result = utf8proc_decompose(str, strlen, buffer, result, options);
|
jbe@0
|
542 if (result < 0) {
|
jbe@0
|
543 free(buffer);
|
jbe@0
|
544 return result;
|
jbe@0
|
545 }
|
jbe@0
|
546 result = utf8proc_reencode(buffer, result, options);
|
jbe@0
|
547 if (result < 0) {
|
jbe@0
|
548 free(buffer);
|
jbe@0
|
549 return result;
|
jbe@0
|
550 }
|
jbe@0
|
551 {
|
jbe@0
|
552 int32_t *newptr;
|
Jiahao@15
|
553 newptr = (int32_t *) realloc(buffer, (size_t)result+1);
|
jbe@0
|
554 if (newptr) buffer = newptr;
|
jbe@0
|
555 }
|
jbe@0
|
556 *dstptr = (uint8_t *)buffer;
|
jbe@0
|
557 return result;
|
jbe@0
|
558 }
|
jbe@0
|
559
|
Jiahao@15
|
560 DLLEXPORT uint8_t *utf8proc_NFD(const uint8_t *str) {
|
jbe@0
|
561 uint8_t *retval;
|
jbe@2
|
562 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
jbe@2
|
563 UTF8PROC_DECOMPOSE);
|
jbe@0
|
564 return retval;
|
jbe@0
|
565 }
|
jbe@0
|
566
|
Jiahao@15
|
567 DLLEXPORT uint8_t *utf8proc_NFC(const uint8_t *str) {
|
jbe@0
|
568 uint8_t *retval;
|
jbe@0
|
569 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
jbe@0
|
570 UTF8PROC_COMPOSE);
|
jbe@0
|
571 return retval;
|
jbe@0
|
572 }
|
jbe@0
|
573
|
Jiahao@15
|
574 DLLEXPORT uint8_t *utf8proc_NFKD(const uint8_t *str) {
|
jbe@0
|
575 uint8_t *retval;
|
jbe@0
|
576 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
jbe@2
|
577 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
|
jbe@0
|
578 return retval;
|
jbe@0
|
579 }
|
jbe@0
|
580
|
Jiahao@15
|
581 DLLEXPORT uint8_t *utf8proc_NFKC(const uint8_t *str) {
|
jbe@0
|
582 uint8_t *retval;
|
jbe@0
|
583 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
jbe@0
|
584 UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
jbe@0
|
585 return retval;
|
jbe@0
|
586 }
|
jbe@0
|
587
|