utf8proc: aaad485d5335 utf8proc.c

utf8proc

view utf8proc.c @ 2:aaad485d5335

Version 0.3

- changed normalization from NFC to NFKC for postgresql unifold function
- added support to mark the beginning of a grapheme cluster with 0xFF (option: CHARBOUND)
- added the ruby method String#chars, which returns an array of UTF-8 encoded grapheme clusters
- added NLF2LF transformation in postgresql unifold function
- added the DECOMPOSE option; if you use neither COMPOSE nor DECOMPOSE, no normalization will be performed (different from previous versions)
- using integer constants rather than C-strings for character properties
- fixed (hopefully) a problem with the ruby library on Mac OS X, which occurred when compiler optimization was switched on

author	jbe
date	Fri Aug 04 12:00:00 2006 +0200 (2006-08-04)
parents	61a89ecc2fb9
children	4ee0d5f54af1

line source

1 /*

3 * Author: Jan Behrens <jan.behrens@flexiguided.de>

5 *

6 * Redistribution and use in source and binary forms, with or without

7 * modification, are permitted provided that the following conditions are

8 * met:

9 *

10 * 1. Redistributions of source code must retain the above copyright

11 * notice, this list of conditions and the following disclaimer.

12 * 2. Redistributions in binary form must reproduce the above copyright

13 * notice, this list of conditions and the following disclaimer in the

14 * documentation and/or other materials provided with the distribution.

15 * 3. Neither the name of the FlexiGuided GmbH nor the names of its

16 * contributors may be used to endorse or promote products derived from

17 * this software without specific prior written permission.

18 *

19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A

22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER

23 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

26 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

27 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

28 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

29 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

30 *

31 *

32 * This library contains derived data from a modified version of the

33 * Unicode data files.

34 *

35 * The original data files are available at

36 * http://www.unicode.org/Public/UNIDATA/

37 *

38 * Please notice the copyright statement in the file "utf8proc_data.c".

39 *

40 */

43 /*

44 * File name: utf8proc.c

45 * Version: 0.3

46 * Last changed: 2006-08-04

47 *

48 * Description:

49 * Implementation of libutf8proc.

50 */

53 #include "utf8proc.h"

54 #include "utf8proc_data.c"

/*
 * Lookup table indexed by the first byte of a UTF-8 sequence. Each entry is
 * the total number of bytes in the sequence that the byte introduces; an
 * entry of 0 marks a byte that utf8proc_iterate rejects as a lead byte.
 */
const int8_t utf8proc_utf8class[256] = {
  /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 - 0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 - 0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 - 0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 - 0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 - 0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 - 0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0 - 0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 - 0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 - 0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0 - 0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
};

/* Base code points used for arithmetic Hangul syllable (de)composition. */
#define UTF8PROC_HANGUL_SBASE    0xAC00
#define UTF8PROC_HANGUL_LBASE    0x1100
#define UTF8PROC_HANGUL_VBASE    0x1161
#define UTF8PROC_HANGUL_TBASE    0x11A7

/* Number of L, V and T values, and the counts derived from them. */
#define UTF8PROC_HANGUL_LCOUNT   19
#define UTF8PROC_HANGUL_VCOUNT   21
#define UTF8PROC_HANGUL_TCOUNT   28
#define UTF8PROC_HANGUL_NCOUNT   (UTF8PROC_HANGUL_VCOUNT * UTF8PROC_HANGUL_TCOUNT)  /* 588 */
#define UTF8PROC_HANGUL_SCOUNT   (UTF8PROC_HANGUL_LCOUNT * UTF8PROC_HANGUL_NCOUNT)  /* 11172 */

/*
 * Code point ranges used when classifying characters for grapheme cluster
 * boundaries. Every *_START is inclusive, every *_END is exclusive.
 */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4

/*
 * Classes assigned to code points while looking for grapheme cluster
 * boundaries. UTF8PROC_BOUNDCLASS_START is the state before any character
 * has been processed.
 */
enum {
  UTF8PROC_BOUNDCLASS_START   = 0,
  UTF8PROC_BOUNDCLASS_OTHER   = 1,
  UTF8PROC_BOUNDCLASS_CR      = 2,
  UTF8PROC_BOUNDCLASS_LF      = 3,
  UTF8PROC_BOUNDCLASS_CONTROL = 4,
  UTF8PROC_BOUNDCLASS_EXTEND  = 5,
  UTF8PROC_BOUNDCLASS_L       = 6,
  UTF8PROC_BOUNDCLASS_V       = 7,
  UTF8PROC_BOUNDCLASS_T       = 8,
  UTF8PROC_BOUNDCLASS_LV      = 9,
  UTF8PROC_BOUNDCLASS_LVT     = 10
};

107

108

109 const char *utf8proc_errmsg(ssize_t errcode) {

110 switch (errcode) {

111 case UTF8PROC_ERROR_NOMEM:

112 return "Memory for processing UTF-8 data could not be allocated.";

113 case UTF8PROC_ERROR_OVERFLOW:

114 return "UTF-8 string is too long to be processed.";

115 case UTF8PROC_ERROR_INVALIDUTF8:

116 return "Invalid UTF-8 string";

117 case UTF8PROC_ERROR_NOTASSIGNED:

118 return "Unassigned Unicode code point found in UTF-8 string.";

119 default:

120 return "An unknown error occured while processing UTF-8 data.";

121 }

122 }

123

124 ssize_t utf8proc_iterate(uint8_t *str, ssize_t strlen, int32_t *dst) {

125 int length;

126 int i;

127 int32_t uc = -1;

128 *dst = -1;

129 if (!strlen) return 0;

130 length = utf8proc_utf8class[str[0]];

131 if (!length) return UTF8PROC_ERROR_INVALIDUTF8;

132 if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8;

133 for (i=1; i<length; i++) {

134 if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;

135 }

136 switch (length) {

137 case 1:

138 uc = str[0];

139 break;

140 case 2:

141 uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);

142 if (uc < 0x80) uc = -1;

143 break;

144 case 3:

145 uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)

146 + (str[2] & 0x3F);

147 if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||

148 (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;

149 break;

150 case 4:

151 uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)

152 + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);

153 if (uc < 0x10000 || uc >= 0x110000) uc = -1;

154 break;

155 }

156 if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) return UTF8PROC_ERROR_INVALIDUTF8;

157 *dst = uc;

158 return length;

159 }

160

/*
 * Writes the UTF-8 encoding of 'uc' to 'dst'.
 *
 * uc:  code point to encode. The values 0xFFFF and 0xFFFE are special: they
 *      are emitted as the single bytes 0xFF and 0xFE respectively.
 * dst: output buffer; up to 4 bytes are written.
 *
 * Returns the number of bytes written, or 0 (writing nothing) when 'uc' is
 * negative or not below 0x110000.
 */
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
  if (uc < 0x00 || uc >= 0x110000) return 0;

  if (uc < 0x80) {
    dst[0] = uc;
    return 1;
  }
  if (uc < 0x800) {
    dst[0] = 0xC0 | (uc >> 6);
    dst[1] = 0x80 | (uc & 0x3F);
    return 2;
  }
  if (uc == 0xFFFF || uc == 0xFFFE) {
    // single-byte markers instead of a regular three-byte sequence
    dst[0] = (uc == 0xFFFF) ? 0xFF : 0xFE;
    return 1;
  }
  if (uc < 0x10000) {
    dst[0] = 0xE0 | (uc >> 12);
    dst[1] = 0x80 | ((uc >> 6) & 0x3F);
    dst[2] = 0x80 | (uc & 0x3F);
    return 3;
  }
  dst[0] = 0xF0 | (uc >> 18);
  dst[1] = 0x80 | ((uc >> 12) & 0x3F);
  dst[2] = 0x80 | ((uc >> 6) & 0x3F);
  dst[3] = 0x80 | (uc & 0x3F);
  return 4;
}

190

191 const utf8proc_property_t *utf8proc_get_property(int32_t uc) {

192 // ASSERT: uc >= 0 && uc < 0x110000

193 return utf8proc_properties + (

194 utf8proc_stage2table[

195 utf8proc_stage1table[uc >> 8] + (uc & 0xFF)

196 ]

197 );

198 }

199

200 ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,

201 int options, int *last_boundclass) {

202 // ASSERT: uc >= 0 && uc < 0x110000

203 const utf8proc_property_t *property;

204 int32_t hangul_sindex;

205 property = utf8proc_get_property(uc);

206 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;

207 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) &&

208 hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {

209 int32_t hangul_tindex;

210 if (bufsize >= 1) {

211 dst[0] = UTF8PROC_HANGUL_LBASE +

212 hangul_sindex / UTF8PROC_HANGUL_NCOUNT;

213 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +

214 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;

215 }

216 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;

217 if (!hangul_tindex) return 2;

218 if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;

219 return 3;

220 } else if ((options & UTF8PROC_REJECTNA) && !property->category) {

221 return UTF8PROC_ERROR_NOTASSIGNED;

222 } else if ((options & UTF8PROC_IGNORE) && property->ignorable) {

223 return 0;

224 } else if ((options & UTF8PROC_CASEFOLD) && property->casefold_mapping) {

225 const int32_t *casefold_entry;

226 ssize_t written = 0;

227 for (casefold_entry = property->casefold_mapping;

228 *casefold_entry >= 0; casefold_entry++) {

229 written += utf8proc_decompose_char(*casefold_entry, dst+written,

230 (bufsize > written) ? (bufsize - written) : 0, options,

231 last_boundclass);

232 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;

233 }

234 return written;

235 } else if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) &&

236 property->decomp_mapping &&

237 (!property->decomp_type || (options & UTF8PROC_COMPAT))) {

238 const int32_t *decomp_entry;

239 ssize_t written = 0;

240 for (decomp_entry = property->decomp_mapping;

241 *decomp_entry >= 0; decomp_entry++) {

242 written += utf8proc_decompose_char(*decomp_entry, dst+written,

243 (bufsize > written) ? (bufsize - written) : 0, options,

244 last_boundclass);

245 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;

246 }

247 return written;

248 } else if (options & UTF8PROC_CHARBOUND) {

249 bool boundary;

250 int tbc, lbc;

251 int category;

252 category = property->category;

253 tbc =

254 (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :

255 (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :

256 ((category == UTF8PROC_CATEGORY_ZL ||

257 category == UTF8PROC_CATEGORY_ZP ||

258 category == UTF8PROC_CATEGORY_CC ||

259 category == UTF8PROC_CATEGORY_CF) &&

260 !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :

261 property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :

262 ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||

263 uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :

264 (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?

265 UTF8PROC_BOUNDCLASS_V :

266 (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?

267 UTF8PROC_BOUNDCLASS_T :

268 (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (

269 ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?

270 UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT

271 ) :

272 UTF8PROC_BOUNDCLASS_OTHER;

273 lbc = *last_boundclass;

274 boundary =

275 (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :

276 (lbc == UTF8PROC_BOUNDCLASS_START) ? true :

277 (lbc == UTF8PROC_BOUNDCLASS_CR &&

278 tbc == UTF8PROC_BOUNDCLASS_LF) ? false :

279 (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :

280 (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :

281 (lbc == UTF8PROC_BOUNDCLASS_L &&

282 (tbc == UTF8PROC_BOUNDCLASS_L ||

283 tbc == UTF8PROC_BOUNDCLASS_V ||

284 tbc == UTF8PROC_BOUNDCLASS_LV ||

285 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :

286 ((lbc == UTF8PROC_BOUNDCLASS_LV ||

287 lbc == UTF8PROC_BOUNDCLASS_V) &&

288 (tbc == UTF8PROC_BOUNDCLASS_V ||

289 tbc == UTF8PROC_BOUNDCLASS_T)) ? false :

290 ((lbc == UTF8PROC_BOUNDCLASS_LVT ||

291 lbc == UTF8PROC_BOUNDCLASS_T) &&

292 tbc == UTF8PROC_BOUNDCLASS_T) ? false :

293 true;

294 *last_boundclass = tbc;

295 if (boundary) {

296 if (bufsize >= 1) dst[0] = 0xFFFF;

297 if (bufsize >= 2) dst[1] = uc;

298 return 2;

299 }

300 }

301 if (bufsize >= 1) *dst = uc;

302 return 1;

303 }

304

305 ssize_t utf8proc_decompose(uint8_t *str, ssize_t strlen,

306 int32_t *buffer, ssize_t bufsize, int options) {

307 // strlen will be ignored, if UTF8PROC_NULLTERM is set in options

308 ssize_t wpos = 0;

309 {

310 int32_t uc;

311 ssize_t rpos = 0;

312 ssize_t decomp_result;

313 int boundclass = UTF8PROC_BOUNDCLASS_START;

314 while (1) {

315 if (options & UTF8PROC_NULLTERM) {

316 rpos += utf8proc_iterate(str + rpos, -1, &uc);

317 // checking of return value is not neccessary,

318 // as 'uc' is < 0 in case of error

319 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;

320 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;

321 if (uc == 0) break;

322 } else {

323 if (rpos >= strlen) break;

324 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);

325 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;

326 }

327 decomp_result = utf8proc_decompose_char(

328 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,

329 &boundclass

330 );

331 if (decomp_result < 0) return decomp_result;

332 wpos += decomp_result;

333 // prohibiting integer overflows due to too long strings:

334 if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)

335 return UTF8PROC_ERROR_OVERFLOW;

336 }

337 }

338 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {

339 ssize_t pos = 0;

340 while (pos < wpos-1) {

341 int32_t uc1, uc2;

342 const utf8proc_property_t *property1, *property2;

343 uc1 = buffer[pos];

344 uc2 = buffer[pos+1];

345 property1 = utf8proc_get_property(uc1);

346 property2 = utf8proc_get_property(uc2);

347 if (property1->combining_class > property2->combining_class &&

348 property2->combining_class > 0) {

349 buffer[pos] = uc2;

350 buffer[pos+1] = uc1;

351 if (pos > 0) pos--; else pos++;

352 } else {

353 pos++;

354 }

355 }

356 }

357 return wpos;

358 }

359

360 ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {

361 // UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored

362 // ASSERT: 'buffer' has one spare byte of free space at the end!

363 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {

364 ssize_t rpos;

365 ssize_t wpos = 0;

366 int32_t uc;

367 for (rpos = 0; rpos < length; rpos++) {

368 uc = buffer[rpos];

369 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;

370 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||

371 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {

372 if (options & UTF8PROC_NLF2LS) {

373 if (options & UTF8PROC_NLF2PS) {

374 buffer[wpos++] = 0x000A;

375 } else {

376 buffer[wpos++] = 0x2028;

377 }

378 } else {

379 if (options & UTF8PROC_NLF2PS) {

380 buffer[wpos++] = 0x2029;

381 } else {

382 buffer[wpos++] = 0x0020;

383 }

384 }

385 } else if ((options & UTF8PROC_STRIPCC) &&

386 (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {

387 if (uc == 0x0009) buffer[wpos++] = 0x0020;

388 } else {

389 buffer[wpos++] = uc;

390 }

391 }

392 length = wpos;

393 }

394 if (options & UTF8PROC_COMPOSE) {

395 int32_t *starter = NULL;

396 int32_t current_char;

397 const utf8proc_property_t *starter_property = NULL, *current_property;

398 int16_t max_combining_class = -1;

399 ssize_t rpos;

400 ssize_t wpos = 0;

401 int32_t composition;

402 for (rpos = 0; rpos < length; rpos++) {

403 current_char = buffer[rpos];

404 current_property = utf8proc_get_property(current_char);

405 if (starter && current_property->combining_class > max_combining_class) {

406 // combination perhaps possible

407 int32_t hangul_lindex;

408 int32_t hangul_sindex;

409 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;

410 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {

411 int32_t hangul_vindex;

412 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;

413 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {

414 *starter = UTF8PROC_HANGUL_SBASE +

415 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *

416 UTF8PROC_HANGUL_TCOUNT;

417 starter_property = NULL;

418 continue;

419 }

420 }

421 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;

422 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&

423 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {

424 int32_t hangul_tindex;

425 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;

426 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {

427 *starter += hangul_tindex;

428 starter_property = NULL;

429 continue;

430 }

431 }

432 if (!starter_property) {

433 starter_property = utf8proc_get_property(*starter);

434 }

435 if (starter_property->comb1st_index >= 0 &&

436 current_property->comb2nd_index >= 0) {

437 composition = utf8proc_combinations[

438 starter_property->comb1st_index +

439 current_property->comb2nd_index

440 ];

441 if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||

442 !(utf8proc_get_property(composition)->comp_exclusion))) {

443 *starter = composition;

444 starter_property = NULL;

445 continue;

446 }

447 }

448 }

449 buffer[wpos] = current_char;

450 if (current_property->combining_class) {

451 if (current_property->combining_class > max_combining_class) {

452 max_combining_class = current_property->combining_class;

453 }

454 } else {

455 starter = buffer + wpos;

456 starter_property = NULL;

457 max_combining_class = -1;

458 }

459 wpos++;

460 }

461 length = wpos;

462 }

463 {

464 ssize_t rpos, wpos = 0;

465 int32_t uc;

466 for (rpos = 0; rpos < length; rpos++) {

467 uc = buffer[rpos];

468 wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);

469 }

470 ((uint8_t *)buffer)[wpos] = 0;

471 return wpos;

472 }

473 }

474

475 ssize_t utf8proc_map(uint8_t *str, ssize_t strlen, uint8_t **dstptr,

476 int options) {

477 int32_t *buffer;

478 ssize_t result;

479 *dstptr = NULL;

480 result = utf8proc_decompose(str, strlen, NULL, 0, options);

481 if (result < 0) return result;

482 buffer = malloc(result * sizeof(int32_t) + 1);

483 if (!buffer) return UTF8PROC_ERROR_NOMEM;

484 result = utf8proc_decompose(str, strlen, buffer, result, options);

485 if (result < 0) {

486 free(buffer);

487 return result;

488 }

489 result = utf8proc_reencode(buffer, result, options);

490 if (result < 0) {

491 free(buffer);

492 return result;

493 }

494 {

495 int32_t *newptr;

496 newptr = realloc(buffer, result+1);

497 if (newptr) buffer = newptr;

498 }

499 *dstptr = (uint8_t *)buffer;

500 return result;

501 }

502

503 uint8_t *utf8proc_NFD(uint8_t *str) {

504 uint8_t *retval;

505 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |

506 UTF8PROC_DECOMPOSE);

507 return retval;

508 }

509

510 uint8_t *utf8proc_NFC(uint8_t *str) {

511 uint8_t *retval;

512 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |

513 UTF8PROC_COMPOSE);

514 return retval;

515 }

516

517 uint8_t *utf8proc_NFKD(uint8_t *str) {

518 uint8_t *retval;

519 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |

520 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);

521 return retval;

522 }

523

524 uint8_t *utf8proc_NFKC(uint8_t *str) {

525 uint8_t *retval;

526 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |

527 UTF8PROC_COMPOSE | UTF8PROC_COMPAT);

528 return retval;

529 }

530

531