utf8proc

annotate utf8proc.c @ 2:aaad485d5335

Version 0.3

- changed normalization from NFC to NFKC for postgresql unifold function
- added support to mark the beginning of a grapheme cluster with 0xFF (option: CHARBOUND)
- added the ruby method String#chars, which returns an array of UTF-8 encoded grapheme clusters
- added NLF2LF transformation in postgresql unifold function
- added the DECOMPOSE option; if you use neither COMPOSE nor DECOMPOSE, no normalization will be performed (different from previous versions)
- using integer constants rather than C-strings for character properties
- fixed (hopefully) a problem with the ruby library on Mac OS X, which occurred when compiler optimization was switched on
author jbe
date Fri Aug 04 12:00:00 2006 +0200 (2006-08-04)
parents 61a89ecc2fb9
children 4ee0d5f54af1
rev   line source
jbe@0 1 /*
jbe@0 2 * Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany
jbe@0 3 * Author: Jan Behrens <jan.behrens@flexiguided.de>
jbe@0 4 * All rights reserved.
jbe@0 5 *
jbe@0 6 * Redistribution and use in source and binary forms, with or without
jbe@0 7 * modification, are permitted provided that the following conditions are
jbe@0 8 * met:
jbe@0 9 *
jbe@0 10 * 1. Redistributions of source code must retain the above copyright
jbe@0 11 * notice, this list of conditions and the following disclaimer.
jbe@0 12 * 2. Redistributions in binary form must reproduce the above copyright
jbe@0 13 * notice, this list of conditions and the following disclaimer in the
jbe@0 14 * documentation and/or other materials provided with the distribution.
jbe@0 15 * 3. Neither the name of the FlexiGuided GmbH nor the names of its
jbe@0 16 * contributors may be used to endorse or promote products derived from
jbe@0 17 * this software without specific prior written permission.
jbe@0 18 *
jbe@0 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
jbe@0 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
jbe@0 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
jbe@0 22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
jbe@0 23 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
jbe@0 24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
jbe@0 25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
jbe@0 26 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
jbe@0 27 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
jbe@0 28 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
jbe@0 29 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
jbe@0 30 *
jbe@0 31 *
jbe@0 32 * This library contains derived data from a modified version of the
jbe@0 33 * Unicode data files.
jbe@0 34 *
jbe@0 35 * The original data files are available at
jbe@0 36 * http://www.unicode.org/Public/UNIDATA/
jbe@0 37 *
jbe@0 38 * Please notice the copyright statement in the file "utf8proc_data.c".
jbe@0 39 *
jbe@0 40 */
jbe@0 41
jbe@0 42
jbe@0 43 /*
jbe@0 44 * File name: utf8proc.c
jbe@2 45 * Version: 0.3
jbe@2 46 * Last changed: 2006-08-04
jbe@0 47 *
jbe@0 48 * Description:
jbe@0 49 * Implementation of libutf8proc.
jbe@0 50 */
jbe@0 51
jbe@0 52
jbe@0 53 #include "utf8proc.h"
jbe@0 54 #include "utf8proc_data.c"
jbe@0 55
jbe@0 56
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence. Each entry is
 * the total length in bytes of the sequence that byte introduces; an entry
 * of 0 marks a byte that may not start a sequence.
 */
const int8_t utf8proc_utf8class[256] = {
  /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 - 0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 - 0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 - 0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 - 0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 - 0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 - 0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0 - 0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 - 0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 - 0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0 - 0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
};
jbe@0 74
/* Base code points used for arithmetic Hangul (de)composition. */
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7

/* Number of entries in each Hangul range; NCOUNT = V * T, SCOUNT = L * N. */
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT \
  (UTF8PROC_HANGUL_VCOUNT * UTF8PROC_HANGUL_TCOUNT)   /* 588 */
#define UTF8PROC_HANGUL_SCOUNT \
  (UTF8PROC_HANGUL_LCOUNT * UTF8PROC_HANGUL_NCOUNT)   /* 11172 */

/*
 * Code point ranges used when classifying Hangul characters.
 * Every *_END value is exclusive.
 */
#define UTF8PROC_HANGUL_L_START  (UTF8PROC_HANGUL_LBASE)   /* 0x1100 */
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  (UTF8PROC_HANGUL_SBASE)   /* 0xAC00 */
#define UTF8PROC_HANGUL_S_END \
  (UTF8PROC_HANGUL_SBASE + UTF8PROC_HANGUL_SCOUNT)         /* 0xD7A4 */


/*
 * Boundary classes tracked between consecutive code points when the
 * UTF8PROC_CHARBOUND option is set. START is the state before any code
 * point has been seen.
 */
#define UTF8PROC_BOUNDCLASS_START    0
#define UTF8PROC_BOUNDCLASS_OTHER    1
#define UTF8PROC_BOUNDCLASS_CR       2
#define UTF8PROC_BOUNDCLASS_LF       3
#define UTF8PROC_BOUNDCLASS_CONTROL  4
#define UTF8PROC_BOUNDCLASS_EXTEND   5
#define UTF8PROC_BOUNDCLASS_L        6
#define UTF8PROC_BOUNDCLASS_V        7
#define UTF8PROC_BOUNDCLASS_T        8
#define UTF8PROC_BOUNDCLASS_LV       9
#define UTF8PROC_BOUNDCLASS_LVT     10
jbe@0 107
jbe@0 108
jbe@0 109 const char *utf8proc_errmsg(ssize_t errcode) {
jbe@0 110 switch (errcode) {
jbe@0 111 case UTF8PROC_ERROR_NOMEM:
jbe@0 112 return "Memory for processing UTF-8 data could not be allocated.";
jbe@0 113 case UTF8PROC_ERROR_OVERFLOW:
jbe@0 114 return "UTF-8 string is too long to be processed.";
jbe@0 115 case UTF8PROC_ERROR_INVALIDUTF8:
jbe@0 116 return "Invalid UTF-8 string";
jbe@0 117 case UTF8PROC_ERROR_NOTASSIGNED:
jbe@0 118 return "Unassigned Unicode code point found in UTF-8 string.";
jbe@0 119 default:
jbe@0 120 return "An unknown error occured while processing UTF-8 data.";
jbe@0 121 }
jbe@0 122 }
jbe@0 123
jbe@0 124 ssize_t utf8proc_iterate(uint8_t *str, ssize_t strlen, int32_t *dst) {
jbe@0 125 int length;
jbe@0 126 int i;
jbe@0 127 int32_t uc = -1;
jbe@0 128 *dst = -1;
jbe@0 129 if (!strlen) return 0;
jbe@0 130 length = utf8proc_utf8class[str[0]];
jbe@0 131 if (!length) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 132 if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 133 for (i=1; i<length; i++) {
jbe@0 134 if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 135 }
jbe@0 136 switch (length) {
jbe@0 137 case 1:
jbe@0 138 uc = str[0];
jbe@0 139 break;
jbe@0 140 case 2:
jbe@0 141 uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
jbe@0 142 if (uc < 0x80) uc = -1;
jbe@0 143 break;
jbe@0 144 case 3:
jbe@0 145 uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
jbe@0 146 + (str[2] & 0x3F);
jbe@0 147 if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
jbe@0 148 (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
jbe@0 149 break;
jbe@0 150 case 4:
jbe@0 151 uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
jbe@0 152 + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
jbe@0 153 if (uc < 0x10000 || uc >= 0x110000) uc = -1;
jbe@0 154 break;
jbe@0 155 }
jbe@0 156 if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 157 *dst = uc;
jbe@0 158 return length;
jbe@0 159 }
jbe@0 160
/*
 * Writes the UTF-8 encoding of code point 'uc' to 'dst'.
 *
 * 'dst' must have room for up to 4 bytes. Returns the number of bytes
 * written, or 0 if 'uc' is negative or not below 0x110000.
 *
 * The values 0xFFFF and 0xFFFE are special: they are written as the single
 * raw bytes 0xFF and 0xFE respectively (0xFFFF is the grapheme cluster
 * marker produced when UTF8PROC_CHARBOUND is in effect).
 */
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
  if (uc < 0x00 || uc >= 0x110000) {
    return 0;
  }
  if (uc < 0x80) {
    dst[0] = (uint8_t)uc;
    return 1;
  }
  if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 | (uc >> 6));
    dst[1] = (uint8_t)(0x80 | (uc & 0x3F));
    return 2;
  }
  if (uc == 0xFFFF || uc == 0xFFFE) {
    /* Low byte of the marker value is emitted verbatim: 0xFF or 0xFE. */
    dst[0] = (uint8_t)(uc & 0xFF);
    return 1;
  }
  if (uc < 0x10000) {
    dst[0] = (uint8_t)(0xE0 | (uc >> 12));
    dst[1] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[2] = (uint8_t)(0x80 | (uc & 0x3F));
    return 3;
  }
  dst[0] = (uint8_t)(0xF0 | (uc >> 18));
  dst[1] = (uint8_t)(0x80 | ((uc >> 12) & 0x3F));
  dst[2] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
  dst[3] = (uint8_t)(0x80 | (uc & 0x3F));
  return 4;
}
jbe@0 190
jbe@0 191 const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
jbe@0 192 // ASSERT: uc >= 0 && uc < 0x110000
jbe@0 193 return utf8proc_properties + (
jbe@0 194 utf8proc_stage2table[
jbe@0 195 utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
jbe@0 196 ]
jbe@0 197 );
jbe@0 198 }
jbe@0 199
jbe@0 200 ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
jbe@2 201 int options, int *last_boundclass) {
jbe@0 202 // ASSERT: uc >= 0 && uc < 0x110000
jbe@0 203 const utf8proc_property_t *property;
jbe@0 204 int32_t hangul_sindex;
jbe@0 205 property = utf8proc_get_property(uc);
jbe@0 206 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
jbe@2 207 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) &&
jbe@2 208 hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
jbe@0 209 int32_t hangul_tindex;
jbe@0 210 if (bufsize >= 1) {
jbe@0 211 dst[0] = UTF8PROC_HANGUL_LBASE +
jbe@0 212 hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
jbe@0 213 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
jbe@0 214 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
jbe@0 215 }
jbe@0 216 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
jbe@0 217 if (!hangul_tindex) return 2;
jbe@0 218 if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
jbe@0 219 return 3;
jbe@0 220 } else if ((options & UTF8PROC_REJECTNA) && !property->category) {
jbe@0 221 return UTF8PROC_ERROR_NOTASSIGNED;
jbe@0 222 } else if ((options & UTF8PROC_IGNORE) && property->ignorable) {
jbe@0 223 return 0;
jbe@0 224 } else if ((options & UTF8PROC_CASEFOLD) && property->casefold_mapping) {
jbe@0 225 const int32_t *casefold_entry;
jbe@0 226 ssize_t written = 0;
jbe@0 227 for (casefold_entry = property->casefold_mapping;
jbe@0 228 *casefold_entry >= 0; casefold_entry++) {
jbe@0 229 written += utf8proc_decompose_char(*casefold_entry, dst+written,
jbe@2 230 (bufsize > written) ? (bufsize - written) : 0, options,
jbe@2 231 last_boundclass);
jbe@0 232 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@0 233 }
jbe@0 234 return written;
jbe@2 235 } else if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) &&
jbe@2 236 property->decomp_mapping &&
jbe@0 237 (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
jbe@0 238 const int32_t *decomp_entry;
jbe@0 239 ssize_t written = 0;
jbe@0 240 for (decomp_entry = property->decomp_mapping;
jbe@0 241 *decomp_entry >= 0; decomp_entry++) {
jbe@0 242 written += utf8proc_decompose_char(*decomp_entry, dst+written,
jbe@2 243 (bufsize > written) ? (bufsize - written) : 0, options,
jbe@2 244 last_boundclass);
jbe@0 245 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@0 246 }
jbe@0 247 return written;
jbe@2 248 } else if (options & UTF8PROC_CHARBOUND) {
jbe@2 249 bool boundary;
jbe@2 250 int tbc, lbc;
jbe@2 251 int category;
jbe@2 252 category = property->category;
jbe@2 253 tbc =
jbe@2 254 (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
jbe@2 255 (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
jbe@2 256 ((category == UTF8PROC_CATEGORY_ZL ||
jbe@2 257 category == UTF8PROC_CATEGORY_ZP ||
jbe@2 258 category == UTF8PROC_CATEGORY_CC ||
jbe@2 259 category == UTF8PROC_CATEGORY_CF) &&
jbe@2 260 !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
jbe@2 261 property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
jbe@2 262 ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
jbe@2 263 uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
jbe@2 264 (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
jbe@2 265 UTF8PROC_BOUNDCLASS_V :
jbe@2 266 (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
jbe@2 267 UTF8PROC_BOUNDCLASS_T :
jbe@2 268 (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
jbe@2 269 ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
jbe@2 270 UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
jbe@2 271 ) :
jbe@2 272 UTF8PROC_BOUNDCLASS_OTHER;
jbe@2 273 lbc = *last_boundclass;
jbe@2 274 boundary =
jbe@2 275 (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
jbe@2 276 (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
jbe@2 277 (lbc == UTF8PROC_BOUNDCLASS_CR &&
jbe@2 278 tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
jbe@2 279 (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
jbe@2 280 (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
jbe@2 281 (lbc == UTF8PROC_BOUNDCLASS_L &&
jbe@2 282 (tbc == UTF8PROC_BOUNDCLASS_L ||
jbe@2 283 tbc == UTF8PROC_BOUNDCLASS_V ||
jbe@2 284 tbc == UTF8PROC_BOUNDCLASS_LV ||
jbe@2 285 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
jbe@2 286 ((lbc == UTF8PROC_BOUNDCLASS_LV ||
jbe@2 287 lbc == UTF8PROC_BOUNDCLASS_V) &&
jbe@2 288 (tbc == UTF8PROC_BOUNDCLASS_V ||
jbe@2 289 tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
jbe@2 290 ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
jbe@2 291 lbc == UTF8PROC_BOUNDCLASS_T) &&
jbe@2 292 tbc == UTF8PROC_BOUNDCLASS_T) ? false :
jbe@2 293 true;
jbe@2 294 *last_boundclass = tbc;
jbe@2 295 if (boundary) {
jbe@2 296 if (bufsize >= 1) dst[0] = 0xFFFF;
jbe@2 297 if (bufsize >= 2) dst[1] = uc;
jbe@2 298 return 2;
jbe@2 299 }
jbe@0 300 }
jbe@2 301 if (bufsize >= 1) *dst = uc;
jbe@2 302 return 1;
jbe@0 303 }
jbe@0 304
jbe@0 305 ssize_t utf8proc_decompose(uint8_t *str, ssize_t strlen,
jbe@0 306 int32_t *buffer, ssize_t bufsize, int options) {
jbe@0 307 // strlen will be ignored, if UTF8PROC_NULLTERM is set in options
jbe@0 308 ssize_t wpos = 0;
jbe@0 309 {
jbe@0 310 int32_t uc;
jbe@0 311 ssize_t rpos = 0;
jbe@0 312 ssize_t decomp_result;
jbe@2 313 int boundclass = UTF8PROC_BOUNDCLASS_START;
jbe@0 314 while (1) {
jbe@0 315 if (options & UTF8PROC_NULLTERM) {
jbe@0 316 rpos += utf8proc_iterate(str + rpos, -1, &uc);
jbe@0 317 // checking of return value is not neccessary,
jbe@0 318 // as 'uc' is < 0 in case of error
jbe@0 319 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 320 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@0 321 if (uc == 0) break;
jbe@0 322 } else {
jbe@0 323 if (rpos >= strlen) break;
jbe@0 324 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
jbe@0 325 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 326 }
jbe@0 327 decomp_result = utf8proc_decompose_char(
jbe@2 328 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
jbe@2 329 &boundclass
jbe@0 330 );
jbe@0 331 if (decomp_result < 0) return decomp_result;
jbe@0 332 wpos += decomp_result;
jbe@0 333 // prohibiting integer overflows due to too long strings:
jbe@0 334 if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)
jbe@0 335 return UTF8PROC_ERROR_OVERFLOW;
jbe@0 336 }
jbe@0 337 }
jbe@2 338 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
jbe@0 339 ssize_t pos = 0;
jbe@0 340 while (pos < wpos-1) {
jbe@0 341 int32_t uc1, uc2;
jbe@0 342 const utf8proc_property_t *property1, *property2;
jbe@0 343 uc1 = buffer[pos];
jbe@0 344 uc2 = buffer[pos+1];
jbe@0 345 property1 = utf8proc_get_property(uc1);
jbe@0 346 property2 = utf8proc_get_property(uc2);
jbe@0 347 if (property1->combining_class > property2->combining_class &&
jbe@0 348 property2->combining_class > 0) {
jbe@0 349 buffer[pos] = uc2;
jbe@0 350 buffer[pos+1] = uc1;
jbe@0 351 if (pos > 0) pos--; else pos++;
jbe@0 352 } else {
jbe@0 353 pos++;
jbe@0 354 }
jbe@0 355 }
jbe@0 356 }
jbe@0 357 return wpos;
jbe@0 358 }
jbe@0 359
jbe@0 360 ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
jbe@0 361 // UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
jbe@0 362 // ASSERT: 'buffer' has one spare byte of free space at the end!
jbe@0 363 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
jbe@0 364 ssize_t rpos;
jbe@0 365 ssize_t wpos = 0;
jbe@0 366 int32_t uc;
jbe@0 367 for (rpos = 0; rpos < length; rpos++) {
jbe@0 368 uc = buffer[rpos];
jbe@0 369 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
jbe@0 370 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
jbe@0 371 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
jbe@0 372 if (options & UTF8PROC_NLF2LS) {
jbe@0 373 if (options & UTF8PROC_NLF2PS) {
jbe@0 374 buffer[wpos++] = 0x000A;
jbe@0 375 } else {
jbe@0 376 buffer[wpos++] = 0x2028;
jbe@0 377 }
jbe@0 378 } else {
jbe@0 379 if (options & UTF8PROC_NLF2PS) {
jbe@0 380 buffer[wpos++] = 0x2029;
jbe@0 381 } else {
jbe@0 382 buffer[wpos++] = 0x0020;
jbe@0 383 }
jbe@0 384 }
jbe@0 385 } else if ((options & UTF8PROC_STRIPCC) &&
jbe@0 386 (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
jbe@0 387 if (uc == 0x0009) buffer[wpos++] = 0x0020;
jbe@0 388 } else {
jbe@0 389 buffer[wpos++] = uc;
jbe@0 390 }
jbe@0 391 }
jbe@0 392 length = wpos;
jbe@0 393 }
jbe@0 394 if (options & UTF8PROC_COMPOSE) {
jbe@0 395 int32_t *starter = NULL;
jbe@0 396 int32_t current_char;
jbe@0 397 const utf8proc_property_t *starter_property = NULL, *current_property;
jbe@0 398 int16_t max_combining_class = -1;
jbe@0 399 ssize_t rpos;
jbe@0 400 ssize_t wpos = 0;
jbe@0 401 int32_t composition;
jbe@0 402 for (rpos = 0; rpos < length; rpos++) {
jbe@0 403 current_char = buffer[rpos];
jbe@0 404 current_property = utf8proc_get_property(current_char);
jbe@0 405 if (starter && current_property->combining_class > max_combining_class) {
jbe@0 406 // combination perhaps possible
jbe@0 407 int32_t hangul_lindex;
jbe@0 408 int32_t hangul_sindex;
jbe@0 409 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
jbe@0 410 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
jbe@0 411 int32_t hangul_vindex;
jbe@0 412 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
jbe@0 413 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
jbe@0 414 *starter = UTF8PROC_HANGUL_SBASE +
jbe@0 415 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
jbe@0 416 UTF8PROC_HANGUL_TCOUNT;
jbe@0 417 starter_property = NULL;
jbe@0 418 continue;
jbe@0 419 }
jbe@0 420 }
jbe@0 421 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
jbe@0 422 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
jbe@0 423 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
jbe@0 424 int32_t hangul_tindex;
jbe@0 425 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
jbe@0 426 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
jbe@0 427 *starter += hangul_tindex;
jbe@0 428 starter_property = NULL;
jbe@0 429 continue;
jbe@0 430 }
jbe@0 431 }
jbe@0 432 if (!starter_property) {
jbe@0 433 starter_property = utf8proc_get_property(*starter);
jbe@0 434 }
jbe@0 435 if (starter_property->comb1st_index >= 0 &&
jbe@0 436 current_property->comb2nd_index >= 0) {
jbe@0 437 composition = utf8proc_combinations[
jbe@0 438 starter_property->comb1st_index +
jbe@0 439 current_property->comb2nd_index
jbe@0 440 ];
jbe@0 441 if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
jbe@0 442 !(utf8proc_get_property(composition)->comp_exclusion))) {
jbe@0 443 *starter = composition;
jbe@0 444 starter_property = NULL;
jbe@0 445 continue;
jbe@0 446 }
jbe@0 447 }
jbe@0 448 }
jbe@0 449 buffer[wpos] = current_char;
jbe@0 450 if (current_property->combining_class) {
jbe@0 451 if (current_property->combining_class > max_combining_class) {
jbe@0 452 max_combining_class = current_property->combining_class;
jbe@0 453 }
jbe@0 454 } else {
jbe@0 455 starter = buffer + wpos;
jbe@0 456 starter_property = NULL;
jbe@0 457 max_combining_class = -1;
jbe@0 458 }
jbe@0 459 wpos++;
jbe@0 460 }
jbe@0 461 length = wpos;
jbe@0 462 }
jbe@0 463 {
jbe@0 464 ssize_t rpos, wpos = 0;
jbe@0 465 int32_t uc;
jbe@0 466 for (rpos = 0; rpos < length; rpos++) {
jbe@0 467 uc = buffer[rpos];
jbe@0 468 wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);
jbe@0 469 }
jbe@0 470 ((uint8_t *)buffer)[wpos] = 0;
jbe@0 471 return wpos;
jbe@0 472 }
jbe@0 473 }
jbe@0 474
jbe@0 475 ssize_t utf8proc_map(uint8_t *str, ssize_t strlen, uint8_t **dstptr,
jbe@0 476 int options) {
jbe@0 477 int32_t *buffer;
jbe@0 478 ssize_t result;
jbe@0 479 *dstptr = NULL;
jbe@0 480 result = utf8proc_decompose(str, strlen, NULL, 0, options);
jbe@0 481 if (result < 0) return result;
jbe@0 482 buffer = malloc(result * sizeof(int32_t) + 1);
jbe@0 483 if (!buffer) return UTF8PROC_ERROR_NOMEM;
jbe@0 484 result = utf8proc_decompose(str, strlen, buffer, result, options);
jbe@0 485 if (result < 0) {
jbe@0 486 free(buffer);
jbe@0 487 return result;
jbe@0 488 }
jbe@0 489 result = utf8proc_reencode(buffer, result, options);
jbe@0 490 if (result < 0) {
jbe@0 491 free(buffer);
jbe@0 492 return result;
jbe@0 493 }
jbe@0 494 {
jbe@0 495 int32_t *newptr;
jbe@0 496 newptr = realloc(buffer, result+1);
jbe@0 497 if (newptr) buffer = newptr;
jbe@0 498 }
jbe@0 499 *dstptr = (uint8_t *)buffer;
jbe@0 500 return result;
jbe@0 501 }
jbe@0 502
jbe@0 503 uint8_t *utf8proc_NFD(uint8_t *str) {
jbe@0 504 uint8_t *retval;
jbe@2 505 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@2 506 UTF8PROC_DECOMPOSE);
jbe@0 507 return retval;
jbe@0 508 }
jbe@0 509
jbe@0 510 uint8_t *utf8proc_NFC(uint8_t *str) {
jbe@0 511 uint8_t *retval;
jbe@0 512 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@0 513 UTF8PROC_COMPOSE);
jbe@0 514 return retval;
jbe@0 515 }
jbe@0 516
jbe@0 517 uint8_t *utf8proc_NFKD(uint8_t *str) {
jbe@0 518 uint8_t *retval;
jbe@0 519 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@2 520 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
jbe@0 521 return retval;
jbe@0 522 }
jbe@0 523
jbe@0 524 uint8_t *utf8proc_NFKC(uint8_t *str) {
jbe@0 525 uint8_t *retval;
jbe@0 526 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@0 527 UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
jbe@0 528 return retval;
jbe@0 529 }
jbe@0 530
jbe@0 531

Impressum / About Us