utf8proc

annotate utf8proc.c @ 14:d0bab6ca89a5

Version 1.1.6

- PostgreSQL 9.2 and 9.3 compatibility (lowercase 'c' language name)
author jbe
date Wed Nov 27 12:00:00 2013 +0100 (2013-11-27)
parents 2c7384f1fac1
children 15450ff3d454
rev   line source
jbe@0 1 /*
jbe@10 2 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
jbe@0 3 *
jbe@7 4 * Permission is hereby granted, free of charge, to any person obtaining a
jbe@7 5 * copy of this software and associated documentation files (the "Software"),
jbe@7 6 * to deal in the Software without restriction, including without limitation
jbe@7 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
jbe@7 8 * and/or sell copies of the Software, and to permit persons to whom the
jbe@7 9 * Software is furnished to do so, subject to the following conditions:
jbe@0 10 *
jbe@7 11 * The above copyright notice and this permission notice shall be included in
jbe@7 12 * all copies or substantial portions of the Software.
jbe@0 13 *
jbe@7 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jbe@7 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jbe@7 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
jbe@7 17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jbe@7 18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jbe@7 19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jbe@7 20 * DEALINGS IN THE SOFTWARE.
jbe@7 21 */
jbe@7 22
jbe@7 23 /*
jbe@0 24 * This library contains derived data from a modified version of the
jbe@0 25 * Unicode data files.
jbe@0 26 *
jbe@0 27 * The original data files are available at
jbe@0 28 * http://www.unicode.org/Public/UNIDATA/
jbe@0 29 *
jbe@0 30 * Please notice the copyright statement in the file "utf8proc_data.c".
jbe@0 31 */
jbe@0 32
jbe@0 33
jbe@0 34 /*
jbe@0 35 * File name: utf8proc.c
jbe@0 36 *
jbe@0 37 * Description:
jbe@0 38 * Implementation of libutf8proc.
jbe@0 39 */
jbe@0 40
jbe@0 41
jbe@0 42 #include "utf8proc.h"
jbe@0 43 #include "utf8proc_data.c"
jbe@0 44
jbe@0 45
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence.
 * The entry is the total length in bytes of the sequence introduced by
 * that byte, or 0 if the byte cannot start a sequence.
 */
const int8_t utf8proc_utf8class[256] = {
  /* 0x00 - 0x7F: single byte sequences */
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 - 0xBF: continuation bytes, never a valid first byte */
  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0 - 0xDF: two byte sequences */
  2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 - 0xEF: three byte sequences */
  3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0 - 0xF7: four byte sequences; 0xF8 - 0xFF: invalid */
  4, 4, 4, 4, 4, 4, 4, 4,  0, 0, 0, 0, 0, 0, 0, 0
};
jbe@0 63
/* Base code points used for arithmetic Hangul (de)composition. */
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7

/* Number of L, V and T indices, and the counts derived from them. */
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
/* 21 * 28 = 588 */
#define UTF8PROC_HANGUL_NCOUNT \
  (UTF8PROC_HANGUL_VCOUNT * UTF8PROC_HANGUL_TCOUNT)
/* 19 * 588 = 11172 */
#define UTF8PROC_HANGUL_SCOUNT \
  (UTF8PROC_HANGUL_LCOUNT * UTF8PROC_HANGUL_NCOUNT)

/*
 * Code point ranges used for boundary classification.
 * START is inclusive, END is exclusive.
 */
#define UTF8PROC_HANGUL_L_START UTF8PROC_HANGUL_LBASE
#define UTF8PROC_HANGUL_L_END 0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START 0x1160
#define UTF8PROC_HANGUL_V_END 0x11A3
#define UTF8PROC_HANGUL_T_START 0x11A8
#define UTF8PROC_HANGUL_T_END 0x11FA
#define UTF8PROC_HANGUL_S_START UTF8PROC_HANGUL_SBASE
/* 0xAC00 + 11172 = 0xD7A4 */
#define UTF8PROC_HANGUL_S_END \
  (UTF8PROC_HANGUL_SBASE + UTF8PROC_HANGUL_SCOUNT)
jbe@2 83
jbe@2 84
/*
 * Boundary classes assigned to code points when looking for character
 * boundaries.  UTF8PROC_BOUNDCLASS_START is the state before the first
 * code point of a string has been seen.
 */
enum {
  UTF8PROC_BOUNDCLASS_START   = 0,
  UTF8PROC_BOUNDCLASS_OTHER   = 1,
  UTF8PROC_BOUNDCLASS_CR      = 2,
  UTF8PROC_BOUNDCLASS_LF      = 3,
  UTF8PROC_BOUNDCLASS_CONTROL = 4,
  UTF8PROC_BOUNDCLASS_EXTEND  = 5,
  UTF8PROC_BOUNDCLASS_L       = 6,
  UTF8PROC_BOUNDCLASS_V       = 7,
  UTF8PROC_BOUNDCLASS_T       = 8,
  UTF8PROC_BOUNDCLASS_LV      = 9,
  UTF8PROC_BOUNDCLASS_LVT     = 10
};
jbe@0 96
jbe@0 97
/*
 * Returns the version of this library as a NUL-terminated string.
 * The returned string has static storage duration and must not be freed.
 */
const char *utf8proc_version(void) {
  static const char version_string[] = "1.1.6";
  return version_string;
}
jbe@9 101
jbe@0 102 const char *utf8proc_errmsg(ssize_t errcode) {
jbe@0 103 switch (errcode) {
jbe@0 104 case UTF8PROC_ERROR_NOMEM:
jbe@0 105 return "Memory for processing UTF-8 data could not be allocated.";
jbe@0 106 case UTF8PROC_ERROR_OVERFLOW:
jbe@0 107 return "UTF-8 string is too long to be processed.";
jbe@0 108 case UTF8PROC_ERROR_INVALIDUTF8:
jbe@0 109 return "Invalid UTF-8 string";
jbe@0 110 case UTF8PROC_ERROR_NOTASSIGNED:
jbe@0 111 return "Unassigned Unicode code point found in UTF-8 string.";
jbe@3 112 case UTF8PROC_ERROR_INVALIDOPTS:
jbe@3 113 return "Invalid options for UTF-8 processing chosen.";
jbe@0 114 default:
jbe@0 115 return "An unknown error occured while processing UTF-8 data.";
jbe@0 116 }
jbe@0 117 }
jbe@0 118
jbe@7 119 ssize_t utf8proc_iterate(
jbe@7 120 const uint8_t *str, ssize_t strlen, int32_t *dst
jbe@7 121 ) {
jbe@0 122 int length;
jbe@0 123 int i;
jbe@0 124 int32_t uc = -1;
jbe@0 125 *dst = -1;
jbe@0 126 if (!strlen) return 0;
jbe@0 127 length = utf8proc_utf8class[str[0]];
jbe@0 128 if (!length) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 129 if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 130 for (i=1; i<length; i++) {
jbe@0 131 if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 132 }
jbe@0 133 switch (length) {
jbe@0 134 case 1:
jbe@0 135 uc = str[0];
jbe@0 136 break;
jbe@0 137 case 2:
jbe@0 138 uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
jbe@0 139 if (uc < 0x80) uc = -1;
jbe@0 140 break;
jbe@0 141 case 3:
jbe@0 142 uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
jbe@0 143 + (str[2] & 0x3F);
jbe@0 144 if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
jbe@0 145 (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
jbe@0 146 break;
jbe@0 147 case 4:
jbe@0 148 uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
jbe@0 149 + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
jbe@0 150 if (uc < 0x10000 || uc >= 0x110000) uc = -1;
jbe@0 151 break;
jbe@0 152 }
jbe@7 153 if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
jbe@7 154 return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 155 *dst = uc;
jbe@0 156 return length;
jbe@0 157 }
jbe@0 158
/*
 * Tells whether 'uc' is accepted as a code point by this library.
 * Rejected are negative values, values of 0x110000 and above, the range
 * 0xD800..0xDFFF, the range 0xFDD0..0xFDEF, and every value whose low
 * 16 bits are 0xFFFE or 0xFFFF.
 */
bool utf8proc_codepoint_valid(int32_t uc) {
  if (uc < 0 || uc >= 0x110000) return false;
  if (uc >= 0xD800 && uc < 0xE000) return false;
  if (uc >= 0xFDD0 && uc < 0xFDF0) return false;
  return (uc & 0xFFFF) < 0xFFFE;
}
jbe@7 165
/*
 * Writes the UTF-8 encoding of 'uc' to 'dst' and returns the number of
 * bytes written (1 to 4).  Returns 0 and writes nothing if 'uc' is
 * negative or not below 0x110000.
 *
 * The values 0xFFFE and 0xFFFF are special: they are written as the
 * single bytes 0xFE and 0xFF respectively.
 *
 * 'dst' must have room for the bytes written; no terminator is appended.
 */
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
  if (uc < 0x00 || uc >= 0x110000) return 0;

  if (uc < 0x80) {
    dst[0] = (uint8_t)uc;
    return 1;
  }
  if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 | (uc >> 6));
    dst[1] = (uint8_t)(0x80 | (uc & 0x3F));
    return 2;
  }
  if (uc == 0xFFFF || uc == 0xFFFE) {
    /* 0xFFFF -> byte 0xFF, 0xFFFE -> byte 0xFE */
    dst[0] = (uint8_t)(uc & 0xFF);
    return 1;
  }
  if (uc < 0x10000) {
    dst[0] = (uint8_t)(0xE0 | (uc >> 12));
    dst[1] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[2] = (uint8_t)(0x80 | (uc & 0x3F));
    return 3;
  }
  dst[0] = (uint8_t)(0xF0 | (uc >> 18));
  dst[1] = (uint8_t)(0x80 | ((uc >> 12) & 0x3F));
  dst[2] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
  dst[3] = (uint8_t)(0x80 | (uc & 0x3F));
  return 4;
}
jbe@0 195
jbe@0 196 const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
jbe@10 197 /* ASSERT: uc >= 0 && uc < 0x110000 */
jbe@0 198 return utf8proc_properties + (
jbe@0 199 utf8proc_stage2table[
jbe@0 200 utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
jbe@0 201 ]
jbe@0 202 );
jbe@0 203 }
jbe@0 204
jbe@3 205 #define utf8proc_decompose_lump(replacement_uc) \
jbe@3 206 return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
jbe@3 207 options & ~UTF8PROC_LUMP, last_boundclass)
jbe@3 208
jbe@0 209 ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
jbe@2 210 int options, int *last_boundclass) {
jbe@10 211 /* ASSERT: uc >= 0 && uc < 0x110000 */
jbe@0 212 const utf8proc_property_t *property;
jbe@3 213 utf8proc_propval_t category;
jbe@0 214 int32_t hangul_sindex;
jbe@0 215 property = utf8proc_get_property(uc);
jbe@3 216 category = property->category;
jbe@0 217 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
jbe@3 218 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
jbe@3 219 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
jbe@3 220 int32_t hangul_tindex;
jbe@3 221 if (bufsize >= 1) {
jbe@3 222 dst[0] = UTF8PROC_HANGUL_LBASE +
jbe@3 223 hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
jbe@3 224 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
jbe@3 225 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
jbe@3 226 }
jbe@3 227 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
jbe@3 228 if (!hangul_tindex) return 2;
jbe@3 229 if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
jbe@3 230 return 3;
jbe@0 231 }
jbe@3 232 }
jbe@3 233 if (options & UTF8PROC_REJECTNA) {
jbe@3 234 if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
jbe@3 235 }
jbe@3 236 if (options & UTF8PROC_IGNORE) {
jbe@3 237 if (property->ignorable) return 0;
jbe@3 238 }
jbe@3 239 if (options & UTF8PROC_LUMP) {
jbe@3 240 if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
jbe@3 241 if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
jbe@3 242 utf8proc_decompose_lump(0x0027);
jbe@3 243 if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
jbe@3 244 utf8proc_decompose_lump(0x002D);
jbe@3 245 if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
jbe@3 246 if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
jbe@3 247 if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
jbe@3 248 utf8proc_decompose_lump(0x003C);
jbe@3 249 if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
jbe@3 250 utf8proc_decompose_lump(0x003E);
jbe@3 251 if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
jbe@3 252 if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
jbe@3 253 utf8proc_decompose_lump(0x005E);
jbe@3 254 if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
jbe@3 255 utf8proc_decompose_lump(0x005F);
jbe@3 256 if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
jbe@3 257 if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
jbe@3 258 if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
jbe@3 259 if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
jbe@3 260 if (category == UTF8PROC_CATEGORY_ZL ||
jbe@3 261 category == UTF8PROC_CATEGORY_ZP)
jbe@3 262 utf8proc_decompose_lump(0x000A);
jbe@3 263 }
jbe@3 264 }
jbe@3 265 if (options & UTF8PROC_STRIPMARK) {
jbe@3 266 if (category == UTF8PROC_CATEGORY_MN ||
jbe@3 267 category == UTF8PROC_CATEGORY_MC ||
jbe@3 268 category == UTF8PROC_CATEGORY_ME) return 0;
jbe@3 269 }
jbe@3 270 if (options & UTF8PROC_CASEFOLD) {
jbe@3 271 if (property->casefold_mapping) {
jbe@3 272 const int32_t *casefold_entry;
jbe@3 273 ssize_t written = 0;
jbe@3 274 for (casefold_entry = property->casefold_mapping;
jbe@3 275 *casefold_entry >= 0; casefold_entry++) {
jbe@3 276 written += utf8proc_decompose_char(*casefold_entry, dst+written,
jbe@3 277 (bufsize > written) ? (bufsize - written) : 0, options,
jbe@3 278 last_boundclass);
jbe@3 279 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@3 280 }
jbe@3 281 return written;
jbe@3 282 }
jbe@3 283 }
jbe@3 284 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
jbe@3 285 if (property->decomp_mapping &&
jbe@3 286 (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
jbe@3 287 const int32_t *decomp_entry;
jbe@3 288 ssize_t written = 0;
jbe@3 289 for (decomp_entry = property->decomp_mapping;
jbe@3 290 *decomp_entry >= 0; decomp_entry++) {
jbe@3 291 written += utf8proc_decompose_char(*decomp_entry, dst+written,
jbe@3 292 (bufsize > written) ? (bufsize - written) : 0, options,
jbe@2 293 last_boundclass);
jbe@3 294 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@3 295 }
jbe@3 296 return written;
jbe@0 297 }
jbe@3 298 }
jbe@3 299 if (options & UTF8PROC_CHARBOUND) {
jbe@2 300 bool boundary;
jbe@2 301 int tbc, lbc;
jbe@2 302 tbc =
jbe@2 303 (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
jbe@2 304 (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
jbe@2 305 ((category == UTF8PROC_CATEGORY_ZL ||
jbe@2 306 category == UTF8PROC_CATEGORY_ZP ||
jbe@2 307 category == UTF8PROC_CATEGORY_CC ||
jbe@2 308 category == UTF8PROC_CATEGORY_CF) &&
jbe@2 309 !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
jbe@2 310 property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
jbe@2 311 ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
jbe@2 312 uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
jbe@2 313 (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
jbe@2 314 UTF8PROC_BOUNDCLASS_V :
jbe@2 315 (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
jbe@2 316 UTF8PROC_BOUNDCLASS_T :
jbe@2 317 (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
jbe@2 318 ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
jbe@2 319 UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
jbe@2 320 ) :
jbe@2 321 UTF8PROC_BOUNDCLASS_OTHER;
jbe@2 322 lbc = *last_boundclass;
jbe@2 323 boundary =
jbe@2 324 (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
jbe@2 325 (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
jbe@2 326 (lbc == UTF8PROC_BOUNDCLASS_CR &&
jbe@2 327 tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
jbe@2 328 (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
jbe@2 329 (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
jbe@2 330 (lbc == UTF8PROC_BOUNDCLASS_L &&
jbe@2 331 (tbc == UTF8PROC_BOUNDCLASS_L ||
jbe@2 332 tbc == UTF8PROC_BOUNDCLASS_V ||
jbe@2 333 tbc == UTF8PROC_BOUNDCLASS_LV ||
jbe@2 334 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
jbe@2 335 ((lbc == UTF8PROC_BOUNDCLASS_LV ||
jbe@2 336 lbc == UTF8PROC_BOUNDCLASS_V) &&
jbe@2 337 (tbc == UTF8PROC_BOUNDCLASS_V ||
jbe@2 338 tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
jbe@2 339 ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
jbe@2 340 lbc == UTF8PROC_BOUNDCLASS_T) &&
jbe@2 341 tbc == UTF8PROC_BOUNDCLASS_T) ? false :
jbe@2 342 true;
jbe@2 343 *last_boundclass = tbc;
jbe@2 344 if (boundary) {
jbe@2 345 if (bufsize >= 1) dst[0] = 0xFFFF;
jbe@2 346 if (bufsize >= 2) dst[1] = uc;
jbe@2 347 return 2;
jbe@2 348 }
jbe@0 349 }
jbe@2 350 if (bufsize >= 1) *dst = uc;
jbe@2 351 return 1;
jbe@0 352 }
jbe@0 353
jbe@7 354 ssize_t utf8proc_decompose(
jbe@7 355 const uint8_t *str, ssize_t strlen,
jbe@7 356 int32_t *buffer, ssize_t bufsize, int options
jbe@7 357 ) {
jbe@10 358 /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
jbe@0 359 ssize_t wpos = 0;
jbe@3 360 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
jbe@3 361 return UTF8PROC_ERROR_INVALIDOPTS;
jbe@3 362 if ((options & UTF8PROC_STRIPMARK) &&
jbe@3 363 !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
jbe@3 364 return UTF8PROC_ERROR_INVALIDOPTS;
jbe@0 365 {
jbe@0 366 int32_t uc;
jbe@0 367 ssize_t rpos = 0;
jbe@0 368 ssize_t decomp_result;
jbe@2 369 int boundclass = UTF8PROC_BOUNDCLASS_START;
jbe@0 370 while (1) {
jbe@0 371 if (options & UTF8PROC_NULLTERM) {
jbe@0 372 rpos += utf8proc_iterate(str + rpos, -1, &uc);
jbe@10 373 /* checking of return value is not neccessary,
jbe@10 374 as 'uc' is < 0 in case of error */
jbe@0 375 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 376 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@0 377 if (uc == 0) break;
jbe@0 378 } else {
jbe@0 379 if (rpos >= strlen) break;
jbe@0 380 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
jbe@0 381 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 382 }
jbe@0 383 decomp_result = utf8proc_decompose_char(
jbe@2 384 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
jbe@2 385 &boundclass
jbe@0 386 );
jbe@0 387 if (decomp_result < 0) return decomp_result;
jbe@0 388 wpos += decomp_result;
jbe@10 389 /* prohibiting integer overflows due to too long strings: */
jbe@0 390 if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)
jbe@0 391 return UTF8PROC_ERROR_OVERFLOW;
jbe@0 392 }
jbe@0 393 }
jbe@2 394 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
jbe@0 395 ssize_t pos = 0;
jbe@0 396 while (pos < wpos-1) {
jbe@0 397 int32_t uc1, uc2;
jbe@0 398 const utf8proc_property_t *property1, *property2;
jbe@0 399 uc1 = buffer[pos];
jbe@0 400 uc2 = buffer[pos+1];
jbe@0 401 property1 = utf8proc_get_property(uc1);
jbe@0 402 property2 = utf8proc_get_property(uc2);
jbe@0 403 if (property1->combining_class > property2->combining_class &&
jbe@0 404 property2->combining_class > 0) {
jbe@0 405 buffer[pos] = uc2;
jbe@0 406 buffer[pos+1] = uc1;
jbe@0 407 if (pos > 0) pos--; else pos++;
jbe@0 408 } else {
jbe@0 409 pos++;
jbe@0 410 }
jbe@0 411 }
jbe@0 412 }
jbe@0 413 return wpos;
jbe@0 414 }
jbe@0 415
jbe@0 416 ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
jbe@10 417 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
jbe@10 418 ASSERT: 'buffer' has one spare byte of free space at the end! */
jbe@0 419 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
jbe@0 420 ssize_t rpos;
jbe@0 421 ssize_t wpos = 0;
jbe@0 422 int32_t uc;
jbe@0 423 for (rpos = 0; rpos < length; rpos++) {
jbe@0 424 uc = buffer[rpos];
jbe@0 425 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
jbe@0 426 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
jbe@0 427 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
jbe@0 428 if (options & UTF8PROC_NLF2LS) {
jbe@0 429 if (options & UTF8PROC_NLF2PS) {
jbe@0 430 buffer[wpos++] = 0x000A;
jbe@0 431 } else {
jbe@0 432 buffer[wpos++] = 0x2028;
jbe@0 433 }
jbe@0 434 } else {
jbe@0 435 if (options & UTF8PROC_NLF2PS) {
jbe@0 436 buffer[wpos++] = 0x2029;
jbe@0 437 } else {
jbe@0 438 buffer[wpos++] = 0x0020;
jbe@0 439 }
jbe@0 440 }
jbe@0 441 } else if ((options & UTF8PROC_STRIPCC) &&
jbe@0 442 (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
jbe@0 443 if (uc == 0x0009) buffer[wpos++] = 0x0020;
jbe@0 444 } else {
jbe@0 445 buffer[wpos++] = uc;
jbe@0 446 }
jbe@0 447 }
jbe@0 448 length = wpos;
jbe@0 449 }
jbe@0 450 if (options & UTF8PROC_COMPOSE) {
jbe@0 451 int32_t *starter = NULL;
jbe@0 452 int32_t current_char;
jbe@0 453 const utf8proc_property_t *starter_property = NULL, *current_property;
jbe@3 454 utf8proc_propval_t max_combining_class = -1;
jbe@0 455 ssize_t rpos;
jbe@0 456 ssize_t wpos = 0;
jbe@0 457 int32_t composition;
jbe@0 458 for (rpos = 0; rpos < length; rpos++) {
jbe@0 459 current_char = buffer[rpos];
jbe@0 460 current_property = utf8proc_get_property(current_char);
jbe@0 461 if (starter && current_property->combining_class > max_combining_class) {
jbe@10 462 /* combination perhaps possible */
jbe@0 463 int32_t hangul_lindex;
jbe@0 464 int32_t hangul_sindex;
jbe@0 465 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
jbe@0 466 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
jbe@0 467 int32_t hangul_vindex;
jbe@0 468 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
jbe@0 469 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
jbe@0 470 *starter = UTF8PROC_HANGUL_SBASE +
jbe@0 471 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
jbe@0 472 UTF8PROC_HANGUL_TCOUNT;
jbe@0 473 starter_property = NULL;
jbe@0 474 continue;
jbe@0 475 }
jbe@0 476 }
jbe@0 477 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
jbe@0 478 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
jbe@0 479 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
jbe@0 480 int32_t hangul_tindex;
jbe@0 481 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
jbe@0 482 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
jbe@0 483 *starter += hangul_tindex;
jbe@0 484 starter_property = NULL;
jbe@0 485 continue;
jbe@0 486 }
jbe@0 487 }
jbe@0 488 if (!starter_property) {
jbe@0 489 starter_property = utf8proc_get_property(*starter);
jbe@0 490 }
jbe@0 491 if (starter_property->comb1st_index >= 0 &&
jbe@0 492 current_property->comb2nd_index >= 0) {
jbe@0 493 composition = utf8proc_combinations[
jbe@0 494 starter_property->comb1st_index +
jbe@0 495 current_property->comb2nd_index
jbe@0 496 ];
jbe@0 497 if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
jbe@0 498 !(utf8proc_get_property(composition)->comp_exclusion))) {
jbe@0 499 *starter = composition;
jbe@0 500 starter_property = NULL;
jbe@0 501 continue;
jbe@0 502 }
jbe@0 503 }
jbe@0 504 }
jbe@0 505 buffer[wpos] = current_char;
jbe@0 506 if (current_property->combining_class) {
jbe@0 507 if (current_property->combining_class > max_combining_class) {
jbe@0 508 max_combining_class = current_property->combining_class;
jbe@0 509 }
jbe@0 510 } else {
jbe@0 511 starter = buffer + wpos;
jbe@0 512 starter_property = NULL;
jbe@0 513 max_combining_class = -1;
jbe@0 514 }
jbe@0 515 wpos++;
jbe@0 516 }
jbe@0 517 length = wpos;
jbe@0 518 }
jbe@0 519 {
jbe@0 520 ssize_t rpos, wpos = 0;
jbe@0 521 int32_t uc;
jbe@0 522 for (rpos = 0; rpos < length; rpos++) {
jbe@0 523 uc = buffer[rpos];
jbe@0 524 wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);
jbe@0 525 }
jbe@0 526 ((uint8_t *)buffer)[wpos] = 0;
jbe@0 527 return wpos;
jbe@0 528 }
jbe@0 529 }
jbe@0 530
jbe@7 531 ssize_t utf8proc_map(
jbe@7 532 const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
jbe@7 533 ) {
jbe@0 534 int32_t *buffer;
jbe@0 535 ssize_t result;
jbe@0 536 *dstptr = NULL;
jbe@0 537 result = utf8proc_decompose(str, strlen, NULL, 0, options);
jbe@0 538 if (result < 0) return result;
jbe@0 539 buffer = malloc(result * sizeof(int32_t) + 1);
jbe@0 540 if (!buffer) return UTF8PROC_ERROR_NOMEM;
jbe@0 541 result = utf8proc_decompose(str, strlen, buffer, result, options);
jbe@0 542 if (result < 0) {
jbe@0 543 free(buffer);
jbe@0 544 return result;
jbe@0 545 }
jbe@0 546 result = utf8proc_reencode(buffer, result, options);
jbe@0 547 if (result < 0) {
jbe@0 548 free(buffer);
jbe@0 549 return result;
jbe@0 550 }
jbe@0 551 {
jbe@0 552 int32_t *newptr;
jbe@10 553 newptr = realloc(buffer, (size_t)result+1);
jbe@0 554 if (newptr) buffer = newptr;
jbe@0 555 }
jbe@0 556 *dstptr = (uint8_t *)buffer;
jbe@0 557 return result;
jbe@0 558 }
jbe@0 559
jbe@7 560 uint8_t *utf8proc_NFD(const uint8_t *str) {
jbe@0 561 uint8_t *retval;
jbe@2 562 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@2 563 UTF8PROC_DECOMPOSE);
jbe@0 564 return retval;
jbe@0 565 }
jbe@0 566
jbe@7 567 uint8_t *utf8proc_NFC(const uint8_t *str) {
jbe@0 568 uint8_t *retval;
jbe@0 569 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@0 570 UTF8PROC_COMPOSE);
jbe@0 571 return retval;
jbe@0 572 }
jbe@0 573
jbe@7 574 uint8_t *utf8proc_NFKD(const uint8_t *str) {
jbe@0 575 uint8_t *retval;
jbe@0 576 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@2 577 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
jbe@0 578 return retval;
jbe@0 579 }
jbe@0 580
jbe@7 581 uint8_t *utf8proc_NFKC(const uint8_t *str) {
jbe@0 582 uint8_t *retval;
jbe@0 583 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@0 584 UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
jbe@0 585 return retval;
jbe@0 586 }
jbe@0 587

Impressum / About Us