utf8proc

annotate utf8proc.c @ 7:fcfd8c836c64

Version 1.1.1

- Added a new PostgreSQL function 'unistrip', which behaves like 'unifold', but also removes all character marks (e.g. accents).
- Changed license from BSD to MIT style.
- Added a new function 'utf8proc_codepoint_valid' to the C library.
- Changed compiler flags in Makefile from -g -O0 to -O2
- The Ruby script that was used to build the utf8proc_data.c file is now included in the distribution.
author jbe
date Sun Jul 22 12:00:00 2007 +0200 (2007-07-22)
parents 4ee0d5f54af1
children 951e73a98021
rev   line source
jbe@0 1 /*
jbe@7 2 * Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin
jbe@0 3 *
jbe@7 4 * Permission is hereby granted, free of charge, to any person obtaining a
jbe@7 5 * copy of this software and associated documentation files (the "Software"),
jbe@7 6 * to deal in the Software without restriction, including without limitation
jbe@7 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
jbe@7 8 * and/or sell copies of the Software, and to permit persons to whom the
jbe@7 9 * Software is furnished to do so, subject to the following conditions:
jbe@0 10 *
jbe@7 11 * The above copyright notice and this permission notice shall be included in
jbe@7 12 * all copies or substantial portions of the Software.
jbe@0 13 *
jbe@7 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jbe@7 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jbe@7 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
jbe@7 17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jbe@7 18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jbe@7 19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jbe@7 20 * DEALINGS IN THE SOFTWARE.
jbe@7 21 */
jbe@7 22
jbe@7 23 /*
jbe@0 24 * This library contains derived data from a modified version of the
jbe@0 25 * Unicode data files.
jbe@0 26 *
jbe@0 27 * The original data files are available at
jbe@0 28 * http://www.unicode.org/Public/UNIDATA/
jbe@0 29 *
jbe@0 30 * Please notice the copyright statement in the file "utf8proc_data.c".
jbe@0 31 */
jbe@0 32
jbe@0 33
jbe@0 34 /*
jbe@0 35 * File name: utf8proc.c
jbe@7 36 * Version: 1.1.1
jbe@7 37 * Last changed: 2007-07-22
jbe@0 38 *
jbe@0 39 * Description:
jbe@0 40 * Implementation of libutf8proc.
jbe@0 41 */
jbe@0 42
jbe@0 43
jbe@0 44 #include "utf8proc.h"
jbe@0 45 #include "utf8proc_data.c"
jbe@0 46
jbe@0 47
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence.
 *
 * Each entry is the total length in bytes (1 to 4) of the sequence which
 * that byte introduces, or 0 if the byte can not start a sequence at all
 * (0x80-0xBF and 0xF8-0xFF).
 */
const int8_t utf8proc_utf8class[256] = {
  /* 0x00-0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10-0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20-0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30-0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40-0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50-0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60-0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70-0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0-0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
};
jbe@0 65
/*
 * Constants for the arithmetic decomposition and composition of Hangul
 * syllables: base code points of the syllable block (S) and of the leading
 * (L), vowel (V) and trailing (T) jamo, followed by the element counts.
 * NCOUNT equals VCOUNT * TCOUNT and SCOUNT equals LCOUNT * NCOUNT.
 */
#define UTF8PROC_HANGUL_SBASE    0xAC00
#define UTF8PROC_HANGUL_LBASE    0x1100
#define UTF8PROC_HANGUL_VBASE    0x1161
#define UTF8PROC_HANGUL_TBASE    0x11A7
#define UTF8PROC_HANGUL_LCOUNT   19
#define UTF8PROC_HANGUL_VCOUNT   21
#define UTF8PROC_HANGUL_TCOUNT   28
#define UTF8PROC_HANGUL_NCOUNT   588
#define UTF8PROC_HANGUL_SCOUNT   11172

/*
 * Code point ranges used when classifying characters for boundary
 * detection. Every *_START value is inclusive, every *_END value is
 * exclusive.
 */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
jbe@2 85
jbe@2 86
/*
 * Boundary classes assigned to code points by utf8proc_decompose_char()
 * when UTF8PROC_CHARBOUND is requested. UTF8PROC_BOUNDCLASS_START is the
 * state before the first code point of a string has been processed.
 */
#define UTF8PROC_BOUNDCLASS_START    0
#define UTF8PROC_BOUNDCLASS_OTHER    1
#define UTF8PROC_BOUNDCLASS_CR       2
#define UTF8PROC_BOUNDCLASS_LF       3
#define UTF8PROC_BOUNDCLASS_CONTROL  4
#define UTF8PROC_BOUNDCLASS_EXTEND   5
#define UTF8PROC_BOUNDCLASS_L        6
#define UTF8PROC_BOUNDCLASS_V        7
#define UTF8PROC_BOUNDCLASS_T        8
#define UTF8PROC_BOUNDCLASS_LV       9
#define UTF8PROC_BOUNDCLASS_LVT     10
jbe@0 98
jbe@0 99
jbe@0 100 const char *utf8proc_errmsg(ssize_t errcode) {
jbe@0 101 switch (errcode) {
jbe@0 102 case UTF8PROC_ERROR_NOMEM:
jbe@0 103 return "Memory for processing UTF-8 data could not be allocated.";
jbe@0 104 case UTF8PROC_ERROR_OVERFLOW:
jbe@0 105 return "UTF-8 string is too long to be processed.";
jbe@0 106 case UTF8PROC_ERROR_INVALIDUTF8:
jbe@0 107 return "Invalid UTF-8 string";
jbe@0 108 case UTF8PROC_ERROR_NOTASSIGNED:
jbe@0 109 return "Unassigned Unicode code point found in UTF-8 string.";
jbe@3 110 case UTF8PROC_ERROR_INVALIDOPTS:
jbe@3 111 return "Invalid options for UTF-8 processing chosen.";
jbe@0 112 default:
jbe@0 113 return "An unknown error occured while processing UTF-8 data.";
jbe@0 114 }
jbe@0 115 }
jbe@0 116
jbe@7 117 ssize_t utf8proc_iterate(
jbe@7 118 const uint8_t *str, ssize_t strlen, int32_t *dst
jbe@7 119 ) {
jbe@0 120 int length;
jbe@0 121 int i;
jbe@0 122 int32_t uc = -1;
jbe@0 123 *dst = -1;
jbe@0 124 if (!strlen) return 0;
jbe@0 125 length = utf8proc_utf8class[str[0]];
jbe@0 126 if (!length) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 127 if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 128 for (i=1; i<length; i++) {
jbe@0 129 if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 130 }
jbe@0 131 switch (length) {
jbe@0 132 case 1:
jbe@0 133 uc = str[0];
jbe@0 134 break;
jbe@0 135 case 2:
jbe@0 136 uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
jbe@0 137 if (uc < 0x80) uc = -1;
jbe@0 138 break;
jbe@0 139 case 3:
jbe@0 140 uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
jbe@0 141 + (str[2] & 0x3F);
jbe@0 142 if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
jbe@0 143 (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
jbe@0 144 break;
jbe@0 145 case 4:
jbe@0 146 uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
jbe@0 147 + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
jbe@0 148 if (uc < 0x10000 || uc >= 0x110000) uc = -1;
jbe@0 149 break;
jbe@0 150 }
jbe@7 151 if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
jbe@7 152 return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 153 *dst = uc;
jbe@0 154 return length;
jbe@0 155 }
jbe@0 156
/*
 * Checks whether 'uc' is a code point this library accepts.
 *
 * Returns false for values outside 0..0x10FFFF, for the surrogate range
 * U+D800..U+DFFF, for U+FDD0..U+FDEF and for the last two code points
 * (xxFFFE, xxFFFF) of every plane; returns true otherwise.
 */
bool utf8proc_codepoint_valid(int32_t uc) {
  if (uc < 0 || uc >= 0x110000) return false;
  if (uc >= 0xD800 && uc < 0xE000) return false;
  if (uc >= 0xFDD0 && uc < 0xFDF0) return false;
  if ((uc & 0xFFFF) >= 0xFFFE) return false;
  return true;
}
jbe@7 163
/*
 * Encodes the code point 'uc' as UTF-8 into 'dst'.
 *
 * dst: output buffer; up to 4 bytes are written, no terminator is added.
 *
 * Returns the number of bytes written (1 to 4), or 0 if 'uc' is negative
 * or not below 0x110000. The values 0xFFFF and 0xFFFE are special: they
 * are written as the single bytes 0xFF and 0xFE respectively.
 */
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
  if (uc < 0x00) return 0;

  if (uc < 0x80) {
    dst[0] = (uint8_t)uc;
    return 1;
  }

  if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 | (uc >> 6));
    dst[1] = (uint8_t)(0x80 | (uc & 0x3F));
    return 2;
  }

  // these two values are emitted as single marker bytes
  if (uc == 0xFFFF || uc == 0xFFFE) {
    dst[0] = (uc == 0xFFFF) ? 0xFF : 0xFE;
    return 1;
  }

  if (uc < 0x10000) {
    dst[0] = (uint8_t)(0xE0 | (uc >> 12));
    dst[1] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[2] = (uint8_t)(0x80 | (uc & 0x3F));
    return 3;
  }

  if (uc < 0x110000) {
    dst[0] = (uint8_t)(0xF0 | (uc >> 18));
    dst[1] = (uint8_t)(0x80 | ((uc >> 12) & 0x3F));
    dst[2] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[3] = (uint8_t)(0x80 | (uc & 0x3F));
    return 4;
  }

  return 0;
}
jbe@0 193
jbe@0 194 const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
jbe@0 195 // ASSERT: uc >= 0 && uc < 0x110000
jbe@0 196 return utf8proc_properties + (
jbe@0 197 utf8proc_stage2table[
jbe@0 198 utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
jbe@0 199 ]
jbe@0 200 );
jbe@0 201 }
jbe@0 202
/*
 * Helper for utf8proc_decompose_char(): returns from the enclosing function
 * with the result of processing 'replacement_uc' instead of the current
 * code point. UTF8PROC_LUMP is cleared for that call, so the replacement is
 * not lumped again. Relies on the locals 'dst', 'bufsize', 'options' and
 * 'last_boundclass' of the enclosing function.
 */
#define utf8proc_decompose_lump(replacement_uc) \
  return utf8proc_decompose_char( \
    (replacement_uc), dst, bufsize, \
    options & ~UTF8PROC_LUMP, last_boundclass \
  )
jbe@3 206
jbe@0 207 ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
jbe@2 208 int options, int *last_boundclass) {
jbe@0 209 // ASSERT: uc >= 0 && uc < 0x110000
jbe@0 210 const utf8proc_property_t *property;
jbe@3 211 utf8proc_propval_t category;
jbe@0 212 int32_t hangul_sindex;
jbe@0 213 property = utf8proc_get_property(uc);
jbe@3 214 category = property->category;
jbe@0 215 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
jbe@3 216 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
jbe@3 217 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
jbe@3 218 int32_t hangul_tindex;
jbe@3 219 if (bufsize >= 1) {
jbe@3 220 dst[0] = UTF8PROC_HANGUL_LBASE +
jbe@3 221 hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
jbe@3 222 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
jbe@3 223 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
jbe@3 224 }
jbe@3 225 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
jbe@3 226 if (!hangul_tindex) return 2;
jbe@3 227 if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
jbe@3 228 return 3;
jbe@0 229 }
jbe@3 230 }
jbe@3 231 if (options & UTF8PROC_REJECTNA) {
jbe@3 232 if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
jbe@3 233 }
jbe@3 234 if (options & UTF8PROC_IGNORE) {
jbe@3 235 if (property->ignorable) return 0;
jbe@3 236 }
jbe@3 237 if (options & UTF8PROC_LUMP) {
jbe@3 238 if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
jbe@3 239 if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
jbe@3 240 utf8proc_decompose_lump(0x0027);
jbe@3 241 if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
jbe@3 242 utf8proc_decompose_lump(0x002D);
jbe@3 243 if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
jbe@3 244 if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
jbe@3 245 if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
jbe@3 246 utf8proc_decompose_lump(0x003C);
jbe@3 247 if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
jbe@3 248 utf8proc_decompose_lump(0x003E);
jbe@3 249 if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
jbe@3 250 if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
jbe@3 251 utf8proc_decompose_lump(0x005E);
jbe@3 252 if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
jbe@3 253 utf8proc_decompose_lump(0x005F);
jbe@3 254 if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
jbe@3 255 if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
jbe@3 256 if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
jbe@3 257 if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
jbe@3 258 if (category == UTF8PROC_CATEGORY_ZL ||
jbe@3 259 category == UTF8PROC_CATEGORY_ZP)
jbe@3 260 utf8proc_decompose_lump(0x000A);
jbe@3 261 }
jbe@3 262 }
jbe@3 263 if (options & UTF8PROC_STRIPMARK) {
jbe@3 264 if (category == UTF8PROC_CATEGORY_MN ||
jbe@3 265 category == UTF8PROC_CATEGORY_MC ||
jbe@3 266 category == UTF8PROC_CATEGORY_ME) return 0;
jbe@3 267 }
jbe@3 268 if (options & UTF8PROC_CASEFOLD) {
jbe@3 269 if (property->casefold_mapping) {
jbe@3 270 const int32_t *casefold_entry;
jbe@3 271 ssize_t written = 0;
jbe@3 272 for (casefold_entry = property->casefold_mapping;
jbe@3 273 *casefold_entry >= 0; casefold_entry++) {
jbe@3 274 written += utf8proc_decompose_char(*casefold_entry, dst+written,
jbe@3 275 (bufsize > written) ? (bufsize - written) : 0, options,
jbe@3 276 last_boundclass);
jbe@3 277 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@3 278 }
jbe@3 279 return written;
jbe@3 280 }
jbe@3 281 }
jbe@3 282 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
jbe@3 283 if (property->decomp_mapping &&
jbe@3 284 (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
jbe@3 285 const int32_t *decomp_entry;
jbe@3 286 ssize_t written = 0;
jbe@3 287 for (decomp_entry = property->decomp_mapping;
jbe@3 288 *decomp_entry >= 0; decomp_entry++) {
jbe@3 289 written += utf8proc_decompose_char(*decomp_entry, dst+written,
jbe@3 290 (bufsize > written) ? (bufsize - written) : 0, options,
jbe@2 291 last_boundclass);
jbe@3 292 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@3 293 }
jbe@3 294 return written;
jbe@0 295 }
jbe@3 296 }
jbe@3 297 if (options & UTF8PROC_CHARBOUND) {
jbe@2 298 bool boundary;
jbe@2 299 int tbc, lbc;
jbe@2 300 tbc =
jbe@2 301 (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
jbe@2 302 (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
jbe@2 303 ((category == UTF8PROC_CATEGORY_ZL ||
jbe@2 304 category == UTF8PROC_CATEGORY_ZP ||
jbe@2 305 category == UTF8PROC_CATEGORY_CC ||
jbe@2 306 category == UTF8PROC_CATEGORY_CF) &&
jbe@2 307 !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
jbe@2 308 property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
jbe@2 309 ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
jbe@2 310 uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
jbe@2 311 (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
jbe@2 312 UTF8PROC_BOUNDCLASS_V :
jbe@2 313 (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
jbe@2 314 UTF8PROC_BOUNDCLASS_T :
jbe@2 315 (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
jbe@2 316 ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
jbe@2 317 UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
jbe@2 318 ) :
jbe@2 319 UTF8PROC_BOUNDCLASS_OTHER;
jbe@2 320 lbc = *last_boundclass;
jbe@2 321 boundary =
jbe@2 322 (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
jbe@2 323 (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
jbe@2 324 (lbc == UTF8PROC_BOUNDCLASS_CR &&
jbe@2 325 tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
jbe@2 326 (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
jbe@2 327 (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
jbe@2 328 (lbc == UTF8PROC_BOUNDCLASS_L &&
jbe@2 329 (tbc == UTF8PROC_BOUNDCLASS_L ||
jbe@2 330 tbc == UTF8PROC_BOUNDCLASS_V ||
jbe@2 331 tbc == UTF8PROC_BOUNDCLASS_LV ||
jbe@2 332 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
jbe@2 333 ((lbc == UTF8PROC_BOUNDCLASS_LV ||
jbe@2 334 lbc == UTF8PROC_BOUNDCLASS_V) &&
jbe@2 335 (tbc == UTF8PROC_BOUNDCLASS_V ||
jbe@2 336 tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
jbe@2 337 ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
jbe@2 338 lbc == UTF8PROC_BOUNDCLASS_T) &&
jbe@2 339 tbc == UTF8PROC_BOUNDCLASS_T) ? false :
jbe@2 340 true;
jbe@2 341 *last_boundclass = tbc;
jbe@2 342 if (boundary) {
jbe@2 343 if (bufsize >= 1) dst[0] = 0xFFFF;
jbe@2 344 if (bufsize >= 2) dst[1] = uc;
jbe@2 345 return 2;
jbe@2 346 }
jbe@0 347 }
jbe@2 348 if (bufsize >= 1) *dst = uc;
jbe@2 349 return 1;
jbe@0 350 }
jbe@0 351
jbe@7 352 ssize_t utf8proc_decompose(
jbe@7 353 const uint8_t *str, ssize_t strlen,
jbe@7 354 int32_t *buffer, ssize_t bufsize, int options
jbe@7 355 ) {
jbe@0 356 // strlen will be ignored, if UTF8PROC_NULLTERM is set in options
jbe@0 357 ssize_t wpos = 0;
jbe@3 358 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
jbe@3 359 return UTF8PROC_ERROR_INVALIDOPTS;
jbe@3 360 if ((options & UTF8PROC_STRIPMARK) &&
jbe@3 361 !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
jbe@3 362 return UTF8PROC_ERROR_INVALIDOPTS;
jbe@0 363 {
jbe@0 364 int32_t uc;
jbe@0 365 ssize_t rpos = 0;
jbe@0 366 ssize_t decomp_result;
jbe@2 367 int boundclass = UTF8PROC_BOUNDCLASS_START;
jbe@0 368 while (1) {
jbe@0 369 if (options & UTF8PROC_NULLTERM) {
jbe@0 370 rpos += utf8proc_iterate(str + rpos, -1, &uc);
jbe@0 371 // checking of return value is not neccessary,
jbe@0 372 // as 'uc' is < 0 in case of error
jbe@0 373 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 374 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@0 375 if (uc == 0) break;
jbe@0 376 } else {
jbe@0 377 if (rpos >= strlen) break;
jbe@0 378 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
jbe@0 379 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 380 }
jbe@0 381 decomp_result = utf8proc_decompose_char(
jbe@2 382 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
jbe@2 383 &boundclass
jbe@0 384 );
jbe@0 385 if (decomp_result < 0) return decomp_result;
jbe@0 386 wpos += decomp_result;
jbe@0 387 // prohibiting integer overflows due to too long strings:
jbe@0 388 if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)
jbe@0 389 return UTF8PROC_ERROR_OVERFLOW;
jbe@0 390 }
jbe@0 391 }
jbe@2 392 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
jbe@0 393 ssize_t pos = 0;
jbe@0 394 while (pos < wpos-1) {
jbe@0 395 int32_t uc1, uc2;
jbe@0 396 const utf8proc_property_t *property1, *property2;
jbe@0 397 uc1 = buffer[pos];
jbe@0 398 uc2 = buffer[pos+1];
jbe@0 399 property1 = utf8proc_get_property(uc1);
jbe@0 400 property2 = utf8proc_get_property(uc2);
jbe@0 401 if (property1->combining_class > property2->combining_class &&
jbe@0 402 property2->combining_class > 0) {
jbe@0 403 buffer[pos] = uc2;
jbe@0 404 buffer[pos+1] = uc1;
jbe@0 405 if (pos > 0) pos--; else pos++;
jbe@0 406 } else {
jbe@0 407 pos++;
jbe@0 408 }
jbe@0 409 }
jbe@0 410 }
jbe@0 411 return wpos;
jbe@0 412 }
jbe@0 413
jbe@0 414 ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
jbe@0 415 // UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
jbe@0 416 // ASSERT: 'buffer' has one spare byte of free space at the end!
jbe@0 417 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
jbe@0 418 ssize_t rpos;
jbe@0 419 ssize_t wpos = 0;
jbe@0 420 int32_t uc;
jbe@0 421 for (rpos = 0; rpos < length; rpos++) {
jbe@0 422 uc = buffer[rpos];
jbe@0 423 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
jbe@0 424 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
jbe@0 425 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
jbe@0 426 if (options & UTF8PROC_NLF2LS) {
jbe@0 427 if (options & UTF8PROC_NLF2PS) {
jbe@0 428 buffer[wpos++] = 0x000A;
jbe@0 429 } else {
jbe@0 430 buffer[wpos++] = 0x2028;
jbe@0 431 }
jbe@0 432 } else {
jbe@0 433 if (options & UTF8PROC_NLF2PS) {
jbe@0 434 buffer[wpos++] = 0x2029;
jbe@0 435 } else {
jbe@0 436 buffer[wpos++] = 0x0020;
jbe@0 437 }
jbe@0 438 }
jbe@0 439 } else if ((options & UTF8PROC_STRIPCC) &&
jbe@0 440 (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
jbe@0 441 if (uc == 0x0009) buffer[wpos++] = 0x0020;
jbe@0 442 } else {
jbe@0 443 buffer[wpos++] = uc;
jbe@0 444 }
jbe@0 445 }
jbe@0 446 length = wpos;
jbe@0 447 }
jbe@0 448 if (options & UTF8PROC_COMPOSE) {
jbe@0 449 int32_t *starter = NULL;
jbe@0 450 int32_t current_char;
jbe@0 451 const utf8proc_property_t *starter_property = NULL, *current_property;
jbe@3 452 utf8proc_propval_t max_combining_class = -1;
jbe@0 453 ssize_t rpos;
jbe@0 454 ssize_t wpos = 0;
jbe@0 455 int32_t composition;
jbe@0 456 for (rpos = 0; rpos < length; rpos++) {
jbe@0 457 current_char = buffer[rpos];
jbe@0 458 current_property = utf8proc_get_property(current_char);
jbe@0 459 if (starter && current_property->combining_class > max_combining_class) {
jbe@0 460 // combination perhaps possible
jbe@0 461 int32_t hangul_lindex;
jbe@0 462 int32_t hangul_sindex;
jbe@0 463 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
jbe@0 464 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
jbe@0 465 int32_t hangul_vindex;
jbe@0 466 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
jbe@0 467 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
jbe@0 468 *starter = UTF8PROC_HANGUL_SBASE +
jbe@0 469 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
jbe@0 470 UTF8PROC_HANGUL_TCOUNT;
jbe@0 471 starter_property = NULL;
jbe@0 472 continue;
jbe@0 473 }
jbe@0 474 }
jbe@0 475 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
jbe@0 476 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
jbe@0 477 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
jbe@0 478 int32_t hangul_tindex;
jbe@0 479 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
jbe@0 480 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
jbe@0 481 *starter += hangul_tindex;
jbe@0 482 starter_property = NULL;
jbe@0 483 continue;
jbe@0 484 }
jbe@0 485 }
jbe@0 486 if (!starter_property) {
jbe@0 487 starter_property = utf8proc_get_property(*starter);
jbe@0 488 }
jbe@0 489 if (starter_property->comb1st_index >= 0 &&
jbe@0 490 current_property->comb2nd_index >= 0) {
jbe@0 491 composition = utf8proc_combinations[
jbe@0 492 starter_property->comb1st_index +
jbe@0 493 current_property->comb2nd_index
jbe@0 494 ];
jbe@0 495 if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
jbe@0 496 !(utf8proc_get_property(composition)->comp_exclusion))) {
jbe@0 497 *starter = composition;
jbe@0 498 starter_property = NULL;
jbe@0 499 continue;
jbe@0 500 }
jbe@0 501 }
jbe@0 502 }
jbe@0 503 buffer[wpos] = current_char;
jbe@0 504 if (current_property->combining_class) {
jbe@0 505 if (current_property->combining_class > max_combining_class) {
jbe@0 506 max_combining_class = current_property->combining_class;
jbe@0 507 }
jbe@0 508 } else {
jbe@0 509 starter = buffer + wpos;
jbe@0 510 starter_property = NULL;
jbe@0 511 max_combining_class = -1;
jbe@0 512 }
jbe@0 513 wpos++;
jbe@0 514 }
jbe@0 515 length = wpos;
jbe@0 516 }
jbe@0 517 {
jbe@0 518 ssize_t rpos, wpos = 0;
jbe@0 519 int32_t uc;
jbe@0 520 for (rpos = 0; rpos < length; rpos++) {
jbe@0 521 uc = buffer[rpos];
jbe@0 522 wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);
jbe@0 523 }
jbe@0 524 ((uint8_t *)buffer)[wpos] = 0;
jbe@0 525 return wpos;
jbe@0 526 }
jbe@0 527 }
jbe@0 528
jbe@7 529 ssize_t utf8proc_map(
jbe@7 530 const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
jbe@7 531 ) {
jbe@0 532 int32_t *buffer;
jbe@0 533 ssize_t result;
jbe@0 534 *dstptr = NULL;
jbe@0 535 result = utf8proc_decompose(str, strlen, NULL, 0, options);
jbe@0 536 if (result < 0) return result;
jbe@0 537 buffer = malloc(result * sizeof(int32_t) + 1);
jbe@0 538 if (!buffer) return UTF8PROC_ERROR_NOMEM;
jbe@0 539 result = utf8proc_decompose(str, strlen, buffer, result, options);
jbe@0 540 if (result < 0) {
jbe@0 541 free(buffer);
jbe@0 542 return result;
jbe@0 543 }
jbe@0 544 result = utf8proc_reencode(buffer, result, options);
jbe@0 545 if (result < 0) {
jbe@0 546 free(buffer);
jbe@0 547 return result;
jbe@0 548 }
jbe@0 549 {
jbe@0 550 int32_t *newptr;
jbe@0 551 newptr = realloc(buffer, result+1);
jbe@0 552 if (newptr) buffer = newptr;
jbe@0 553 }
jbe@0 554 *dstptr = (uint8_t *)buffer;
jbe@0 555 return result;
jbe@0 556 }
jbe@0 557
jbe@7 558 uint8_t *utf8proc_NFD(const uint8_t *str) {
jbe@0 559 uint8_t *retval;
jbe@2 560 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@2 561 UTF8PROC_DECOMPOSE);
jbe@0 562 return retval;
jbe@0 563 }
jbe@0 564
jbe@7 565 uint8_t *utf8proc_NFC(const uint8_t *str) {
jbe@0 566 uint8_t *retval;
jbe@0 567 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@0 568 UTF8PROC_COMPOSE);
jbe@0 569 return retval;
jbe@0 570 }
jbe@0 571
jbe@7 572 uint8_t *utf8proc_NFKD(const uint8_t *str) {
jbe@0 573 uint8_t *retval;
jbe@0 574 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@2 575 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
jbe@0 576 return retval;
jbe@0 577 }
jbe@0 578
jbe@7 579 uint8_t *utf8proc_NFKC(const uint8_t *str) {
jbe@0 580 uint8_t *retval;
jbe@0 581 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@0 582 UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
jbe@0 583 return retval;
jbe@0 584 }
jbe@0 585

Impressum / About Us