utf8proc

annotate utf8proc.c @ 9:951e73a98021

Version 1.1.3

- Added a function utf8proc_version returning a string containing the version number of the library.
- Included a target libutf8proc.dylib for MacOSX.
- PostgreSQL 8.3 compatibility (use of SET_VARSIZE macro)
author jbe
date Fri May 01 12:00:00 2009 +0200 (2009-05-01)
parents fcfd8c836c64
children 00d2bcbdc945
rev   line source
jbe@0 1 /*
jbe@7 2 * Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin
jbe@0 3 *
jbe@7 4 * Permission is hereby granted, free of charge, to any person obtaining a
jbe@7 5 * copy of this software and associated documentation files (the "Software"),
jbe@7 6 * to deal in the Software without restriction, including without limitation
jbe@7 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
jbe@7 8 * and/or sell copies of the Software, and to permit persons to whom the
jbe@7 9 * Software is furnished to do so, subject to the following conditions:
jbe@0 10 *
jbe@7 11 * The above copyright notice and this permission notice shall be included in
jbe@7 12 * all copies or substantial portions of the Software.
jbe@0 13 *
jbe@7 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jbe@7 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jbe@7 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
jbe@7 17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jbe@7 18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jbe@7 19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jbe@7 20 * DEALINGS IN THE SOFTWARE.
jbe@7 21 */
jbe@7 22
jbe@7 23 /*
jbe@0 24 * This library contains derived data from a modified version of the
jbe@0 25 * Unicode data files.
jbe@0 26 *
jbe@0 27 * The original data files are available at
jbe@0 28 * http://www.unicode.org/Public/UNIDATA/
jbe@0 29 *
jbe@0 30 * Please notice the copyright statement in the file "utf8proc_data.c".
jbe@0 31 */
jbe@0 32
jbe@0 33
jbe@0 34 /*
jbe@0 35 * File name: utf8proc.c
jbe@7 36 * Version: 1.1.3
jbe@7 37 * Last changed: 2009-05-01
jbe@0 38 *
jbe@0 39 * Description:
jbe@0 40 * Implementation of libutf8proc.
jbe@0 41 */
jbe@0 42
jbe@0 43
jbe@0 44 #include "utf8proc.h"
jbe@0 45 #include "utf8proc_data.c"
jbe@0 46
jbe@0 47
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence.
 * The entry is the total length of the sequence in bytes (1 to 4), or 0
 * if the byte cannot start a sequence.
 */
const int8_t utf8proc_utf8class[256] = {
  /* 0x00-0x7F: single byte sequences */
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80-0xBF: not valid as a first byte */
  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0-0xDF: two byte sequences */
  2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0-0xEF: three byte sequences */
  3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0-0xF7: four byte sequences; 0xF8-0xFF: not valid */
  4, 4, 4, 4, 4, 4, 4, 4,  0, 0, 0, 0, 0, 0, 0, 0
};
jbe@0 65
/*
 * Constants for the arithmetic (de)composition of Hangul syllables.
 * SBASE/LBASE/VBASE/TBASE are the first code points used as bases for the
 * syllable, leading, vowel and trailing indices; the *COUNT values are the
 * sizes of the corresponding index ranges.
 */
#define UTF8PROC_HANGUL_SBASE  0xAC00
#define UTF8PROC_HANGUL_LBASE  0x1100
#define UTF8PROC_HANGUL_VBASE  0x1161
#define UTF8PROC_HANGUL_TBASE  0x11A7
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
/* 21 * 28 = 588 */
#define UTF8PROC_HANGUL_NCOUNT \
  (UTF8PROC_HANGUL_VCOUNT * UTF8PROC_HANGUL_TCOUNT)
/* 19 * 588 = 11172 */
#define UTF8PROC_HANGUL_SCOUNT \
  (UTF8PROC_HANGUL_LCOUNT * UTF8PROC_HANGUL_NCOUNT)

/*
 * Code point ranges used to classify Hangul characters when detecting
 * character boundaries. Every *_END value is exclusive.
 */
#define UTF8PROC_HANGUL_L_START  UTF8PROC_HANGUL_LBASE
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  UTF8PROC_HANGUL_SBASE
/* 0xAC00 + 11172 = 0xD7A4 */
#define UTF8PROC_HANGUL_S_END \
  (UTF8PROC_HANGUL_SBASE + UTF8PROC_HANGUL_SCOUNT)
jbe@2 85
jbe@2 86
/*
 * Classes assigned to code points while detecting character boundaries
 * (option UTF8PROC_CHARBOUND). UTF8PROC_BOUNDCLASS_START is the initial
 * state before any code point has been processed.
 */
enum {
  UTF8PROC_BOUNDCLASS_START   = 0,
  UTF8PROC_BOUNDCLASS_OTHER   = 1,
  UTF8PROC_BOUNDCLASS_CR      = 2,
  UTF8PROC_BOUNDCLASS_LF      = 3,
  UTF8PROC_BOUNDCLASS_CONTROL = 4,
  UTF8PROC_BOUNDCLASS_EXTEND  = 5,
  UTF8PROC_BOUNDCLASS_L       = 6,
  UTF8PROC_BOUNDCLASS_V       = 7,
  UTF8PROC_BOUNDCLASS_T       = 8,
  UTF8PROC_BOUNDCLASS_LV      = 9,
  UTF8PROC_BOUNDCLASS_LVT     = 10
};
jbe@0 98
jbe@0 99
/*
 * Returns the version number of the library as a NUL-terminated string.
 * The string has static storage duration and must not be freed.
 */
const char *utf8proc_version(void) {
  static const char version[] = "1.1.3";
  return version;
}
jbe@9 103
jbe@0 104 const char *utf8proc_errmsg(ssize_t errcode) {
jbe@0 105 switch (errcode) {
jbe@0 106 case UTF8PROC_ERROR_NOMEM:
jbe@0 107 return "Memory for processing UTF-8 data could not be allocated.";
jbe@0 108 case UTF8PROC_ERROR_OVERFLOW:
jbe@0 109 return "UTF-8 string is too long to be processed.";
jbe@0 110 case UTF8PROC_ERROR_INVALIDUTF8:
jbe@0 111 return "Invalid UTF-8 string";
jbe@0 112 case UTF8PROC_ERROR_NOTASSIGNED:
jbe@0 113 return "Unassigned Unicode code point found in UTF-8 string.";
jbe@3 114 case UTF8PROC_ERROR_INVALIDOPTS:
jbe@3 115 return "Invalid options for UTF-8 processing chosen.";
jbe@0 116 default:
jbe@0 117 return "An unknown error occured while processing UTF-8 data.";
jbe@0 118 }
jbe@0 119 }
jbe@0 120
jbe@7 121 ssize_t utf8proc_iterate(
jbe@7 122 const uint8_t *str, ssize_t strlen, int32_t *dst
jbe@7 123 ) {
jbe@0 124 int length;
jbe@0 125 int i;
jbe@0 126 int32_t uc = -1;
jbe@0 127 *dst = -1;
jbe@0 128 if (!strlen) return 0;
jbe@0 129 length = utf8proc_utf8class[str[0]];
jbe@0 130 if (!length) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 131 if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 132 for (i=1; i<length; i++) {
jbe@0 133 if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 134 }
jbe@0 135 switch (length) {
jbe@0 136 case 1:
jbe@0 137 uc = str[0];
jbe@0 138 break;
jbe@0 139 case 2:
jbe@0 140 uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
jbe@0 141 if (uc < 0x80) uc = -1;
jbe@0 142 break;
jbe@0 143 case 3:
jbe@0 144 uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
jbe@0 145 + (str[2] & 0x3F);
jbe@0 146 if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
jbe@0 147 (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
jbe@0 148 break;
jbe@0 149 case 4:
jbe@0 150 uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
jbe@0 151 + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
jbe@0 152 if (uc < 0x10000 || uc >= 0x110000) uc = -1;
jbe@0 153 break;
jbe@0 154 }
jbe@7 155 if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
jbe@7 156 return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 157 *dst = uc;
jbe@0 158 return length;
jbe@0 159 }
jbe@0 160
/*
 * Checks whether 'uc' is a code point that the library accepts.
 *
 * Returns false for values outside 0..0x10FFFF, for the surrogate range
 * U+D800..U+DFFF, for U+FDD0..U+FDEF and for the last two code points of
 * every plane (U+xxFFFE and U+xxFFFF); returns true otherwise.
 */
bool utf8proc_codepoint_valid(int32_t uc) {
  if (uc < 0 || uc >= 0x110000) return false;
  if ((uc & 0xFFFF) >= 0xFFFE) return false;
  if (uc >= 0xD800 && uc < 0xE000) return false;
  if (uc >= 0xFDD0 && uc < 0xFDF0) return false;
  return true;
}
jbe@7 167
/*
 * Encodes the code point 'uc' as UTF-8 and stores the bytes at 'dst'.
 *
 * uc:  code point to encode.
 * dst: output buffer; up to 4 bytes are written.
 *
 * Returns the number of bytes written, or 0 (nothing written) if 'uc' is
 * negative or not below 0x110000.
 * The values 0xFFFF and 0xFFFE are not encoded as UTF-8: they are written
 * as the single bytes 0xFF and 0xFE respectively.
 */
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
  if (uc < 0x00) return 0;
  if (uc < 0x80) {
    dst[0] = (uint8_t)uc;
    return 1;
  }
  if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 | (uc >> 6));
    dst[1] = (uint8_t)(0x80 | (uc & 0x3F));
    return 2;
  }
  // special single byte markers, must be tested before the 3 byte range
  if (uc == 0xFFFF) {
    dst[0] = 0xFF;
    return 1;
  }
  if (uc == 0xFFFE) {
    dst[0] = 0xFE;
    return 1;
  }
  if (uc < 0x10000) {
    dst[0] = (uint8_t)(0xE0 | (uc >> 12));
    dst[1] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[2] = (uint8_t)(0x80 | (uc & 0x3F));
    return 3;
  }
  if (uc < 0x110000) {
    dst[0] = (uint8_t)(0xF0 | (uc >> 18));
    dst[1] = (uint8_t)(0x80 | ((uc >> 12) & 0x3F));
    dst[2] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[3] = (uint8_t)(0x80 | (uc & 0x3F));
    return 4;
  }
  return 0;
}
jbe@0 197
jbe@0 198 const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
jbe@0 199 // ASSERT: uc >= 0 && uc < 0x110000
jbe@0 200 return utf8proc_properties + (
jbe@0 201 utf8proc_stage2table[
jbe@0 202 utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
jbe@0 203 ]
jbe@0 204 );
jbe@0 205 }
jbe@0 206
jbe@3 207 #define utf8proc_decompose_lump(replacement_uc) \
jbe@3 208 return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
jbe@3 209 options & ~UTF8PROC_LUMP, last_boundclass)
jbe@3 210
jbe@0 211 ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
jbe@2 212 int options, int *last_boundclass) {
jbe@0 213 // ASSERT: uc >= 0 && uc < 0x110000
jbe@0 214 const utf8proc_property_t *property;
jbe@3 215 utf8proc_propval_t category;
jbe@0 216 int32_t hangul_sindex;
jbe@0 217 property = utf8proc_get_property(uc);
jbe@3 218 category = property->category;
jbe@0 219 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
jbe@3 220 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
jbe@3 221 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
jbe@3 222 int32_t hangul_tindex;
jbe@3 223 if (bufsize >= 1) {
jbe@3 224 dst[0] = UTF8PROC_HANGUL_LBASE +
jbe@3 225 hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
jbe@3 226 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
jbe@3 227 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
jbe@3 228 }
jbe@3 229 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
jbe@3 230 if (!hangul_tindex) return 2;
jbe@3 231 if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
jbe@3 232 return 3;
jbe@0 233 }
jbe@3 234 }
jbe@3 235 if (options & UTF8PROC_REJECTNA) {
jbe@3 236 if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
jbe@3 237 }
jbe@3 238 if (options & UTF8PROC_IGNORE) {
jbe@3 239 if (property->ignorable) return 0;
jbe@3 240 }
jbe@3 241 if (options & UTF8PROC_LUMP) {
jbe@3 242 if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
jbe@3 243 if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
jbe@3 244 utf8proc_decompose_lump(0x0027);
jbe@3 245 if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
jbe@3 246 utf8proc_decompose_lump(0x002D);
jbe@3 247 if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
jbe@3 248 if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
jbe@3 249 if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
jbe@3 250 utf8proc_decompose_lump(0x003C);
jbe@3 251 if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
jbe@3 252 utf8proc_decompose_lump(0x003E);
jbe@3 253 if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
jbe@3 254 if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
jbe@3 255 utf8proc_decompose_lump(0x005E);
jbe@3 256 if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
jbe@3 257 utf8proc_decompose_lump(0x005F);
jbe@3 258 if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
jbe@3 259 if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
jbe@3 260 if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
jbe@3 261 if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
jbe@3 262 if (category == UTF8PROC_CATEGORY_ZL ||
jbe@3 263 category == UTF8PROC_CATEGORY_ZP)
jbe@3 264 utf8proc_decompose_lump(0x000A);
jbe@3 265 }
jbe@3 266 }
jbe@3 267 if (options & UTF8PROC_STRIPMARK) {
jbe@3 268 if (category == UTF8PROC_CATEGORY_MN ||
jbe@3 269 category == UTF8PROC_CATEGORY_MC ||
jbe@3 270 category == UTF8PROC_CATEGORY_ME) return 0;
jbe@3 271 }
jbe@3 272 if (options & UTF8PROC_CASEFOLD) {
jbe@3 273 if (property->casefold_mapping) {
jbe@3 274 const int32_t *casefold_entry;
jbe@3 275 ssize_t written = 0;
jbe@3 276 for (casefold_entry = property->casefold_mapping;
jbe@3 277 *casefold_entry >= 0; casefold_entry++) {
jbe@3 278 written += utf8proc_decompose_char(*casefold_entry, dst+written,
jbe@3 279 (bufsize > written) ? (bufsize - written) : 0, options,
jbe@3 280 last_boundclass);
jbe@3 281 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@3 282 }
jbe@3 283 return written;
jbe@3 284 }
jbe@3 285 }
jbe@3 286 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
jbe@3 287 if (property->decomp_mapping &&
jbe@3 288 (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
jbe@3 289 const int32_t *decomp_entry;
jbe@3 290 ssize_t written = 0;
jbe@3 291 for (decomp_entry = property->decomp_mapping;
jbe@3 292 *decomp_entry >= 0; decomp_entry++) {
jbe@3 293 written += utf8proc_decompose_char(*decomp_entry, dst+written,
jbe@3 294 (bufsize > written) ? (bufsize - written) : 0, options,
jbe@2 295 last_boundclass);
jbe@3 296 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@3 297 }
jbe@3 298 return written;
jbe@0 299 }
jbe@3 300 }
jbe@3 301 if (options & UTF8PROC_CHARBOUND) {
jbe@2 302 bool boundary;
jbe@2 303 int tbc, lbc;
jbe@2 304 tbc =
jbe@2 305 (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
jbe@2 306 (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
jbe@2 307 ((category == UTF8PROC_CATEGORY_ZL ||
jbe@2 308 category == UTF8PROC_CATEGORY_ZP ||
jbe@2 309 category == UTF8PROC_CATEGORY_CC ||
jbe@2 310 category == UTF8PROC_CATEGORY_CF) &&
jbe@2 311 !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
jbe@2 312 property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
jbe@2 313 ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
jbe@2 314 uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
jbe@2 315 (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
jbe@2 316 UTF8PROC_BOUNDCLASS_V :
jbe@2 317 (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
jbe@2 318 UTF8PROC_BOUNDCLASS_T :
jbe@2 319 (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
jbe@2 320 ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
jbe@2 321 UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
jbe@2 322 ) :
jbe@2 323 UTF8PROC_BOUNDCLASS_OTHER;
jbe@2 324 lbc = *last_boundclass;
jbe@2 325 boundary =
jbe@2 326 (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
jbe@2 327 (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
jbe@2 328 (lbc == UTF8PROC_BOUNDCLASS_CR &&
jbe@2 329 tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
jbe@2 330 (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
jbe@2 331 (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
jbe@2 332 (lbc == UTF8PROC_BOUNDCLASS_L &&
jbe@2 333 (tbc == UTF8PROC_BOUNDCLASS_L ||
jbe@2 334 tbc == UTF8PROC_BOUNDCLASS_V ||
jbe@2 335 tbc == UTF8PROC_BOUNDCLASS_LV ||
jbe@2 336 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
jbe@2 337 ((lbc == UTF8PROC_BOUNDCLASS_LV ||
jbe@2 338 lbc == UTF8PROC_BOUNDCLASS_V) &&
jbe@2 339 (tbc == UTF8PROC_BOUNDCLASS_V ||
jbe@2 340 tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
jbe@2 341 ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
jbe@2 342 lbc == UTF8PROC_BOUNDCLASS_T) &&
jbe@2 343 tbc == UTF8PROC_BOUNDCLASS_T) ? false :
jbe@2 344 true;
jbe@2 345 *last_boundclass = tbc;
jbe@2 346 if (boundary) {
jbe@2 347 if (bufsize >= 1) dst[0] = 0xFFFF;
jbe@2 348 if (bufsize >= 2) dst[1] = uc;
jbe@2 349 return 2;
jbe@2 350 }
jbe@0 351 }
jbe@2 352 if (bufsize >= 1) *dst = uc;
jbe@2 353 return 1;
jbe@0 354 }
jbe@0 355
jbe@7 356 ssize_t utf8proc_decompose(
jbe@7 357 const uint8_t *str, ssize_t strlen,
jbe@7 358 int32_t *buffer, ssize_t bufsize, int options
jbe@7 359 ) {
jbe@0 360 // strlen will be ignored, if UTF8PROC_NULLTERM is set in options
jbe@0 361 ssize_t wpos = 0;
jbe@3 362 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
jbe@3 363 return UTF8PROC_ERROR_INVALIDOPTS;
jbe@3 364 if ((options & UTF8PROC_STRIPMARK) &&
jbe@3 365 !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
jbe@3 366 return UTF8PROC_ERROR_INVALIDOPTS;
jbe@0 367 {
jbe@0 368 int32_t uc;
jbe@0 369 ssize_t rpos = 0;
jbe@0 370 ssize_t decomp_result;
jbe@2 371 int boundclass = UTF8PROC_BOUNDCLASS_START;
jbe@0 372 while (1) {
jbe@0 373 if (options & UTF8PROC_NULLTERM) {
jbe@0 374 rpos += utf8proc_iterate(str + rpos, -1, &uc);
jbe@0 375 // checking of return value is not neccessary,
jbe@0 376 // as 'uc' is < 0 in case of error
jbe@0 377 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 378 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@0 379 if (uc == 0) break;
jbe@0 380 } else {
jbe@0 381 if (rpos >= strlen) break;
jbe@0 382 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
jbe@0 383 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 384 }
jbe@0 385 decomp_result = utf8proc_decompose_char(
jbe@2 386 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
jbe@2 387 &boundclass
jbe@0 388 );
jbe@0 389 if (decomp_result < 0) return decomp_result;
jbe@0 390 wpos += decomp_result;
jbe@0 391 // prohibiting integer overflows due to too long strings:
jbe@0 392 if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)
jbe@0 393 return UTF8PROC_ERROR_OVERFLOW;
jbe@0 394 }
jbe@0 395 }
jbe@2 396 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
jbe@0 397 ssize_t pos = 0;
jbe@0 398 while (pos < wpos-1) {
jbe@0 399 int32_t uc1, uc2;
jbe@0 400 const utf8proc_property_t *property1, *property2;
jbe@0 401 uc1 = buffer[pos];
jbe@0 402 uc2 = buffer[pos+1];
jbe@0 403 property1 = utf8proc_get_property(uc1);
jbe@0 404 property2 = utf8proc_get_property(uc2);
jbe@0 405 if (property1->combining_class > property2->combining_class &&
jbe@0 406 property2->combining_class > 0) {
jbe@0 407 buffer[pos] = uc2;
jbe@0 408 buffer[pos+1] = uc1;
jbe@0 409 if (pos > 0) pos--; else pos++;
jbe@0 410 } else {
jbe@0 411 pos++;
jbe@0 412 }
jbe@0 413 }
jbe@0 414 }
jbe@0 415 return wpos;
jbe@0 416 }
jbe@0 417
jbe@0 418 ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
jbe@0 419 // UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
jbe@0 420 // ASSERT: 'buffer' has one spare byte of free space at the end!
jbe@0 421 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
jbe@0 422 ssize_t rpos;
jbe@0 423 ssize_t wpos = 0;
jbe@0 424 int32_t uc;
jbe@0 425 for (rpos = 0; rpos < length; rpos++) {
jbe@0 426 uc = buffer[rpos];
jbe@0 427 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
jbe@0 428 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
jbe@0 429 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
jbe@0 430 if (options & UTF8PROC_NLF2LS) {
jbe@0 431 if (options & UTF8PROC_NLF2PS) {
jbe@0 432 buffer[wpos++] = 0x000A;
jbe@0 433 } else {
jbe@0 434 buffer[wpos++] = 0x2028;
jbe@0 435 }
jbe@0 436 } else {
jbe@0 437 if (options & UTF8PROC_NLF2PS) {
jbe@0 438 buffer[wpos++] = 0x2029;
jbe@0 439 } else {
jbe@0 440 buffer[wpos++] = 0x0020;
jbe@0 441 }
jbe@0 442 }
jbe@0 443 } else if ((options & UTF8PROC_STRIPCC) &&
jbe@0 444 (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
jbe@0 445 if (uc == 0x0009) buffer[wpos++] = 0x0020;
jbe@0 446 } else {
jbe@0 447 buffer[wpos++] = uc;
jbe@0 448 }
jbe@0 449 }
jbe@0 450 length = wpos;
jbe@0 451 }
jbe@0 452 if (options & UTF8PROC_COMPOSE) {
jbe@0 453 int32_t *starter = NULL;
jbe@0 454 int32_t current_char;
jbe@0 455 const utf8proc_property_t *starter_property = NULL, *current_property;
jbe@3 456 utf8proc_propval_t max_combining_class = -1;
jbe@0 457 ssize_t rpos;
jbe@0 458 ssize_t wpos = 0;
jbe@0 459 int32_t composition;
jbe@0 460 for (rpos = 0; rpos < length; rpos++) {
jbe@0 461 current_char = buffer[rpos];
jbe@0 462 current_property = utf8proc_get_property(current_char);
jbe@0 463 if (starter && current_property->combining_class > max_combining_class) {
jbe@0 464 // combination perhaps possible
jbe@0 465 int32_t hangul_lindex;
jbe@0 466 int32_t hangul_sindex;
jbe@0 467 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
jbe@0 468 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
jbe@0 469 int32_t hangul_vindex;
jbe@0 470 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
jbe@0 471 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
jbe@0 472 *starter = UTF8PROC_HANGUL_SBASE +
jbe@0 473 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
jbe@0 474 UTF8PROC_HANGUL_TCOUNT;
jbe@0 475 starter_property = NULL;
jbe@0 476 continue;
jbe@0 477 }
jbe@0 478 }
jbe@0 479 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
jbe@0 480 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
jbe@0 481 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
jbe@0 482 int32_t hangul_tindex;
jbe@0 483 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
jbe@0 484 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
jbe@0 485 *starter += hangul_tindex;
jbe@0 486 starter_property = NULL;
jbe@0 487 continue;
jbe@0 488 }
jbe@0 489 }
jbe@0 490 if (!starter_property) {
jbe@0 491 starter_property = utf8proc_get_property(*starter);
jbe@0 492 }
jbe@0 493 if (starter_property->comb1st_index >= 0 &&
jbe@0 494 current_property->comb2nd_index >= 0) {
jbe@0 495 composition = utf8proc_combinations[
jbe@0 496 starter_property->comb1st_index +
jbe@0 497 current_property->comb2nd_index
jbe@0 498 ];
jbe@0 499 if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
jbe@0 500 !(utf8proc_get_property(composition)->comp_exclusion))) {
jbe@0 501 *starter = composition;
jbe@0 502 starter_property = NULL;
jbe@0 503 continue;
jbe@0 504 }
jbe@0 505 }
jbe@0 506 }
jbe@0 507 buffer[wpos] = current_char;
jbe@0 508 if (current_property->combining_class) {
jbe@0 509 if (current_property->combining_class > max_combining_class) {
jbe@0 510 max_combining_class = current_property->combining_class;
jbe@0 511 }
jbe@0 512 } else {
jbe@0 513 starter = buffer + wpos;
jbe@0 514 starter_property = NULL;
jbe@0 515 max_combining_class = -1;
jbe@0 516 }
jbe@0 517 wpos++;
jbe@0 518 }
jbe@0 519 length = wpos;
jbe@0 520 }
jbe@0 521 {
jbe@0 522 ssize_t rpos, wpos = 0;
jbe@0 523 int32_t uc;
jbe@0 524 for (rpos = 0; rpos < length; rpos++) {
jbe@0 525 uc = buffer[rpos];
jbe@0 526 wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);
jbe@0 527 }
jbe@0 528 ((uint8_t *)buffer)[wpos] = 0;
jbe@0 529 return wpos;
jbe@0 530 }
jbe@0 531 }
jbe@0 532
jbe@7 533 ssize_t utf8proc_map(
jbe@7 534 const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
jbe@7 535 ) {
jbe@0 536 int32_t *buffer;
jbe@0 537 ssize_t result;
jbe@0 538 *dstptr = NULL;
jbe@0 539 result = utf8proc_decompose(str, strlen, NULL, 0, options);
jbe@0 540 if (result < 0) return result;
jbe@0 541 buffer = malloc(result * sizeof(int32_t) + 1);
jbe@0 542 if (!buffer) return UTF8PROC_ERROR_NOMEM;
jbe@0 543 result = utf8proc_decompose(str, strlen, buffer, result, options);
jbe@0 544 if (result < 0) {
jbe@0 545 free(buffer);
jbe@0 546 return result;
jbe@0 547 }
jbe@0 548 result = utf8proc_reencode(buffer, result, options);
jbe@0 549 if (result < 0) {
jbe@0 550 free(buffer);
jbe@0 551 return result;
jbe@0 552 }
jbe@0 553 {
jbe@0 554 int32_t *newptr;
jbe@0 555 newptr = realloc(buffer, result+1);
jbe@0 556 if (newptr) buffer = newptr;
jbe@0 557 }
jbe@0 558 *dstptr = (uint8_t *)buffer;
jbe@0 559 return result;
jbe@0 560 }
jbe@0 561
jbe@7 562 uint8_t *utf8proc_NFD(const uint8_t *str) {
jbe@0 563 uint8_t *retval;
jbe@2 564 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@2 565 UTF8PROC_DECOMPOSE);
jbe@0 566 return retval;
jbe@0 567 }
jbe@0 568
jbe@7 569 uint8_t *utf8proc_NFC(const uint8_t *str) {
jbe@0 570 uint8_t *retval;
jbe@0 571 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@0 572 UTF8PROC_COMPOSE);
jbe@0 573 return retval;
jbe@0 574 }
jbe@0 575
jbe@7 576 uint8_t *utf8proc_NFKD(const uint8_t *str) {
jbe@0 577 uint8_t *retval;
jbe@0 578 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@2 579 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
jbe@0 580 return retval;
jbe@0 581 }
jbe@0 582
jbe@7 583 uint8_t *utf8proc_NFKC(const uint8_t *str) {
jbe@0 584 uint8_t *retval;
jbe@0 585 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@0 586 UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
jbe@0 587 return retval;
jbe@0 588 }
jbe@0 589

Impressum / About Us