utf8proc

annotate utf8proc.c @ 3:4ee0d5f54af1

Version 1.0

- added the LUMP option, which lumps certain characters together (see lump.txt) (also used for the PostgreSQL "unifold" function)
- added the STRIPMARK option, which strips marking characters (or marks of composed characters)
- deprecated ruby method String#char_ary in favour of String#utf8chars
author jbe
date Sun Sep 17 12:00:00 2006 +0200 (2006-09-17)
parents aaad485d5335
children fcfd8c836c64
rev   line source
jbe@0 1 /*
jbe@0 2 * Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany
jbe@0 3 * Author: Jan Behrens <jan.behrens@flexiguided.de>
jbe@0 4 * All rights reserved.
jbe@0 5 *
jbe@0 6 * Redistribution and use in source and binary forms, with or without
jbe@0 7 * modification, are permitted provided that the following conditions are
jbe@0 8 * met:
jbe@0 9 *
jbe@0 10 * 1. Redistributions of source code must retain the above copyright
jbe@0 11 * notice, this list of conditions and the following disclaimer.
jbe@0 12 * 2. Redistributions in binary form must reproduce the above copyright
jbe@0 13 * notice, this list of conditions and the following disclaimer in the
jbe@0 14 * documentation and/or other materials provided with the distribution.
jbe@0 15 * 3. Neither the name of the FlexiGuided GmbH nor the names of its
jbe@0 16 * contributors may be used to endorse or promote products derived from
jbe@0 17 * this software without specific prior written permission.
jbe@0 18 *
jbe@0 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
jbe@0 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
jbe@0 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
jbe@0 22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
jbe@0 23 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
jbe@0 24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
jbe@0 25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
jbe@0 26 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
jbe@0 27 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
jbe@0 28 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
jbe@0 29 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
jbe@0 30 *
jbe@0 31 *
jbe@0 32 * This library contains derived data from a modified version of the
jbe@0 33 * Unicode data files.
jbe@0 34 *
jbe@0 35 * The original data files are available at
jbe@0 36 * http://www.unicode.org/Public/UNIDATA/
jbe@0 37 *
jbe@0 38 * Please notice the copyright statement in the file "utf8proc_data.c".
jbe@0 39 *
jbe@0 40 */
jbe@0 41
jbe@0 42
jbe@0 43 /*
jbe@0 44 * File name: utf8proc.c
jbe@3 45 * Version: 1.0
jbe@3 46 * Last changed: 2006-09-17
jbe@0 47 *
jbe@0 48 * Description:
jbe@0 49 * Implementation of libutf8proc.
jbe@0 50 */
jbe@0 51
jbe@0 52
jbe@0 53 #include "utf8proc.h"
jbe@0 54 #include "utf8proc_data.c"
jbe@0 55
jbe@0 56
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence.
 * The entry is the total length of the sequence in bytes (1 to 4), or 0
 * if the byte can not start a sequence (utf8proc_iterate treats 0 as
 * invalid UTF-8).
 */
const int8_t utf8proc_utf8class[256] = {
  /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 - 0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 - 0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 - 0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 - 0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 - 0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 - 0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0 - 0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 - 0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 - 0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0 - 0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
};
jbe@0 74
/*
 * Constants for the arithmetic (de)composition of Hangul syllables:
 * base code points of the syllable block (S) and of the leading (L),
 * vowel (V) and trailing (T) jamo, followed by the element counts.
 */
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
// NCOUNT = VCOUNT * TCOUNT = 588
#define UTF8PROC_HANGUL_NCOUNT \
  (UTF8PROC_HANGUL_VCOUNT * UTF8PROC_HANGUL_TCOUNT)
// SCOUNT = LCOUNT * NCOUNT = 11172
#define UTF8PROC_HANGUL_SCOUNT \
  (UTF8PROC_HANGUL_LCOUNT * UTF8PROC_HANGUL_NCOUNT)

/*
 * Code point ranges used to classify Hangul characters when detecting
 * character boundaries. START is inclusive, END is exclusive.
 */
#define UTF8PROC_HANGUL_L_START UTF8PROC_HANGUL_LBASE
#define UTF8PROC_HANGUL_L_END 0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START 0x1160
#define UTF8PROC_HANGUL_V_END 0x11A3
#define UTF8PROC_HANGUL_T_START 0x11A8
#define UTF8PROC_HANGUL_T_END 0x11FA
#define UTF8PROC_HANGUL_S_START UTF8PROC_HANGUL_SBASE
// S_END = SBASE + SCOUNT = 0xD7A4
#define UTF8PROC_HANGUL_S_END \
  (UTF8PROC_HANGUL_SBASE + UTF8PROC_HANGUL_SCOUNT)
jbe@2 94
jbe@2 95
/*
 * Boundary classes assigned to code points while searching for character
 * boundaries (UTF8PROC_CHARBOUND). START is the state before the first
 * code point of a string has been processed.
 */
#define UTF8PROC_BOUNDCLASS_START    0
#define UTF8PROC_BOUNDCLASS_OTHER    1
#define UTF8PROC_BOUNDCLASS_CR       2
#define UTF8PROC_BOUNDCLASS_LF       3
#define UTF8PROC_BOUNDCLASS_CONTROL  4
#define UTF8PROC_BOUNDCLASS_EXTEND   5
// Hangul classes: leading jamo, vowel, trailing jamo, LV and LVT syllables
#define UTF8PROC_BOUNDCLASS_L        6
#define UTF8PROC_BOUNDCLASS_V        7
#define UTF8PROC_BOUNDCLASS_T        8
#define UTF8PROC_BOUNDCLASS_LV       9
#define UTF8PROC_BOUNDCLASS_LVT     10
jbe@0 107
jbe@0 108
jbe@0 109 const char *utf8proc_errmsg(ssize_t errcode) {
jbe@0 110 switch (errcode) {
jbe@0 111 case UTF8PROC_ERROR_NOMEM:
jbe@0 112 return "Memory for processing UTF-8 data could not be allocated.";
jbe@0 113 case UTF8PROC_ERROR_OVERFLOW:
jbe@0 114 return "UTF-8 string is too long to be processed.";
jbe@0 115 case UTF8PROC_ERROR_INVALIDUTF8:
jbe@0 116 return "Invalid UTF-8 string";
jbe@0 117 case UTF8PROC_ERROR_NOTASSIGNED:
jbe@0 118 return "Unassigned Unicode code point found in UTF-8 string.";
jbe@3 119 case UTF8PROC_ERROR_INVALIDOPTS:
jbe@3 120 return "Invalid options for UTF-8 processing chosen.";
jbe@0 121 default:
jbe@0 122 return "An unknown error occured while processing UTF-8 data.";
jbe@0 123 }
jbe@0 124 }
jbe@0 125
jbe@0 126 ssize_t utf8proc_iterate(uint8_t *str, ssize_t strlen, int32_t *dst) {
jbe@0 127 int length;
jbe@0 128 int i;
jbe@0 129 int32_t uc = -1;
jbe@0 130 *dst = -1;
jbe@0 131 if (!strlen) return 0;
jbe@0 132 length = utf8proc_utf8class[str[0]];
jbe@0 133 if (!length) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 134 if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 135 for (i=1; i<length; i++) {
jbe@0 136 if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 137 }
jbe@0 138 switch (length) {
jbe@0 139 case 1:
jbe@0 140 uc = str[0];
jbe@0 141 break;
jbe@0 142 case 2:
jbe@0 143 uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
jbe@0 144 if (uc < 0x80) uc = -1;
jbe@0 145 break;
jbe@0 146 case 3:
jbe@0 147 uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
jbe@0 148 + (str[2] & 0x3F);
jbe@0 149 if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
jbe@0 150 (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
jbe@0 151 break;
jbe@0 152 case 4:
jbe@0 153 uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
jbe@0 154 + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
jbe@0 155 if (uc < 0x10000 || uc >= 0x110000) uc = -1;
jbe@0 156 break;
jbe@0 157 }
jbe@0 158 if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 159 *dst = uc;
jbe@0 160 return length;
jbe@0 161 }
jbe@0 162
/*
 * Encodes the code point 'uc' as UTF-8 into 'dst' and returns the number
 * of bytes written (up to 4). 'dst' is not NUL terminated.
 *
 * Nothing is written and 0 is returned for negative values and for values
 * of 0x110000 and above. The values 0xFFFF and 0xFFFE are written as the
 * single bytes 0xFF and 0xFE respectively.
 */
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
  if (uc < 0x00 || uc >= 0x110000) return 0;
  if (uc < 0x80) {
    dst[0] = uc;
    return 1;
  }
  if (uc < 0x800) {
    dst[0] = 0xC0 + (uc >> 6);
    dst[1] = 0x80 + (uc & 0x3F);
    return 2;
  }
  if (uc == 0xFFFF || uc == 0xFFFE) {
    // 0xFFFF -> byte 0xFF, 0xFFFE -> byte 0xFE
    dst[0] = (uc == 0xFFFF) ? 0xFF : 0xFE;
    return 1;
  }
  if (uc < 0x10000) {
    dst[0] = 0xE0 + (uc >> 12);
    dst[1] = 0x80 + ((uc >> 6) & 0x3F);
    dst[2] = 0x80 + (uc & 0x3F);
    return 3;
  }
  dst[0] = 0xF0 + (uc >> 18);
  dst[1] = 0x80 + ((uc >> 12) & 0x3F);
  dst[2] = 0x80 + ((uc >> 6) & 0x3F);
  dst[3] = 0x80 + (uc & 0x3F);
  return 4;
}
jbe@0 192
jbe@0 193 const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
jbe@0 194 // ASSERT: uc >= 0 && uc < 0x110000
jbe@0 195 return utf8proc_properties + (
jbe@0 196 utf8proc_stage2table[
jbe@0 197 utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
jbe@0 198 ]
jbe@0 199 );
jbe@0 200 }
jbe@0 201
jbe@3 202 #define utf8proc_decompose_lump(replacement_uc) \
jbe@3 203 return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
jbe@3 204 options & ~UTF8PROC_LUMP, last_boundclass)
jbe@3 205
jbe@0 206 ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
jbe@2 207 int options, int *last_boundclass) {
jbe@0 208 // ASSERT: uc >= 0 && uc < 0x110000
jbe@0 209 const utf8proc_property_t *property;
jbe@3 210 utf8proc_propval_t category;
jbe@0 211 int32_t hangul_sindex;
jbe@0 212 property = utf8proc_get_property(uc);
jbe@3 213 category = property->category;
jbe@0 214 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
jbe@3 215 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
jbe@3 216 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
jbe@3 217 int32_t hangul_tindex;
jbe@3 218 if (bufsize >= 1) {
jbe@3 219 dst[0] = UTF8PROC_HANGUL_LBASE +
jbe@3 220 hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
jbe@3 221 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
jbe@3 222 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
jbe@3 223 }
jbe@3 224 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
jbe@3 225 if (!hangul_tindex) return 2;
jbe@3 226 if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
jbe@3 227 return 3;
jbe@0 228 }
jbe@3 229 }
jbe@3 230 if (options & UTF8PROC_REJECTNA) {
jbe@3 231 if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
jbe@3 232 }
jbe@3 233 if (options & UTF8PROC_IGNORE) {
jbe@3 234 if (property->ignorable) return 0;
jbe@3 235 }
jbe@3 236 if (options & UTF8PROC_LUMP) {
jbe@3 237 if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
jbe@3 238 if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
jbe@3 239 utf8proc_decompose_lump(0x0027);
jbe@3 240 if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
jbe@3 241 utf8proc_decompose_lump(0x002D);
jbe@3 242 if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
jbe@3 243 if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
jbe@3 244 if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
jbe@3 245 utf8proc_decompose_lump(0x003C);
jbe@3 246 if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
jbe@3 247 utf8proc_decompose_lump(0x003E);
jbe@3 248 if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
jbe@3 249 if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
jbe@3 250 utf8proc_decompose_lump(0x005E);
jbe@3 251 if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
jbe@3 252 utf8proc_decompose_lump(0x005F);
jbe@3 253 if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
jbe@3 254 if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
jbe@3 255 if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
jbe@3 256 if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
jbe@3 257 if (category == UTF8PROC_CATEGORY_ZL ||
jbe@3 258 category == UTF8PROC_CATEGORY_ZP)
jbe@3 259 utf8proc_decompose_lump(0x000A);
jbe@3 260 }
jbe@3 261 }
jbe@3 262 if (options & UTF8PROC_STRIPMARK) {
jbe@3 263 if (category == UTF8PROC_CATEGORY_MN ||
jbe@3 264 category == UTF8PROC_CATEGORY_MC ||
jbe@3 265 category == UTF8PROC_CATEGORY_ME) return 0;
jbe@3 266 }
jbe@3 267 if (options & UTF8PROC_CASEFOLD) {
jbe@3 268 if (property->casefold_mapping) {
jbe@3 269 const int32_t *casefold_entry;
jbe@3 270 ssize_t written = 0;
jbe@3 271 for (casefold_entry = property->casefold_mapping;
jbe@3 272 *casefold_entry >= 0; casefold_entry++) {
jbe@3 273 written += utf8proc_decompose_char(*casefold_entry, dst+written,
jbe@3 274 (bufsize > written) ? (bufsize - written) : 0, options,
jbe@3 275 last_boundclass);
jbe@3 276 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@3 277 }
jbe@3 278 return written;
jbe@3 279 }
jbe@3 280 }
jbe@3 281 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
jbe@3 282 if (property->decomp_mapping &&
jbe@3 283 (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
jbe@3 284 const int32_t *decomp_entry;
jbe@3 285 ssize_t written = 0;
jbe@3 286 for (decomp_entry = property->decomp_mapping;
jbe@3 287 *decomp_entry >= 0; decomp_entry++) {
jbe@3 288 written += utf8proc_decompose_char(*decomp_entry, dst+written,
jbe@3 289 (bufsize > written) ? (bufsize - written) : 0, options,
jbe@2 290 last_boundclass);
jbe@3 291 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@3 292 }
jbe@3 293 return written;
jbe@0 294 }
jbe@3 295 }
jbe@3 296 if (options & UTF8PROC_CHARBOUND) {
jbe@2 297 bool boundary;
jbe@2 298 int tbc, lbc;
jbe@2 299 tbc =
jbe@2 300 (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
jbe@2 301 (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
jbe@2 302 ((category == UTF8PROC_CATEGORY_ZL ||
jbe@2 303 category == UTF8PROC_CATEGORY_ZP ||
jbe@2 304 category == UTF8PROC_CATEGORY_CC ||
jbe@2 305 category == UTF8PROC_CATEGORY_CF) &&
jbe@2 306 !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
jbe@2 307 property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
jbe@2 308 ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
jbe@2 309 uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
jbe@2 310 (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
jbe@2 311 UTF8PROC_BOUNDCLASS_V :
jbe@2 312 (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
jbe@2 313 UTF8PROC_BOUNDCLASS_T :
jbe@2 314 (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
jbe@2 315 ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
jbe@2 316 UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
jbe@2 317 ) :
jbe@2 318 UTF8PROC_BOUNDCLASS_OTHER;
jbe@2 319 lbc = *last_boundclass;
jbe@2 320 boundary =
jbe@2 321 (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
jbe@2 322 (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
jbe@2 323 (lbc == UTF8PROC_BOUNDCLASS_CR &&
jbe@2 324 tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
jbe@2 325 (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
jbe@2 326 (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
jbe@2 327 (lbc == UTF8PROC_BOUNDCLASS_L &&
jbe@2 328 (tbc == UTF8PROC_BOUNDCLASS_L ||
jbe@2 329 tbc == UTF8PROC_BOUNDCLASS_V ||
jbe@2 330 tbc == UTF8PROC_BOUNDCLASS_LV ||
jbe@2 331 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
jbe@2 332 ((lbc == UTF8PROC_BOUNDCLASS_LV ||
jbe@2 333 lbc == UTF8PROC_BOUNDCLASS_V) &&
jbe@2 334 (tbc == UTF8PROC_BOUNDCLASS_V ||
jbe@2 335 tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
jbe@2 336 ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
jbe@2 337 lbc == UTF8PROC_BOUNDCLASS_T) &&
jbe@2 338 tbc == UTF8PROC_BOUNDCLASS_T) ? false :
jbe@2 339 true;
jbe@2 340 *last_boundclass = tbc;
jbe@2 341 if (boundary) {
jbe@2 342 if (bufsize >= 1) dst[0] = 0xFFFF;
jbe@2 343 if (bufsize >= 2) dst[1] = uc;
jbe@2 344 return 2;
jbe@2 345 }
jbe@0 346 }
jbe@2 347 if (bufsize >= 1) *dst = uc;
jbe@2 348 return 1;
jbe@0 349 }
jbe@0 350
jbe@0 351 ssize_t utf8proc_decompose(uint8_t *str, ssize_t strlen,
jbe@0 352 int32_t *buffer, ssize_t bufsize, int options) {
jbe@0 353 // strlen will be ignored, if UTF8PROC_NULLTERM is set in options
jbe@0 354 ssize_t wpos = 0;
jbe@3 355 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
jbe@3 356 return UTF8PROC_ERROR_INVALIDOPTS;
jbe@3 357 if ((options & UTF8PROC_STRIPMARK) &&
jbe@3 358 !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
jbe@3 359 return UTF8PROC_ERROR_INVALIDOPTS;
jbe@0 360 {
jbe@0 361 int32_t uc;
jbe@0 362 ssize_t rpos = 0;
jbe@0 363 ssize_t decomp_result;
jbe@2 364 int boundclass = UTF8PROC_BOUNDCLASS_START;
jbe@0 365 while (1) {
jbe@0 366 if (options & UTF8PROC_NULLTERM) {
jbe@0 367 rpos += utf8proc_iterate(str + rpos, -1, &uc);
jbe@0 368 // checking of return value is not neccessary,
jbe@0 369 // as 'uc' is < 0 in case of error
jbe@0 370 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 371 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
jbe@0 372 if (uc == 0) break;
jbe@0 373 } else {
jbe@0 374 if (rpos >= strlen) break;
jbe@0 375 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
jbe@0 376 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
jbe@0 377 }
jbe@0 378 decomp_result = utf8proc_decompose_char(
jbe@2 379 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
jbe@2 380 &boundclass
jbe@0 381 );
jbe@0 382 if (decomp_result < 0) return decomp_result;
jbe@0 383 wpos += decomp_result;
jbe@0 384 // prohibiting integer overflows due to too long strings:
jbe@0 385 if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)
jbe@0 386 return UTF8PROC_ERROR_OVERFLOW;
jbe@0 387 }
jbe@0 388 }
jbe@2 389 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
jbe@0 390 ssize_t pos = 0;
jbe@0 391 while (pos < wpos-1) {
jbe@0 392 int32_t uc1, uc2;
jbe@0 393 const utf8proc_property_t *property1, *property2;
jbe@0 394 uc1 = buffer[pos];
jbe@0 395 uc2 = buffer[pos+1];
jbe@0 396 property1 = utf8proc_get_property(uc1);
jbe@0 397 property2 = utf8proc_get_property(uc2);
jbe@0 398 if (property1->combining_class > property2->combining_class &&
jbe@0 399 property2->combining_class > 0) {
jbe@0 400 buffer[pos] = uc2;
jbe@0 401 buffer[pos+1] = uc1;
jbe@0 402 if (pos > 0) pos--; else pos++;
jbe@0 403 } else {
jbe@0 404 pos++;
jbe@0 405 }
jbe@0 406 }
jbe@0 407 }
jbe@0 408 return wpos;
jbe@0 409 }
jbe@0 410
jbe@0 411 ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
jbe@0 412 // UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
jbe@0 413 // ASSERT: 'buffer' has one spare byte of free space at the end!
jbe@0 414 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
jbe@0 415 ssize_t rpos;
jbe@0 416 ssize_t wpos = 0;
jbe@0 417 int32_t uc;
jbe@0 418 for (rpos = 0; rpos < length; rpos++) {
jbe@0 419 uc = buffer[rpos];
jbe@0 420 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
jbe@0 421 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
jbe@0 422 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
jbe@0 423 if (options & UTF8PROC_NLF2LS) {
jbe@0 424 if (options & UTF8PROC_NLF2PS) {
jbe@0 425 buffer[wpos++] = 0x000A;
jbe@0 426 } else {
jbe@0 427 buffer[wpos++] = 0x2028;
jbe@0 428 }
jbe@0 429 } else {
jbe@0 430 if (options & UTF8PROC_NLF2PS) {
jbe@0 431 buffer[wpos++] = 0x2029;
jbe@0 432 } else {
jbe@0 433 buffer[wpos++] = 0x0020;
jbe@0 434 }
jbe@0 435 }
jbe@0 436 } else if ((options & UTF8PROC_STRIPCC) &&
jbe@0 437 (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
jbe@0 438 if (uc == 0x0009) buffer[wpos++] = 0x0020;
jbe@0 439 } else {
jbe@0 440 buffer[wpos++] = uc;
jbe@0 441 }
jbe@0 442 }
jbe@0 443 length = wpos;
jbe@0 444 }
jbe@0 445 if (options & UTF8PROC_COMPOSE) {
jbe@0 446 int32_t *starter = NULL;
jbe@0 447 int32_t current_char;
jbe@0 448 const utf8proc_property_t *starter_property = NULL, *current_property;
jbe@3 449 utf8proc_propval_t max_combining_class = -1;
jbe@0 450 ssize_t rpos;
jbe@0 451 ssize_t wpos = 0;
jbe@0 452 int32_t composition;
jbe@0 453 for (rpos = 0; rpos < length; rpos++) {
jbe@0 454 current_char = buffer[rpos];
jbe@0 455 current_property = utf8proc_get_property(current_char);
jbe@0 456 if (starter && current_property->combining_class > max_combining_class) {
jbe@0 457 // combination perhaps possible
jbe@0 458 int32_t hangul_lindex;
jbe@0 459 int32_t hangul_sindex;
jbe@0 460 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
jbe@0 461 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
jbe@0 462 int32_t hangul_vindex;
jbe@0 463 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
jbe@0 464 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
jbe@0 465 *starter = UTF8PROC_HANGUL_SBASE +
jbe@0 466 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
jbe@0 467 UTF8PROC_HANGUL_TCOUNT;
jbe@0 468 starter_property = NULL;
jbe@0 469 continue;
jbe@0 470 }
jbe@0 471 }
jbe@0 472 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
jbe@0 473 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
jbe@0 474 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
jbe@0 475 int32_t hangul_tindex;
jbe@0 476 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
jbe@0 477 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
jbe@0 478 *starter += hangul_tindex;
jbe@0 479 starter_property = NULL;
jbe@0 480 continue;
jbe@0 481 }
jbe@0 482 }
jbe@0 483 if (!starter_property) {
jbe@0 484 starter_property = utf8proc_get_property(*starter);
jbe@0 485 }
jbe@0 486 if (starter_property->comb1st_index >= 0 &&
jbe@0 487 current_property->comb2nd_index >= 0) {
jbe@0 488 composition = utf8proc_combinations[
jbe@0 489 starter_property->comb1st_index +
jbe@0 490 current_property->comb2nd_index
jbe@0 491 ];
jbe@0 492 if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
jbe@0 493 !(utf8proc_get_property(composition)->comp_exclusion))) {
jbe@0 494 *starter = composition;
jbe@0 495 starter_property = NULL;
jbe@0 496 continue;
jbe@0 497 }
jbe@0 498 }
jbe@0 499 }
jbe@0 500 buffer[wpos] = current_char;
jbe@0 501 if (current_property->combining_class) {
jbe@0 502 if (current_property->combining_class > max_combining_class) {
jbe@0 503 max_combining_class = current_property->combining_class;
jbe@0 504 }
jbe@0 505 } else {
jbe@0 506 starter = buffer + wpos;
jbe@0 507 starter_property = NULL;
jbe@0 508 max_combining_class = -1;
jbe@0 509 }
jbe@0 510 wpos++;
jbe@0 511 }
jbe@0 512 length = wpos;
jbe@0 513 }
jbe@0 514 {
jbe@0 515 ssize_t rpos, wpos = 0;
jbe@0 516 int32_t uc;
jbe@0 517 for (rpos = 0; rpos < length; rpos++) {
jbe@0 518 uc = buffer[rpos];
jbe@0 519 wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);
jbe@0 520 }
jbe@0 521 ((uint8_t *)buffer)[wpos] = 0;
jbe@0 522 return wpos;
jbe@0 523 }
jbe@0 524 }
jbe@0 525
jbe@0 526 ssize_t utf8proc_map(uint8_t *str, ssize_t strlen, uint8_t **dstptr,
jbe@0 527 int options) {
jbe@0 528 int32_t *buffer;
jbe@0 529 ssize_t result;
jbe@0 530 *dstptr = NULL;
jbe@0 531 result = utf8proc_decompose(str, strlen, NULL, 0, options);
jbe@0 532 if (result < 0) return result;
jbe@0 533 buffer = malloc(result * sizeof(int32_t) + 1);
jbe@0 534 if (!buffer) return UTF8PROC_ERROR_NOMEM;
jbe@0 535 result = utf8proc_decompose(str, strlen, buffer, result, options);
jbe@0 536 if (result < 0) {
jbe@0 537 free(buffer);
jbe@0 538 return result;
jbe@0 539 }
jbe@0 540 result = utf8proc_reencode(buffer, result, options);
jbe@0 541 if (result < 0) {
jbe@0 542 free(buffer);
jbe@0 543 return result;
jbe@0 544 }
jbe@0 545 {
jbe@0 546 int32_t *newptr;
jbe@0 547 newptr = realloc(buffer, result+1);
jbe@0 548 if (newptr) buffer = newptr;
jbe@0 549 }
jbe@0 550 *dstptr = (uint8_t *)buffer;
jbe@0 551 return result;
jbe@0 552 }
jbe@0 553
jbe@0 554 uint8_t *utf8proc_NFD(uint8_t *str) {
jbe@0 555 uint8_t *retval;
jbe@2 556 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@2 557 UTF8PROC_DECOMPOSE);
jbe@0 558 return retval;
jbe@0 559 }
jbe@0 560
jbe@0 561 uint8_t *utf8proc_NFC(uint8_t *str) {
jbe@0 562 uint8_t *retval;
jbe@0 563 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@0 564 UTF8PROC_COMPOSE);
jbe@0 565 return retval;
jbe@0 566 }
jbe@0 567
jbe@0 568 uint8_t *utf8proc_NFKD(uint8_t *str) {
jbe@0 569 uint8_t *retval;
jbe@0 570 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@2 571 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
jbe@0 572 return retval;
jbe@0 573 }
jbe@0 574
jbe@0 575 uint8_t *utf8proc_NFKC(uint8_t *str) {
jbe@0 576 uint8_t *retval;
jbe@0 577 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
jbe@0 578 UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
jbe@0 579 return retval;
jbe@0 580 }
jbe@0 581
jbe@0 582

Impressum / About Us