utf8proc
diff utf8proc.c @ 0:a0368662434c
Version 0.1
author | jbe |
---|---|
date | Fri Jun 02 12:00:00 2006 +0200 (2006-06-02) |
parents | |
children | 61a89ecc2fb9 |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/utf8proc.c Fri Jun 02 12:00:00 2006 +0200 1.3 @@ -0,0 +1,444 @@ 1.4 +/* 1.5 + * Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany 1.6 + * Author: Jan Behrens <jan.behrens@flexiguided.de> 1.7 + * All rights reserved. 1.8 + * 1.9 + * Redistribution and use in source and binary forms, with or without 1.10 + * modification, are permitted provided that the following conditions are 1.11 + * met: 1.12 + * 1.13 + * 1. Redistributions of source code must retain the above copyright 1.14 + * notice, this list of conditions and the following disclaimer. 1.15 + * 2. Redistributions in binary form must reproduce the above copyright 1.16 + * notice, this list of conditions and the following disclaimer in the 1.17 + * documentation and/or other materials provided with the distribution. 1.18 + * 3. Neither the name of the FlexiGuided GmbH nor the names of its 1.19 + * contributors may be used to endorse or promote products derived from 1.20 + * this software without specific prior written permission. 1.21 + * 1.22 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 1.23 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 1.24 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 1.25 + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 1.26 + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 1.27 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 1.28 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 1.29 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 1.30 + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 1.31 + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 1.32 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1.33 + * 1.34 + * 1.35 + * This library contains derived data from a modified version of the 1.36 + * Unicode data files. 1.37 + * 1.38 + * The original data files are available at 1.39 + * http://www.unicode.org/Public/UNIDATA/ 1.40 + * 1.41 + * Please notice the copyright statement in the file "utf8proc_data.c". 1.42 + * 1.43 + */ 1.44 + 1.45 + 1.46 +/* 1.47 + * File name: utf8proc.c 1.48 + * Version: 0.1 1.49 + * Last changed: 2006-05-31 1.50 + * 1.51 + * Description: 1.52 + * Implementation of libutf8proc. 1.53 + */ 1.54 + 1.55 + 1.56 +#include "utf8proc.h" 1.57 +#include "utf8proc_data.c" 1.58 + 1.59 + 1.60 +const int8_t utf8proc_utf8class[256] = { 1.61 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.62 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.63 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.64 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.65 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.66 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.67 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.68 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.69 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.70 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.71 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.72 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.73 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1.74 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1.75 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1.76 + 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; 1.77 + 1.78 +#define UTF8PROC_HANGUL_SBASE 0xAC00 1.79 +#define UTF8PROC_HANGUL_LBASE 0x1100 1.80 +#define UTF8PROC_HANGUL_VBASE 0x1161 1.81 +#define UTF8PROC_HANGUL_TBASE 0x11A7 1.82 +#define UTF8PROC_HANGUL_LCOUNT 19 1.83 +#define UTF8PROC_HANGUL_VCOUNT 21 1.84 +#define UTF8PROC_HANGUL_TCOUNT 28 1.85 +#define UTF8PROC_HANGUL_NCOUNT 588 1.86 +#define UTF8PROC_HANGUL_SCOUNT 11172 1.87 + 1.88 + 1.89 +const char *utf8proc_errmsg(ssize_t errcode) { 1.90 + switch (errcode) { 1.91 + case UTF8PROC_ERROR_NOMEM: 1.92 + return "Memory for processing UTF-8 data could not be allocated."; 1.93 + case UTF8PROC_ERROR_OVERFLOW: 1.94 + return "UTF-8 string is too long to be processed."; 1.95 + case UTF8PROC_ERROR_INVALIDUTF8: 1.96 + return "Invalid UTF-8 string"; 1.97 + case UTF8PROC_ERROR_NOTASSIGNED: 1.98 + return "Unassigned Unicode code point found in UTF-8 string."; 1.99 + default: 1.100 + return "An unknown error occured while processing UTF-8 data."; 1.101 + } 1.102 +} 1.103 + 1.104 +ssize_t utf8proc_iterate(uint8_t *str, ssize_t strlen, int32_t *dst) { 1.105 + int length; 1.106 + int i; 1.107 + int32_t uc = -1; 1.108 + *dst = -1; 1.109 + if (!strlen) return 0; 1.110 + length = utf8proc_utf8class[str[0]]; 1.111 + if (!length) return UTF8PROC_ERROR_INVALIDUTF8; 1.112 + if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8; 1.113 + for (i=1; i<length; i++) { 1.114 + if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8; 1.115 + } 1.116 + switch (length) { 1.117 + case 1: 1.118 + uc = str[0]; 1.119 + break; 1.120 + case 2: 1.121 + uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F); 1.122 + if (uc < 0x80) uc = -1; 1.123 + break; 1.124 + case 3: 1.125 + uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6) 1.126 + + (str[2] & 0x3F); 1.127 + if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) || 1.128 + (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1; 1.129 + break; 1.130 + case 4: 1.131 + uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12) 1.132 + + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F); 1.133 + if (uc < 0x10000 || uc >= 0x110000) uc = -1; 1.134 + break; 1.135 + } 1.136 + if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) return UTF8PROC_ERROR_INVALIDUTF8; 1.137 + *dst = uc; 1.138 + return length; 1.139 +} 1.140 + 1.141 +ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) { 1.142 + if (uc < 0x00) { 1.143 + return 0; 1.144 + } else if (uc < 0x80) { 1.145 + dst[0] = uc; 1.146 + return 1; 1.147 + } else if (uc < 0x800) { 1.148 + dst[0] = 0xC0 + (uc >> 6); 1.149 + dst[1] = 0x80 + (uc & 0x3F); 1.150 + return 2; 1.151 + } else if (uc < 0x10000) { 1.152 + dst[0] = 0xE0 + (uc >> 12); 1.153 + dst[1] = 0x80 + ((uc >> 6) & 0x3F); 1.154 + dst[2] = 0x80 + (uc & 0x3F); 1.155 + return 3; 1.156 + } else if (uc < 0x110000) { 1.157 + dst[0] = 0xF0 + (uc >> 18); 1.158 + dst[1] = 0x80 + ((uc >> 12) & 0x3F); 1.159 + dst[2] = 0x80 + ((uc >> 6) & 0x3F); 1.160 + dst[3] = 0x80 + (uc & 0x3F); 1.161 + return 4; 1.162 + } else return 0; 1.163 +} 1.164 + 1.165 +const utf8proc_property_t *utf8proc_get_property(int32_t uc) { 1.166 + // ASSERT: uc >= 0 && uc < 0x110000 1.167 + return utf8proc_properties + ( 1.168 + utf8proc_stage2table[ 1.169 + utf8proc_stage1table[uc >> 8] + (uc & 0xFF) 1.170 + ] 1.171 + ); 1.172 +} 1.173 + 1.174 +ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, 1.175 + int options) { 1.176 + // ASSERT: uc >= 0 && uc < 0x110000 1.177 + const utf8proc_property_t *property; 1.178 + int32_t hangul_sindex; 1.179 + property = utf8proc_get_property(uc); 1.180 + hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; 1.181 + if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { 1.182 + int32_t hangul_tindex; 1.183 + if (bufsize >= 1) { 1.184 + dst[0] = UTF8PROC_HANGUL_LBASE + 1.185 + hangul_sindex / UTF8PROC_HANGUL_NCOUNT; 1.186 + if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + 1.187 + (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; 1.188 + } 1.189 + hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; 1.190 + if (!hangul_tindex) return 2; 1.191 + if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; 1.192 + return 3; 1.193 + } else if ((options & UTF8PROC_REJECTNA) && !property->category) { 1.194 + return UTF8PROC_ERROR_NOTASSIGNED; 1.195 + } else if ((options & UTF8PROC_IGNORE) && property->ignorable) { 1.196 + return 0; 1.197 + } else if ((options & UTF8PROC_CASEFOLD) && property->casefold_mapping) { 1.198 + const int32_t *casefold_entry; 1.199 + ssize_t written = 0; 1.200 + for (casefold_entry = property->casefold_mapping; 1.201 + *casefold_entry >= 0; casefold_entry++) { 1.202 + written += utf8proc_decompose_char(*casefold_entry, dst+written, 1.203 + (bufsize > written) ? (bufsize - written) : 0, options); 1.204 + if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 1.205 + } 1.206 + return written; 1.207 + } else if (property->decomp_mapping && 1.208 + (!property->decomp_type || (options & UTF8PROC_COMPAT))) { 1.209 + const int32_t *decomp_entry; 1.210 + ssize_t written = 0; 1.211 + for (decomp_entry = property->decomp_mapping; 1.212 + *decomp_entry >= 0; decomp_entry++) { 1.213 + written += utf8proc_decompose_char(*decomp_entry, dst+written, 1.214 + (bufsize > written) ? (bufsize - written) : 0, options); 1.215 + if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 1.216 + } 1.217 + return written; 1.218 + } else { 1.219 + if (bufsize >= 1) *dst = uc; 1.220 + return 1; 1.221 + } 1.222 +} 1.223 + 1.224 +ssize_t utf8proc_decompose(uint8_t *str, ssize_t strlen, 1.225 + int32_t *buffer, ssize_t bufsize, int options) { 1.226 + // strlen will be ignored, if UTF8PROC_NULLTERM is set in options 1.227 + ssize_t wpos = 0; 1.228 + { 1.229 + int32_t uc; 1.230 + ssize_t rpos = 0; 1.231 + ssize_t decomp_result; 1.232 + while (1) { 1.233 + if (options & UTF8PROC_NULLTERM) { 1.234 + rpos += utf8proc_iterate(str + rpos, -1, &uc); 1.235 + // checking of return value is not neccessary, 1.236 + // as 'uc' is < 0 in case of error 1.237 + if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 1.238 + if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; 1.239 + if (uc == 0) break; 1.240 + } else { 1.241 + if (rpos >= strlen) break; 1.242 + rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); 1.243 + if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 1.244 + } 1.245 + decomp_result = utf8proc_decompose_char( 1.246 + uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options 1.247 + ); 1.248 + if (decomp_result < 0) return decomp_result; 1.249 + wpos += decomp_result; 1.250 + // prohibiting integer overflows due to too long strings: 1.251 + if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2) 1.252 + return UTF8PROC_ERROR_OVERFLOW; 1.253 + } 1.254 + } 1.255 + if (bufsize >= wpos) { 1.256 + ssize_t pos = 0; 1.257 + while (pos < wpos-1) { 1.258 + int32_t uc1, uc2; 1.259 + const utf8proc_property_t *property1, *property2; 1.260 + uc1 = buffer[pos]; 1.261 + uc2 = buffer[pos+1]; 1.262 + property1 = utf8proc_get_property(uc1); 1.263 + property2 = utf8proc_get_property(uc2); 1.264 + if (property1->combining_class > property2->combining_class && 1.265 + property2->combining_class > 0) { 1.266 + buffer[pos] = uc2; 1.267 + buffer[pos+1] = uc1; 1.268 + if (pos > 0) pos--; else pos++; 1.269 + } else { 1.270 + pos++; 1.271 + } 1.272 + } 1.273 + } 1.274 + return wpos; 1.275 +} 1.276 + 1.277 +ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) { 1.278 + // UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored 1.279 + // ASSERT: 'buffer' has one spare byte of free space at the end! 1.280 + if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { 1.281 + ssize_t rpos; 1.282 + ssize_t wpos = 0; 1.283 + int32_t uc; 1.284 + for (rpos = 0; rpos < length; rpos++) { 1.285 + uc = buffer[rpos]; 1.286 + if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; 1.287 + if (uc == 0x000A || uc == 0x000D || uc == 0x0085 || 1.288 + ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) { 1.289 + if (options & UTF8PROC_NLF2LS) { 1.290 + if (options & UTF8PROC_NLF2PS) { 1.291 + buffer[wpos++] = 0x000A; 1.292 + } else { 1.293 + buffer[wpos++] = 0x2028; 1.294 + } 1.295 + } else { 1.296 + if (options & UTF8PROC_NLF2PS) { 1.297 + buffer[wpos++] = 0x2029; 1.298 + } else { 1.299 + buffer[wpos++] = 0x0020; 1.300 + } 1.301 + } 1.302 + } else if ((options & UTF8PROC_STRIPCC) && 1.303 + (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) { 1.304 + if (uc == 0x0009) buffer[wpos++] = 0x0020; 1.305 + } else { 1.306 + buffer[wpos++] = uc; 1.307 + } 1.308 + } 1.309 + length = wpos; 1.310 + } 1.311 + if (options & UTF8PROC_COMPOSE) { 1.312 + int32_t *starter = NULL; 1.313 + int32_t current_char; 1.314 + const utf8proc_property_t *starter_property = NULL, *current_property; 1.315 + int16_t max_combining_class = -1; 1.316 + ssize_t rpos; 1.317 + ssize_t wpos = 0; 1.318 + int32_t composition; 1.319 + for (rpos = 0; rpos < length; rpos++) { 1.320 + current_char = buffer[rpos]; 1.321 + current_property = utf8proc_get_property(current_char); 1.322 + if (starter && current_property->combining_class > max_combining_class) { 1.323 + // combination perhaps possible 1.324 + int32_t hangul_lindex; 1.325 + int32_t hangul_sindex; 1.326 + hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; 1.327 + if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { 1.328 + int32_t hangul_vindex; 1.329 + hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; 1.330 + if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { 1.331 + *starter = UTF8PROC_HANGUL_SBASE + 1.332 + (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) * 1.333 + UTF8PROC_HANGUL_TCOUNT; 1.334 + starter_property = NULL; 1.335 + continue; 1.336 + } 1.337 + } 1.338 + hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; 1.339 + if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && 1.340 + (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { 1.341 + int32_t hangul_tindex; 1.342 + hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; 1.343 + if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { 1.344 + *starter += hangul_tindex; 1.345 + starter_property = NULL; 1.346 + continue; 1.347 + } 1.348 + } 1.349 + if (!starter_property) { 1.350 + starter_property = utf8proc_get_property(*starter); 1.351 + } 1.352 + if (starter_property->comb1st_index >= 0 && 1.353 + current_property->comb2nd_index >= 0) { 1.354 + composition = utf8proc_combinations[ 1.355 + starter_property->comb1st_index + 1.356 + current_property->comb2nd_index 1.357 + ]; 1.358 + if (composition >= 0 && (!(options & UTF8PROC_STABLE) || 1.359 + !(utf8proc_get_property(composition)->comp_exclusion))) { 1.360 + *starter = composition; 1.361 + starter_property = NULL; 1.362 + continue; 1.363 + } 1.364 + } 1.365 + } 1.366 + buffer[wpos] = current_char; 1.367 + if (current_property->combining_class) { 1.368 + if (current_property->combining_class > max_combining_class) { 1.369 + max_combining_class = current_property->combining_class; 1.370 + } 1.371 + } else { 1.372 + starter = buffer + wpos; 1.373 + starter_property = NULL; 1.374 + max_combining_class = -1; 1.375 + } 1.376 + wpos++; 1.377 + } 1.378 + length = wpos; 1.379 + } 1.380 + { 1.381 + ssize_t rpos, wpos = 0; 1.382 + int32_t uc; 1.383 + for (rpos = 0; rpos < length; rpos++) { 1.384 + uc = buffer[rpos]; 1.385 + wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos); 1.386 + } 1.387 + ((uint8_t *)buffer)[wpos] = 0; 1.388 + return wpos; 1.389 + } 1.390 +} 1.391 + 1.392 +ssize_t utf8proc_map(uint8_t *str, ssize_t strlen, uint8_t **dstptr, 1.393 + int options) { 1.394 + int32_t *buffer; 1.395 + ssize_t result; 1.396 + *dstptr = NULL; 1.397 + result = utf8proc_decompose(str, strlen, NULL, 0, options); 1.398 + if (result < 0) return result; 1.399 + buffer = malloc(result * sizeof(int32_t) + 1); 1.400 + if (!buffer) return UTF8PROC_ERROR_NOMEM; 1.401 + result = utf8proc_decompose(str, strlen, buffer, result, options); 1.402 + if (result < 0) { 1.403 + free(buffer); 1.404 + return result; 1.405 + } 1.406 + result = utf8proc_reencode(buffer, result, options); 1.407 + if (result < 0) { 1.408 + free(buffer); 1.409 + return result; 1.410 + } 1.411 + { 1.412 + int32_t *newptr; 1.413 + newptr = realloc(buffer, result+1); 1.414 + if (newptr) buffer = newptr; 1.415 + } 1.416 + *dstptr = (uint8_t *)buffer; 1.417 + return result; 1.418 +} 1.419 + 1.420 +uint8_t *utf8proc_NFD(uint8_t *str) { 1.421 + uint8_t *retval; 1.422 + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE); 1.423 + return retval; 1.424 +} 1.425 + 1.426 +uint8_t *utf8proc_NFC(uint8_t *str) { 1.427 + uint8_t *retval; 1.428 + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 1.429 + UTF8PROC_COMPOSE); 1.430 + return retval; 1.431 +} 1.432 + 1.433 +uint8_t *utf8proc_NFKD(uint8_t *str) { 1.434 + uint8_t *retval; 1.435 + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 1.436 + UTF8PROC_COMPAT); 1.437 + return retval; 1.438 +} 1.439 + 1.440 +uint8_t *utf8proc_NFKC(uint8_t *str) { 1.441 + uint8_t *retval; 1.442 + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 1.443 + UTF8PROC_COMPOSE | UTF8PROC_COMPAT); 1.444 + return retval; 1.445 +} 1.446 + 1.447 +