jbe@0: /* jbe@0: * Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany jbe@0: * Author: Jan Behrens jbe@0: * All rights reserved. jbe@0: * jbe@0: * Redistribution and use in source and binary forms, with or without jbe@0: * modification, are permitted provided that the following conditions are jbe@0: * met: jbe@0: * jbe@0: * 1. Redistributions of source code must retain the above copyright jbe@0: * notice, this list of conditions and the following disclaimer. jbe@0: * 2. Redistributions in binary form must reproduce the above copyright jbe@0: * notice, this list of conditions and the following disclaimer in the jbe@0: * documentation and/or other materials provided with the distribution. jbe@0: * 3. Neither the name of the FlexiGuided GmbH nor the names of its jbe@0: * contributors may be used to endorse or promote products derived from jbe@0: * this software without specific prior written permission. jbe@0: * jbe@0: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS jbe@0: * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT jbe@0: * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A jbe@0: * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER jbe@0: * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, jbe@0: * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, jbe@0: * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR jbe@0: * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF jbe@0: * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING jbe@0: * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS jbe@0: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jbe@0: * jbe@0: * jbe@0: * This library contains derived data from a modified version of the jbe@0: * Unicode data files. jbe@0: * jbe@0: * The original data files are available at jbe@0: * http://www.unicode.org/Public/UNIDATA/ jbe@0: * jbe@0: * Please notice the copyright statement in the file "utf8proc_data.c". jbe@0: * jbe@0: */ jbe@0: jbe@0: jbe@0: /* jbe@0: * File name: utf8proc.c jbe@0: * Version: 0.1 jbe@0: * Last changed: 2006-05-31 jbe@0: * jbe@0: * Description: jbe@0: * Implementation of libutf8proc. jbe@0: */ jbe@0: jbe@0: jbe@0: #include "utf8proc.h" jbe@0: #include "utf8proc_data.c" jbe@0: jbe@0: jbe@0: const int8_t utf8proc_utf8class[256] = { jbe@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, jbe@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, jbe@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, jbe@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, jbe@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, jbe@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, jbe@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, jbe@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, jbe@0: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, jbe@0: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, jbe@0: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, jbe@0: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, jbe@0: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, jbe@0: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, jbe@0: 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, jbe@0: 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; jbe@0: jbe@0: #define UTF8PROC_HANGUL_SBASE 0xAC00 jbe@0: #define UTF8PROC_HANGUL_LBASE 0x1100 jbe@0: #define UTF8PROC_HANGUL_VBASE 0x1161 jbe@0: #define UTF8PROC_HANGUL_TBASE 0x11A7 jbe@0: #define UTF8PROC_HANGUL_LCOUNT 19 jbe@0: #define UTF8PROC_HANGUL_VCOUNT 21 jbe@0: #define UTF8PROC_HANGUL_TCOUNT 28 jbe@0: #define UTF8PROC_HANGUL_NCOUNT 588 jbe@0: #define UTF8PROC_HANGUL_SCOUNT 11172 jbe@0: jbe@0: jbe@0: const char *utf8proc_errmsg(ssize_t errcode) { jbe@0: switch (errcode) { jbe@0: case UTF8PROC_ERROR_NOMEM: jbe@0: return "Memory for processing UTF-8 data could not be allocated."; jbe@0: case UTF8PROC_ERROR_OVERFLOW: jbe@0: return "UTF-8 string is too long to be processed."; jbe@0: case UTF8PROC_ERROR_INVALIDUTF8: jbe@0: return "Invalid UTF-8 string"; jbe@0: case UTF8PROC_ERROR_NOTASSIGNED: jbe@0: return "Unassigned Unicode code point found in UTF-8 string."; jbe@0: default: jbe@0: return "An unknown error occured while processing UTF-8 data."; jbe@0: } jbe@0: } jbe@0: jbe@0: ssize_t utf8proc_iterate(uint8_t *str, ssize_t strlen, int32_t *dst) { jbe@0: int length; jbe@0: int i; jbe@0: int32_t uc = -1; jbe@0: *dst = -1; jbe@0: if (!strlen) return 0; jbe@0: length = utf8proc_utf8class[str[0]]; jbe@0: if (!length) return UTF8PROC_ERROR_INVALIDUTF8; jbe@0: if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8; jbe@0: for (i=1; i= 0xD800 && uc < 0xE000) || jbe@0: (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1; jbe@0: break; jbe@0: case 4: jbe@0: uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12) jbe@0: + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F); jbe@0: if (uc < 0x10000 || uc >= 0x110000) uc = -1; jbe@0: break; jbe@0: } jbe@0: if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) return UTF8PROC_ERROR_INVALIDUTF8; jbe@0: *dst = uc; jbe@0: return length; jbe@0: } jbe@0: jbe@0: ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) { jbe@0: if (uc < 0x00) { jbe@0: return 0; jbe@0: } else if (uc < 0x80) { jbe@0: dst[0] = uc; jbe@0: return 1; jbe@0: } else if (uc < 0x800) { jbe@0: dst[0] = 0xC0 + (uc >> 6); jbe@0: dst[1] = 0x80 + (uc & 0x3F); jbe@0: return 2; jbe@0: } else if (uc < 0x10000) { jbe@0: dst[0] = 0xE0 + (uc >> 12); jbe@0: dst[1] = 0x80 + ((uc >> 6) & 0x3F); jbe@0: dst[2] = 0x80 + (uc & 0x3F); jbe@0: return 3; jbe@0: } else if (uc < 0x110000) { jbe@0: dst[0] = 0xF0 + (uc >> 18); jbe@0: dst[1] = 0x80 + ((uc >> 12) & 0x3F); jbe@0: dst[2] = 0x80 + ((uc >> 6) & 0x3F); jbe@0: dst[3] = 0x80 + (uc & 0x3F); jbe@0: return 4; jbe@0: } else return 0; jbe@0: } jbe@0: jbe@0: const utf8proc_property_t *utf8proc_get_property(int32_t uc) { jbe@0: // ASSERT: uc >= 0 && uc < 0x110000 jbe@0: return utf8proc_properties + ( jbe@0: utf8proc_stage2table[ jbe@0: utf8proc_stage1table[uc >> 8] + (uc & 0xFF) jbe@0: ] jbe@0: ); jbe@0: } jbe@0: jbe@0: ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, jbe@0: int options) { jbe@0: // ASSERT: uc >= 0 && uc < 0x110000 jbe@0: const utf8proc_property_t *property; jbe@0: int32_t hangul_sindex; jbe@0: property = utf8proc_get_property(uc); jbe@0: hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; jbe@0: if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { jbe@0: int32_t hangul_tindex; jbe@0: if (bufsize >= 1) { jbe@0: dst[0] = UTF8PROC_HANGUL_LBASE + jbe@0: hangul_sindex / UTF8PROC_HANGUL_NCOUNT; jbe@0: if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + jbe@0: (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; jbe@0: } jbe@0: hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; jbe@0: if (!hangul_tindex) return 2; jbe@0: if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; jbe@0: return 3; jbe@0: } else if ((options & UTF8PROC_REJECTNA) && !property->category) { jbe@0: return UTF8PROC_ERROR_NOTASSIGNED; jbe@0: } else if ((options & UTF8PROC_IGNORE) && property->ignorable) { jbe@0: return 0; jbe@0: } else if ((options & UTF8PROC_CASEFOLD) && property->casefold_mapping) { jbe@0: const int32_t *casefold_entry; jbe@0: ssize_t written = 0; jbe@0: for (casefold_entry = property->casefold_mapping; jbe@0: *casefold_entry >= 0; casefold_entry++) { jbe@0: written += utf8proc_decompose_char(*casefold_entry, dst+written, jbe@0: (bufsize > written) ? (bufsize - written) : 0, options); jbe@0: if (written < 0) return UTF8PROC_ERROR_OVERFLOW; jbe@0: } jbe@0: return written; jbe@0: } else if (property->decomp_mapping && jbe@0: (!property->decomp_type || (options & UTF8PROC_COMPAT))) { jbe@0: const int32_t *decomp_entry; jbe@0: ssize_t written = 0; jbe@0: for (decomp_entry = property->decomp_mapping; jbe@0: *decomp_entry >= 0; decomp_entry++) { jbe@0: written += utf8proc_decompose_char(*decomp_entry, dst+written, jbe@0: (bufsize > written) ? (bufsize - written) : 0, options); jbe@0: if (written < 0) return UTF8PROC_ERROR_OVERFLOW; jbe@0: } jbe@0: return written; jbe@0: } else { jbe@0: if (bufsize >= 1) *dst = uc; jbe@0: return 1; jbe@0: } jbe@0: } jbe@0: jbe@0: ssize_t utf8proc_decompose(uint8_t *str, ssize_t strlen, jbe@0: int32_t *buffer, ssize_t bufsize, int options) { jbe@0: // strlen will be ignored, if UTF8PROC_NULLTERM is set in options jbe@0: ssize_t wpos = 0; jbe@0: { jbe@0: int32_t uc; jbe@0: ssize_t rpos = 0; jbe@0: ssize_t decomp_result; jbe@0: while (1) { jbe@0: if (options & UTF8PROC_NULLTERM) { jbe@0: rpos += utf8proc_iterate(str + rpos, -1, &uc); jbe@0: // checking of return value is not neccessary, jbe@0: // as 'uc' is < 0 in case of error jbe@0: if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; jbe@0: if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; jbe@0: if (uc == 0) break; jbe@0: } else { jbe@0: if (rpos >= strlen) break; jbe@0: rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); jbe@0: if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; jbe@0: } jbe@0: decomp_result = utf8proc_decompose_char( jbe@0: uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options jbe@0: ); jbe@0: if (decomp_result < 0) return decomp_result; jbe@0: wpos += decomp_result; jbe@0: // prohibiting integer overflows due to too long strings: jbe@0: if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2) jbe@0: return UTF8PROC_ERROR_OVERFLOW; jbe@0: } jbe@0: } jbe@0: if (bufsize >= wpos) { jbe@0: ssize_t pos = 0; jbe@0: while (pos < wpos-1) { jbe@0: int32_t uc1, uc2; jbe@0: const utf8proc_property_t *property1, *property2; jbe@0: uc1 = buffer[pos]; jbe@0: uc2 = buffer[pos+1]; jbe@0: property1 = utf8proc_get_property(uc1); jbe@0: property2 = utf8proc_get_property(uc2); jbe@0: if (property1->combining_class > property2->combining_class && jbe@0: property2->combining_class > 0) { jbe@0: buffer[pos] = uc2; jbe@0: buffer[pos+1] = uc1; jbe@0: if (pos > 0) pos--; else pos++; jbe@0: } else { jbe@0: pos++; jbe@0: } jbe@0: } jbe@0: } jbe@0: return wpos; jbe@0: } jbe@0: jbe@0: ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) { jbe@0: // UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored jbe@0: // ASSERT: 'buffer' has one spare byte of free space at the end! jbe@0: if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { jbe@0: ssize_t rpos; jbe@0: ssize_t wpos = 0; jbe@0: int32_t uc; jbe@0: for (rpos = 0; rpos < length; rpos++) { jbe@0: uc = buffer[rpos]; jbe@0: if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; jbe@0: if (uc == 0x000A || uc == 0x000D || uc == 0x0085 || jbe@0: ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) { jbe@0: if (options & UTF8PROC_NLF2LS) { jbe@0: if (options & UTF8PROC_NLF2PS) { jbe@0: buffer[wpos++] = 0x000A; jbe@0: } else { jbe@0: buffer[wpos++] = 0x2028; jbe@0: } jbe@0: } else { jbe@0: if (options & UTF8PROC_NLF2PS) { jbe@0: buffer[wpos++] = 0x2029; jbe@0: } else { jbe@0: buffer[wpos++] = 0x0020; jbe@0: } jbe@0: } jbe@0: } else if ((options & UTF8PROC_STRIPCC) && jbe@0: (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) { jbe@0: if (uc == 0x0009) buffer[wpos++] = 0x0020; jbe@0: } else { jbe@0: buffer[wpos++] = uc; jbe@0: } jbe@0: } jbe@0: length = wpos; jbe@0: } jbe@0: if (options & UTF8PROC_COMPOSE) { jbe@0: int32_t *starter = NULL; jbe@0: int32_t current_char; jbe@0: const utf8proc_property_t *starter_property = NULL, *current_property; jbe@0: int16_t max_combining_class = -1; jbe@0: ssize_t rpos; jbe@0: ssize_t wpos = 0; jbe@0: int32_t composition; jbe@0: for (rpos = 0; rpos < length; rpos++) { jbe@0: current_char = buffer[rpos]; jbe@0: current_property = utf8proc_get_property(current_char); jbe@0: if (starter && current_property->combining_class > max_combining_class) { jbe@0: // combination perhaps possible jbe@0: int32_t hangul_lindex; jbe@0: int32_t hangul_sindex; jbe@0: hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; jbe@0: if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { jbe@0: int32_t hangul_vindex; jbe@0: hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; jbe@0: if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { jbe@0: *starter = UTF8PROC_HANGUL_SBASE + jbe@0: (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) * jbe@0: UTF8PROC_HANGUL_TCOUNT; jbe@0: starter_property = NULL; jbe@0: continue; jbe@0: } jbe@0: } jbe@0: hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; jbe@0: if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && jbe@0: (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { jbe@0: int32_t hangul_tindex; jbe@0: hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; jbe@0: if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { jbe@0: *starter += hangul_tindex; jbe@0: starter_property = NULL; jbe@0: continue; jbe@0: } jbe@0: } jbe@0: if (!starter_property) { jbe@0: starter_property = utf8proc_get_property(*starter); jbe@0: } jbe@0: if (starter_property->comb1st_index >= 0 && jbe@0: current_property->comb2nd_index >= 0) { jbe@0: composition = utf8proc_combinations[ jbe@0: starter_property->comb1st_index + jbe@0: current_property->comb2nd_index jbe@0: ]; jbe@0: if (composition >= 0 && (!(options & UTF8PROC_STABLE) || jbe@0: !(utf8proc_get_property(composition)->comp_exclusion))) { jbe@0: *starter = composition; jbe@0: starter_property = NULL; jbe@0: continue; jbe@0: } jbe@0: } jbe@0: } jbe@0: buffer[wpos] = current_char; jbe@0: if (current_property->combining_class) { jbe@0: if (current_property->combining_class > max_combining_class) { jbe@0: max_combining_class = current_property->combining_class; jbe@0: } jbe@0: } else { jbe@0: starter = buffer + wpos; jbe@0: starter_property = NULL; jbe@0: max_combining_class = -1; jbe@0: } jbe@0: wpos++; jbe@0: } jbe@0: length = wpos; jbe@0: } jbe@0: { jbe@0: ssize_t rpos, wpos = 0; jbe@0: int32_t uc; jbe@0: for (rpos = 0; rpos < length; rpos++) { jbe@0: uc = buffer[rpos]; jbe@0: wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos); jbe@0: } jbe@0: ((uint8_t *)buffer)[wpos] = 0; jbe@0: return wpos; jbe@0: } jbe@0: } jbe@0: jbe@0: ssize_t utf8proc_map(uint8_t *str, ssize_t strlen, uint8_t **dstptr, jbe@0: int options) { jbe@0: int32_t *buffer; jbe@0: ssize_t result; jbe@0: *dstptr = NULL; jbe@0: result = utf8proc_decompose(str, strlen, NULL, 0, options); jbe@0: if (result < 0) return result; jbe@0: buffer = malloc(result * sizeof(int32_t) + 1); jbe@0: if (!buffer) return UTF8PROC_ERROR_NOMEM; jbe@0: result = utf8proc_decompose(str, strlen, buffer, result, options); jbe@0: if (result < 0) { jbe@0: free(buffer); jbe@0: return result; jbe@0: } jbe@0: result = utf8proc_reencode(buffer, result, options); jbe@0: if (result < 0) { jbe@0: free(buffer); jbe@0: return result; jbe@0: } jbe@0: { jbe@0: int32_t *newptr; jbe@0: newptr = realloc(buffer, result+1); jbe@0: if (newptr) buffer = newptr; jbe@0: } jbe@0: *dstptr = (uint8_t *)buffer; jbe@0: return result; jbe@0: } jbe@0: jbe@0: uint8_t *utf8proc_NFD(uint8_t *str) { jbe@0: uint8_t *retval; jbe@0: utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE); jbe@0: return retval; jbe@0: } jbe@0: jbe@0: uint8_t *utf8proc_NFC(uint8_t *str) { jbe@0: uint8_t *retval; jbe@0: utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | jbe@0: UTF8PROC_COMPOSE); jbe@0: return retval; jbe@0: } jbe@0: jbe@0: uint8_t *utf8proc_NFKD(uint8_t *str) { jbe@0: uint8_t *retval; jbe@0: utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | jbe@0: UTF8PROC_COMPAT); jbe@0: return retval; jbe@0: } jbe@0: jbe@0: uint8_t *utf8proc_NFKC(uint8_t *str) { jbe@0: uint8_t *retval; jbe@0: utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | jbe@0: UTF8PROC_COMPOSE | UTF8PROC_COMPAT); jbe@0: return retval; jbe@0: } jbe@0: jbe@0: