utf8proc
diff utf8proc.c @ 7:fcfd8c836c64
Version 1.1.1
- Added a new PostgreSQL function 'unistrip', which behaves like 'unifold' but also removes all character marks (e.g. accents).
- Changed the license from BSD style to MIT style.
- Added a new function 'utf8proc_codepoint_valid' to the C library.
- Changed the compiler flags in the Makefile from -g -O0 to -O2.
- The Ruby script that was used to build the utf8proc_data.c file is now included in the distribution.
- Added a new PostgreSQL function 'unistrip', which behaves like 'unifold' but also removes all character marks (e.g. accents).
- Changed the license from BSD style to MIT style.
- Added a new function 'utf8proc_codepoint_valid' to the C library.
- Changed the compiler flags in the Makefile from -g -O0 to -O2.
- The Ruby script that was used to build the utf8proc_data.c file is now included in the distribution.
author | jbe |
---|---|
date | Sun Jul 22 12:00:00 2007 +0200 (2007-07-22) |
parents | 4ee0d5f54af1 |
children | 951e73a98021 |
line diff
1.1 --- a/utf8proc.c Fri Mar 16 12:00:00 2007 +0100 1.2 +++ b/utf8proc.c Sun Jul 22 12:00:00 2007 +0200 1.3 @@ -1,34 +1,26 @@ 1.4 /* 1.5 - * Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany 1.6 - * Author: Jan Behrens <jan.behrens@flexiguided.de> 1.7 - * All rights reserved. 1.8 + * Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin 1.9 * 1.10 - * Redistribution and use in source and binary forms, with or without 1.11 - * modification, are permitted provided that the following conditions are 1.12 - * met: 1.13 + * Permission is hereby granted, free of charge, to any person obtaining a 1.14 + * copy of this software and associated documentation files (the "Software"), 1.15 + * to deal in the Software without restriction, including without limitation 1.16 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 1.17 + * and/or sell copies of the Software, and to permit persons to whom the 1.18 + * Software is furnished to do so, subject to the following conditions: 1.19 * 1.20 - * 1. Redistributions of source code must retain the above copyright 1.21 - * notice, this list of conditions and the following disclaimer. 1.22 - * 2. Redistributions in binary form must reproduce the above copyright 1.23 - * notice, this list of conditions and the following disclaimer in the 1.24 - * documentation and/or other materials provided with the distribution. 1.25 - * 3. Neither the name of the FlexiGuided GmbH nor the names of its 1.26 - * contributors may be used to endorse or promote products derived from 1.27 - * this software without specific prior written permission. 1.28 + * The above copyright notice and this permission notice shall be included in 1.29 + * all copies or substantial portions of the Software. 
1.30 * 1.31 - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 1.32 - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 1.33 - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 1.34 - * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 1.35 - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 1.36 - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 1.37 - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 1.38 - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 1.39 - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 1.40 - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 1.41 - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1.42 - * 1.43 - * 1.44 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1.45 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1.46 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 1.47 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1.48 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 1.49 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 1.50 + * DEALINGS IN THE SOFTWARE. 1.51 + */ 1.52 + 1.53 +/* 1.54 * This library contains derived data from a modified version of the 1.55 * Unicode data files. 1.56 * 1.57 @@ -36,14 +28,13 @@ 1.58 * http://www.unicode.org/Public/UNIDATA/ 1.59 * 1.60 * Please notice the copyright statement in the file "utf8proc_data.c". 1.61 - * 1.62 */ 1.63 1.64 1.65 /* 1.66 * File name: utf8proc.c 1.67 - * Version: 1.0 1.68 - * Last changed: 2006-09-17 1.69 + * Version: 1.1.1 1.70 + * Last changed: 2007-07-22 1.71 * 1.72 * Description: 1.73 * Implementation of libutf8proc. 
1.74 @@ -123,7 +114,9 @@ 1.75 } 1.76 } 1.77 1.78 -ssize_t utf8proc_iterate(uint8_t *str, ssize_t strlen, int32_t *dst) { 1.79 +ssize_t utf8proc_iterate( 1.80 + const uint8_t *str, ssize_t strlen, int32_t *dst 1.81 +) { 1.82 int length; 1.83 int i; 1.84 int32_t uc = -1; 1.85 @@ -155,11 +148,19 @@ 1.86 if (uc < 0x10000 || uc >= 0x110000) uc = -1; 1.87 break; 1.88 } 1.89 - if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) return UTF8PROC_ERROR_INVALIDUTF8; 1.90 + if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) 1.91 + return UTF8PROC_ERROR_INVALIDUTF8; 1.92 *dst = uc; 1.93 return length; 1.94 } 1.95 1.96 +bool utf8proc_codepoint_valid(int32_t uc) { 1.97 + if (uc < 0 || uc >= 0x110000 || 1.98 + ((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) || 1.99 + (uc >= 0xFDD0 && uc < 0xFDF0)) return false; 1.100 + else return true; 1.101 +} 1.102 + 1.103 ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) { 1.104 if (uc < 0x00) { 1.105 return 0; 1.106 @@ -348,8 +349,10 @@ 1.107 return 1; 1.108 } 1.109 1.110 -ssize_t utf8proc_decompose(uint8_t *str, ssize_t strlen, 1.111 - int32_t *buffer, ssize_t bufsize, int options) { 1.112 +ssize_t utf8proc_decompose( 1.113 + const uint8_t *str, ssize_t strlen, 1.114 + int32_t *buffer, ssize_t bufsize, int options 1.115 +) { 1.116 // strlen will be ignored, if UTF8PROC_NULLTERM is set in options 1.117 ssize_t wpos = 0; 1.118 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) 1.119 @@ -523,8 +526,9 @@ 1.120 } 1.121 } 1.122 1.123 -ssize_t utf8proc_map(uint8_t *str, ssize_t strlen, uint8_t **dstptr, 1.124 - int options) { 1.125 +ssize_t utf8proc_map( 1.126 + const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options 1.127 +) { 1.128 int32_t *buffer; 1.129 ssize_t result; 1.130 *dstptr = NULL; 1.131 @@ -551,32 +555,31 @@ 1.132 return result; 1.133 } 1.134 1.135 -uint8_t *utf8proc_NFD(uint8_t *str) { 1.136 +uint8_t *utf8proc_NFD(const uint8_t *str) { 1.137 uint8_t *retval; 1.138 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | 
UTF8PROC_STABLE | 1.139 UTF8PROC_DECOMPOSE); 1.140 return retval; 1.141 } 1.142 1.143 -uint8_t *utf8proc_NFC(uint8_t *str) { 1.144 +uint8_t *utf8proc_NFC(const uint8_t *str) { 1.145 uint8_t *retval; 1.146 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 1.147 UTF8PROC_COMPOSE); 1.148 return retval; 1.149 } 1.150 1.151 -uint8_t *utf8proc_NFKD(uint8_t *str) { 1.152 +uint8_t *utf8proc_NFKD(const uint8_t *str) { 1.153 uint8_t *retval; 1.154 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 1.155 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); 1.156 return retval; 1.157 } 1.158 1.159 -uint8_t *utf8proc_NFKC(uint8_t *str) { 1.160 +uint8_t *utf8proc_NFKC(const uint8_t *str) { 1.161 uint8_t *retval; 1.162 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 1.163 UTF8PROC_COMPOSE | UTF8PROC_COMPAT); 1.164 return retval; 1.165 } 1.166 1.167 -