utf8proc
diff utf8proc.h @ 7:fcfd8c836c64
Version 1.1.1
- Added a new PostgreSQL function 'unistrip', which behaves like 'unifold', but also removes all character marks (e.g. accents).
- Changed license from BSD to MIT style.
- Added a new function 'utf8proc_codepoint_valid' to the C library.
- Changed compiler flags in Makefile from -g -O0 to -O2.
- The Ruby script that was used to build the utf8proc_data.c file is now included in the distribution.
- Added a new PostgreSQL function 'unistrip', which behaves like 'unifold', but also removes all character marks (e.g. accents).
- Changed license from BSD to MIT style.
- Added a new function 'utf8proc_codepoint_valid' to the C library.
- Changed compiler flags in Makefile from -g -O0 to -O2.
- The Ruby script that was used to build the utf8proc_data.c file is now included in the distribution.
author | jbe |
---|---|
date | Sun Jul 22 12:00:00 2007 +0200 (2007-07-22) |
parents | 4ee0d5f54af1 |
children | 951e73a98021 |
line diff
1.1 --- a/utf8proc.h Fri Mar 16 12:00:00 2007 +0100 1.2 +++ b/utf8proc.h Sun Jul 22 12:00:00 2007 +0200 1.3 @@ -1,49 +1,30 @@ 1.4 /* 1.5 - * Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany 1.6 - * Author: Jan Behrens <jan.behrens@flexiguided.de> 1.7 - * All rights reserved. 1.8 - * 1.9 - * Redistribution and use in source and binary forms, with or without 1.10 - * modification, are permitted provided that the following conditions are 1.11 - * met: 1.12 + * Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin 1.13 * 1.14 - * 1. Redistributions of source code must retain the above copyright 1.15 - * notice, this list of conditions and the following disclaimer. 1.16 - * 2. Redistributions in binary form must reproduce the above copyright 1.17 - * notice, this list of conditions and the following disclaimer in the 1.18 - * documentation and/or other materials provided with the distribution. 1.19 - * 3. Neither the name of the FlexiGuided GmbH nor the names of its 1.20 - * contributors may be used to endorse or promote products derived from 1.21 - * this software without specific prior written permission. 1.22 + * Permission is hereby granted, free of charge, to any person obtaining a 1.23 + * copy of this software and associated documentation files (the "Software"), 1.24 + * to deal in the Software without restriction, including without limitation 1.25 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 1.26 + * and/or sell copies of the Software, and to permit persons to whom the 1.27 + * Software is furnished to do so, subject to the following conditions: 1.28 * 1.29 - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 1.30 - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 1.31 - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 1.32 - * PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER 1.33 - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 1.34 - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 1.35 - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 1.36 - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 1.37 - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 1.38 - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 1.39 - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1.40 + * The above copyright notice and this permission notice shall be included in 1.41 + * all copies or substantial portions of the Software. 1.42 * 1.43 - * 1.44 - * This library contains derived data from a modified version of the 1.45 - * Unicode data files. 1.46 - * 1.47 - * The original data files are available at 1.48 - * http://www.unicode.org/Public/UNIDATA/ 1.49 - * 1.50 - * Please notice the copyright statement in the file "utf8proc_data.c". 1.51 - * 1.52 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1.53 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1.54 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 1.55 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1.56 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 1.57 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 1.58 + * DEALINGS IN THE SOFTWARE. 1.59 */ 1.60 - 1.61 + 1.62 1.63 /* 1.64 * File name: utf8proc.h 1.65 - * Version: 1.0 1.66 - * Last changed: 2006-09-17 1.67 + * Version: 1.1.1 1.68 + * Last changed: 2007-07-22 1.69 * 1.70 * Description: 1.71 * Header files for libutf8proc, which is a mapping tool for UTF-8 strings 1.72 @@ -62,7 +43,8 @@ 1.73 * or to the unicode chararacters for paragraph separation (PS) 1.74 * or line separation (LS). 
1.75 * - unicode case folding (for case insensitive string comparisons) 1.76 - * - rejection of illegal UTF-8 data (i.e. UTF-8 encoded UTF-16 surrogates) 1.77 + * - rejection of illegal UTF-8 data 1.78 + * (i.e. UTF-8 encoded UTF-16 surrogates) 1.79 * - support for korean hangul characters 1.80 * Unicode Version 5.0.0 is supported. 1.81 */ 1.82 @@ -106,7 +88,8 @@ 1.83 * COMPOSE: Return a result with composed characters. 1.84 * DECOMPOSE: Return a result with decomposed characters. 1.85 * IGNORE: Strip "default ignorable characters" 1.86 - * REJECTNA: Return an error, if the input contains unassigned code points. 1.87 + * REJECTNA: Return an error, if the input contains unassigned 1.88 + * code points. 1.89 * NLF2LS: Indicating that NLF-sequences (LF, CRLF, CR, NEL) are 1.90 * representing a line break, and should be converted to the 1.91 * unicode character for line separation (LS). 1.92 @@ -115,20 +98,20 @@ 1.93 * paragraph separation (PS). 1.94 * NLF2LF: Indicating that the meaning of NLF-sequences is unknown. 1.95 * STRIPCC: Strips and/or convers control characters. 1.96 - * NLF-sequences are transformed into space, except if one of the 1.97 - * NLF2LS/PS/LF options is given. 1.98 + * NLF-sequences are transformed into space, except if one of 1.99 + * the NLF2LS/PS/LF options is given. 1.100 * HorizontalTab (HT) and FormFeed (FF) are treated as a 1.101 * NLF-sequence in this case. 1.102 * All other control characters are simply removed. 1.103 * CASEFOLD: Performs unicode case folding, to be able to do a 1.104 * case-insensitive string comparison. 1.105 - * CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which is 1.106 - * representing a single grapheme cluster (a single character). 1.107 + * CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which 1.108 + * is representing a single grapheme cluster (see UAX#29). 1.109 * LUMP: Lumps certain characters together 1.110 * (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-"). 
1.111 * (See lump.txt for details.) 1.112 - * If NLF2LF is set, this includes a transformation of paragraph 1.113 - * and line separators to ASCII line-feed (LF). 1.114 + * If NLF2LF is set, this includes a transformation of 1.115 + * paragraph and line separators to ASCII line-feed (LF). 1.116 * STRIPMARK: Strips all character markings 1.117 * (non-spacing, spacing and enclosing) (i.e. accents) 1.118 * NOTE: this option works only with COMPOSE or DECOMPOSE 1.119 @@ -156,16 +139,16 @@ 1.120 utf8proc_propval_t bidi_class; 1.121 utf8proc_propval_t decomp_type; 1.122 const int32_t *decomp_mapping; 1.123 - const unsigned bidi_mirrored:1; 1.124 - const int32_t uppercase_mapping; 1.125 - const int32_t lowercase_mapping; 1.126 - const int32_t titlecase_mapping; 1.127 - const int32_t comb1st_index; 1.128 - const int32_t comb2nd_index; 1.129 - const unsigned comp_exclusion:1; 1.130 - const unsigned ignorable:1; 1.131 - const unsigned control_boundary:1; 1.132 - const unsigned extend:1; 1.133 + unsigned bidi_mirrored:1; 1.134 + int32_t uppercase_mapping; 1.135 + int32_t lowercase_mapping; 1.136 + int32_t titlecase_mapping; 1.137 + int32_t comb1st_index; 1.138 + int32_t comb2nd_index; 1.139 + unsigned comp_exclusion:1; 1.140 + unsigned ignorable:1; 1.141 + unsigned control_boundary:1; 1.142 + unsigned extend:1; 1.143 const int32_t *casefold_mapping; 1.144 } utf8proc_property_t; 1.145 1.146 @@ -242,7 +225,7 @@ 1.147 * Returns a static error string for the given error code. 1.148 */ 1.149 1.150 -ssize_t utf8proc_iterate(uint8_t *str, ssize_t strlen, int32_t *dst); 1.151 +ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst); 1.152 /* 1.153 * Reads a single char from the UTF-8 sequence being pointed to by 'str'. 1.154 * The maximum number of bytes read is 'strlen', unless 'strlen' is 1.155 @@ -253,12 +236,18 @@ 1.156 * negative error code is returned. 
1.157 */ 1.158 1.159 +bool utf8proc_codepoint_valid(int32_t uc); 1.160 +/* 1.161 + * Returns 1, if the given unicode code-point is valid, otherwise 0. 1.162 + */ 1.163 + 1.164 ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst); 1.165 /* 1.166 * Encodes the unicode char with the code point 'uc' as an UTF-8 string in 1.167 * the byte array being pointed to by 'dst'. This array has to be at least 1.168 * 4 bytes long. 1.169 - * In case of success the number of bytes written is returned, otherwise 0. 1.170 + * In case of success the number of bytes written is returned, 1.171 + * otherwise 0. 1.172 * This function does not check if 'uc' is a valid unicode code point. 1.173 */ 1.174 1.175 @@ -272,8 +261,10 @@ 1.176 * 0x10FFFF, otherwise the program might crash! 1.177 */ 1.178 1.179 -ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, 1.180 - int options, int *last_boundclass); 1.181 +ssize_t utf8proc_decompose_char( 1.182 + int32_t uc, int32_t *dst, ssize_t bufsize, 1.183 + int options, int *last_boundclass 1.184 +); 1.185 /* 1.186 * Writes a decomposition of the unicode char 'uc' into the array being 1.187 * pointed to by 'dst'. 1.188 @@ -286,9 +277,9 @@ 1.189 * CHARBOUND: Inserts 0xFF bytes before each grapheme cluster 1.190 * LUMP: lumps certain different characters together 1.191 * STRIPMARK: removes all character marks 1.192 - * The pointer 'last_boundclass' has to point to an integer variable which is 1.193 - * storing the last character boundary class, if the CHARBOUND option is 1.194 - * used. 1.195 + * The pointer 'last_boundclass' has to point to an integer variable which 1.196 + * is storing the last character boundary class, if the CHARBOUND option 1.197 + * is used. 1.198 * In case of success the number of chars written is returned, 1.199 * in case of an error, a negative error code is returned. 
1.200 * If the number of written chars would be bigger than 'bufsize', 1.201 @@ -298,8 +289,10 @@ 1.202 * 0x10FFFF, otherwise the program might crash! 1.203 */ 1.204 1.205 -ssize_t utf8proc_decompose(uint8_t *str, ssize_t strlen, 1.206 - int32_t *buffer, ssize_t bufsize, int options); 1.207 +ssize_t utf8proc_decompose( 1.208 + const uint8_t *str, ssize_t strlen, 1.209 + int32_t *buffer, ssize_t bufsize, int options 1.210 +); 1.211 /* 1.212 * Does the same as 'utf8proc_decompose_char', but acts on a whole UTF-8 1.213 * string, and orders the decomposed sequences correctly. 1.214 @@ -324,39 +317,42 @@ 1.215 * NLF2PS: converts LF, CRLF, CR and NEL into PS 1.216 * NLF2LF: converts LF, CRLF, CR and NEL into LF 1.217 * STRIPCC: strips or converts all non-affected control characters 1.218 - * COMPOSE: tries to combine decomposed characters into composite characters 1.219 + * COMPOSE: tries to combine decomposed characters into composite 1.220 + * characters 1.221 * STABLE: prohibits combining characters which would violate 1.222 * the unicode versioning stability 1.223 - * In case of success the length of the resulting UTF-8 string is returned, 1.224 - * otherwise a negative error code is returned. 1.225 + * In case of success the length of the resulting UTF-8 string is 1.226 + * returned, otherwise a negative error code is returned. 1.227 * WARNING: The amount of free space being pointed to by 'buffer', has to 1.228 * exceed the amount of the input data by one byte, and the 1.229 * entries of the array pointed to by 'str' have to be in the 1.230 - * range of 0x0000 to 0x10FFFF, otherwise the program might crash! 1.231 + * range of 0x0000 to 0x10FFFF, otherwise the program might 1.232 + * crash! 
1.233 */ 1.234 1.235 -ssize_t utf8proc_map(uint8_t *str, ssize_t strlen, uint8_t **dstptr, 1.236 - int options); 1.237 +ssize_t utf8proc_map( 1.238 + const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options 1.239 +); 1.240 /* 1.241 * Maps the given UTF-8 string being pointed to by 'str' to a new UTF-8 1.242 - * string, which is allocated dynamically, and afterwards pointed to by the 1.243 - * pointer being pointed to by 'dstptr'. 1.244 + * string, which is allocated dynamically, and afterwards pointed to by 1.245 + * the pointer being pointed to by 'dstptr'. 1.246 * If the NULLTERM flag in the 'options' field is set, the length is 1.247 * determined by a NULL terminator, otherwise the parameter 'strlen' is 1.248 * evaluated to determine the string length, but in any case the result 1.249 - * will be NULL terminated (though it might contain NULL characters before). 1.250 - * Other flags in the 'options' field are passed to the functions defined 1.251 - * above, and regarded as described. 1.252 + * will be NULL terminated (though it might contain NULL characters 1.253 + * before). Other flags in the 'options' field are passed to the functions 1.254 + * defined above, and regarded as described. 1.255 * In case of success the length of the new string is returned, 1.256 * otherwise a negative error code is returned. 1.257 * NOTICE: The memory of the new UTF-8 string will have been allocated with 1.258 * 'malloc', and has theirfore to be freed with 'free'. 
1.259 */ 1.260 1.261 -uint8_t *utf8proc_NFD(uint8_t *str); 1.262 -uint8_t *utf8proc_NFC(uint8_t *str); 1.263 -uint8_t *utf8proc_NFKD(uint8_t *str); 1.264 -uint8_t *utf8proc_NFKC(uint8_t *str); 1.265 +uint8_t *utf8proc_NFD(const uint8_t *str); 1.266 +uint8_t *utf8proc_NFC(const uint8_t *str); 1.267 +uint8_t *utf8proc_NFKD(const uint8_t *str); 1.268 +uint8_t *utf8proc_NFKC(const uint8_t *str); 1.269 /* 1.270 * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC 1.271 * normalized version of the null-terminated string 'str'. 1.272 @@ -365,4 +361,3 @@ 1.273 1.274 #endif 1.275 1.276 -