utf8proc
diff utf8proc.h @ 0:a0368662434c
Version 0.1
author | jbe |
---|---|
date | Fri Jun 02 12:00:00 2006 +0200 (2006-06-02) |
parents | |
children | 61a89ecc2fb9 |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/utf8proc.h Fri Jun 02 12:00:00 2006 +0200 1.3 @@ -0,0 +1,271 @@ 1.4 +/* 1.5 + * Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany 1.6 + * Author: Jan Behrens <jan.behrens@flexiguided.de> 1.7 + * All rights reserved. 1.8 + * 1.9 + * Redistribution and use in source and binary forms, with or without 1.10 + * modification, are permitted provided that the following conditions are 1.11 + * met: 1.12 + * 1.13 + * 1. Redistributions of source code must retain the above copyright 1.14 + * notice, this list of conditions and the following disclaimer. 1.15 + * 2. Redistributions in binary form must reproduce the above copyright 1.16 + * notice, this list of conditions and the following disclaimer in the 1.17 + * documentation and/or other materials provided with the distribution. 1.18 + * 3. Neither the name of the FlexiGuided GmbH nor the names of its 1.19 + * contributors may be used to endorse or promote products derived from 1.20 + * this software without specific prior written permission. 1.21 + * 1.22 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 1.23 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 1.24 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 1.25 + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 1.26 + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 1.27 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 1.28 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 1.29 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 1.30 + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 1.31 + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 1.32 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
1.33 + * 1.34 + * 1.35 + * This library contains derived data from a modified version of the 1.36 + * Unicode data files. 1.37 + * 1.38 + * The original data files are available at 1.39 + * http://www.unicode.org/Public/UNIDATA/ 1.40 + * 1.41 + * Please notice the copyright statement in the file "utf8proc_data.c". 1.42 + * 1.43 + */ 1.44 + 1.45 + 1.46 +/* 1.47 + * File name: utf8proc.h 1.48 + * Version: 0.1 1.49 + * Last changed: 2006-05-31 1.50 + * 1.51 + * Description: 1.52 + * Header files for libutf8proc, which is a mapping tool for UTF-8 strings 1.53 + * with following features: 1.54 + * - decomposing and composing of strings 1.55 + * - replacing compatibility characters with their equivalents 1.56 + * - stripping of "default ignorable characters" 1.57 + * like SOFT-HYPHEN or ZERO-WIDTH-SPACE 1.58 + * - optional rejection of strings containing non-assigned code points 1.59 + * - stripping of control characters 1.60 + * - transformation of LF, CRLF, CR and NEL to line-feed (LF) 1.61 + * or to the unicode characters for paragraph separation (PS) 1.62 + * or line separation (LS). 1.63 + * - unicode case folding (for case insensitive string comparisons) 1.64 + * - rejection of illegal UTF-8 data (i.e. UTF-8 encoded UTF-16 surrogates) 1.65 + * - support for korean hangul characters 1.66 + * Unicode Version 4.1.0 is supported. 
1.67 + */ 1.68 + 1.69 + 1.70 +#ifndef UTF8PROC_H 1.71 +#define UTF8PROC_H 1.72 + 1.73 + 1.74 +#include <stdlib.h> 1.75 +#include <stdbool.h> 1.76 +#include <sys/types.h> 1.77 +#include <inttypes.h> 1.78 +#include <limits.h> 1.79 + 1.80 +#ifndef SSIZE_MAX 1.81 +#define SSIZE_MAX (SIZE_MAX/2) 1.82 +#endif 1.83 + 1.84 +#define UTF8PROC_NULLTERM (1<<0) 1.85 +#define UTF8PROC_STABLE (1<<1) 1.86 +#define UTF8PROC_COMPAT (1<<2) 1.87 +#define UTF8PROC_COMPOSE (1<<3) 1.88 +#define UTF8PROC_IGNORE (1<<4) 1.89 +#define UTF8PROC_REJECTNA (1<<5) 1.90 +#define UTF8PROC_NLF2LS (1<<6) 1.91 +#define UTF8PROC_NLF2PS (1<<7) 1.92 +#define UTF8PROC_NLF2LF (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS) 1.93 +#define UTF8PROC_STRIPCC (1<<8) 1.94 +#define UTF8PROC_CASEFOLD (1<<9) 1.95 +/* 1.96 + * Flags being regarded by several functions in the library: 1.97 + * NULLTERM: The given UTF-8 input is NULL terminated. 1.98 + * STABLE: Unicode Versioning Stability has to be respected. 1.99 + * COMPAT: Compatibility decomposition (i.e. formatting information is lost) 1.100 + * COMPOSE: Return a result with composed characters, instead of decomposed. 1.101 + * IGNORE: Strip "default ignorable characters" 1.102 + * REJECTNA: Return an error, if the input contains unassigned code points. 1.103 + * NLF2LS: Indicating that NLF-sequences (LF, CRLF, CR, NEL) are 1.104 + * representing a line break, and should be converted to the 1.105 + * unicode character for line separation (LS). 1.106 + * NLF2PS: Indicating that NLF-sequences are representing a paragraph 1.107 + * break, and should be converted to the unicode character for 1.108 + * paragraph separation (PS). 1.109 + * NLF2LF: Indicating that the meaning of NLF-sequences is unknown. 1.110 + * STRIPCC: Strips and/or converts control characters. 1.111 + * NLF-sequences are transformed into space, except if one of the 1.112 + * NLF2LS/PS/LF options is given. 1.113 + * HorizontalTab (HT) and FormFeed (FF) are treated as a 1.114 + * NLF-sequence in this case. 
1.115 + * All other control characters are simply removed. 1.116 + * CASEFOLD: Performs unicode case folding, to be able to do a 1.117 + * case-insensitive string comparison. 1.118 + */ 1.119 + 1.120 +#define UTF8PROC_ERROR_NOMEM -1 1.121 +#define UTF8PROC_ERROR_OVERFLOW -2 1.122 +#define UTF8PROC_ERROR_INVALIDUTF8 -3 1.123 +#define UTF8PROC_ERROR_NOTASSIGNED -4 1.124 +/* 1.125 + * Error codes being returned by almost all functions: 1.126 + * ERROR_NOMEM: Memory could not be allocated. 1.127 + * ERROR_OVERFLOW: The given string is too long to be processed. 1.128 + * ERROR_INVALIDUTF8: The given string is not a legal UTF-8 string. 1.129 + * ERROR_NOTASSIGNED: The REJECTNA flag was set, 1.130 + * and an unassigned code point was found. 1.131 + */ 1.132 + 1.133 +typedef struct utf8proc_property_struct { 1.134 + const char *category; 1.135 + const int16_t combining_class; 1.136 + const char *bidi_class; 1.137 + const char *decomp_type; 1.138 + const int32_t *decomp_mapping; 1.139 + const unsigned bidi_mirrored:1; 1.140 + const int32_t uppercase_mapping; 1.141 + const int32_t lowercase_mapping; 1.142 + const int32_t titlecase_mapping; 1.143 + const int32_t comb1st_index; 1.144 + const int32_t comb2nd_index; 1.145 + const unsigned comp_exclusion:1; 1.146 + const unsigned ignorable:1; 1.147 + const int32_t *casefold_mapping; 1.148 +} utf8proc_property_t; 1.149 + 1.150 +extern const int8_t utf8proc_utf8class[256]; 1.151 + 1.152 +const char *utf8proc_errmsg(ssize_t errcode); 1.153 +/* 1.154 + * Returns a static error string for the given error code. 1.155 + */ 1.156 + 1.157 +ssize_t utf8proc_iterate(uint8_t *str, ssize_t strlen, int32_t *dst); 1.158 +/* 1.159 + * Reads a single char from the UTF-8 sequence being pointed to by 'str'. 1.160 + * The maximum number of bytes read is 'strlen', unless 'strlen' is 1.161 + * negative. 
1.162 + * If a valid unicode char could be read, it is stored in the variable 1.163 + * being pointed to by 'dst', otherwise that variable will be set to -1. 1.164 + * In case of success the number of bytes read is returned, otherwise a 1.165 + * negative error code is returned. 1.166 + */ 1.167 + 1.168 +ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst); 1.169 +/* 1.170 + * Encodes the unicode char with the code point 'uc' as an UTF-8 string in 1.171 + * the byte array being pointed to by 'dst'. This array has to be at least 1.172 + * 4 bytes long. 1.173 + * In case of success the number of bytes written is returned, otherwise 0. 1.174 + * This function does not check if 'uc' is a valid unicode code point. 1.175 + */ 1.176 + 1.177 +const utf8proc_property_t *utf8proc_get_property(int32_t uc); 1.178 +/* 1.179 + * Returns a pointer to a (constant) struct containing information about 1.180 + * the unicode char with the given code point 'uc'. 1.181 + * If the character is not existent a pointer to a special struct is 1.182 + * returned, where 'category' is a NULL pointer. 1.183 + * WARNING: The parameter 'uc' has to be in the range of 0x0000 to 1.184 + * 0x10FFFF, otherwise the program might crash! 1.185 + */ 1.186 + 1.187 +ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, 1.188 + int options); 1.189 +/* 1.190 + * Writes a decomposition of the unicode char 'uc' into the array being 1.191 + * pointed to by 'dst'. 1.192 + * Following flags in the 'options' field are regarded: 1.193 + * REJECTNA: an unassigned unicode code point leads to an error 1.194 + * IGNORE: "default ignorable" chars are stripped 1.195 + * CASEFOLD: unicode casefolding is applied 1.196 + * COMPAT: replace certain characters with their 1.197 + * compatibility decomposition 1.198 + * In case of success the number of chars written is returned, 1.199 + * in case of an error, a negative error code is returned. 
1.200 + * If the number of written chars would be bigger than 'bufsize', 1.201 + * the buffer (up to 'bufsize') has unpredictable data, and the needed 1.202 + * buffer size is returned. 1.203 + * WARNING: The parameter 'uc' has to be in the range of 0x0000 to 1.204 + * 0x10FFFF, otherwise the program might crash! 1.205 + */ 1.206 + 1.207 +ssize_t utf8proc_decompose(uint8_t *str, ssize_t strlen, 1.208 + int32_t *buffer, ssize_t bufsize, int options); 1.209 +/* 1.210 + * Does the same as 'utf8proc_decompose_char', but acts on a whole UTF-8 1.211 + * string, and orders the decomposed sequences correctly. 1.212 + * If the NULLTERM flag in 'options' is set, processing will be stopped, 1.213 + * when a NULL byte is encountered, otherwise 'strlen' bytes are processed. 1.214 + * The result in form of unicode code points is written into the buffer 1.215 + * being pointed to by 'buffer', having the length of 'bufsize' entries. 1.216 + * In case of success the number of chars written is returned, 1.217 + * in case of an error, a negative error code is returned. 1.218 + * If the number of written chars would be bigger than 'bufsize', 1.219 + * the buffer (up to 'bufsize') has unpredictable data, and the needed 1.220 + * buffer size is returned. 1.221 + */ 1.222 + 1.223 +ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options); 1.224 +/* 1.225 + * Reencodes the sequence of unicode characters given by the pointer 1.226 + * 'buffer' and 'length' as UTF-8. 1.227 + * The result is stored in the same memory area where the data is read. 
1.228 + * Following flags in the 'options' field are regarded: 1.229 + * NLF2LS: converts LF, CRLF, CR and NEL into LS 1.230 + * NLF2PS: converts LF, CRLF, CR and NEL into PS 1.231 + * NLF2LF: converts LF, CRLF, CR and NEL into LF 1.232 + * STRIPCC: strips or converts all non-affected control characters 1.233 + * COMPOSE: tries to combine decomposed characters into composite characters 1.234 + * STABLE: prohibits combining characters which would violate 1.235 + * the unicode versioning stability 1.236 + * In case of success the length of the resulting UTF-8 string is returned, 1.237 + * otherwise a negative error code is returned. 1.238 + * WARNING: The amount of free space being pointed to by 'buffer', has to 1.239 + * exceed the amount of the input data by one byte, and the 1.240 + * entries of the array pointed to by 'buffer' have to be in the 1.241 + * range of 0x0000 to 0x10FFFF, otherwise the program might crash! 1.242 + */ 1.243 + 1.244 +ssize_t utf8proc_map(uint8_t *str, ssize_t strlen, uint8_t **dstptr, 1.245 + int options); 1.246 +/* 1.247 + * Maps the given UTF-8 string being pointed to by 'str' to a new UTF-8 1.248 + * string, which is allocated dynamically, and afterwards pointed to by the 1.249 + * pointer being pointed to by 'dstptr'. 1.250 + * If the NULLTERM flag in the 'options' field is set, the length is 1.251 + * determined by a NULL terminator, otherwise the parameter 'strlen' is 1.252 + * evaluated to determine the string length, but in any case the result 1.253 + * will be NULL terminated (though it might contain NULL characters before). 1.254 + * Other flags in the 'options' field are passed to the functions defined 1.255 + * above, and regarded as described. 1.256 + * In case of success the length of the new string is returned, 1.257 + * otherwise a negative error code is returned. 1.258 + * NOTICE: The memory of the new UTF-8 string will have been allocated with 1.259 + * 'malloc', and has therefore to be freed with 'free'. 
1.260 + */ 1.261 + 1.262 +uint8_t *utf8proc_NFD(uint8_t *str); 1.263 +uint8_t *utf8proc_NFC(uint8_t *str); 1.264 +uint8_t *utf8proc_NFKD(uint8_t *str); 1.265 +uint8_t *utf8proc_NFKC(uint8_t *str); 1.266 +/* 1.267 + * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC 1.268 + * normalized version of the null-terminated string 'str'. 1.269 + */ 1.270 + 1.271 + 1.272 +#endif 1.273 + 1.274 +