utf8proc

diff utf8proc.h @ 0:a0368662434c
Version 0.1
author: jbe
date: Fri Jun 02 12:00:00 2006 +0200 (2006-06-02)
children: 61a89ecc2fb9
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/utf8proc.h	Fri Jun 02 12:00:00 2006 +0200
     1.3 @@ -0,0 +1,271 @@
     1.4 +/*
     1.5 + *  Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany
     1.6 + *  Author: Jan Behrens <jan.behrens@flexiguided.de>
     1.7 + *  All rights reserved.
     1.8 + *
     1.9 + *  Redistribution and use in source and binary forms, with or without
    1.10 + *  modification, are permitted provided that the following conditions are
    1.11 + *  met:
    1.12 + *
    1.13 + *  1. Redistributions of source code must retain the above copyright
    1.14 + *     notice, this list of conditions and the following disclaimer.
    1.15 + *  2. Redistributions in binary form must reproduce the above copyright
    1.16 + *     notice, this list of conditions and the following disclaimer in the
    1.17 + *     documentation and/or other materials provided with the distribution.
    1.18 + *  3. Neither the name of the FlexiGuided GmbH nor the names of its
    1.19 + *     contributors may be used to endorse or promote products derived from
    1.20 + *     this software without specific prior written permission.
    1.21 + *
    1.22 + *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    1.23 + *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    1.24 + *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
    1.25 + *  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
    1.26 + *  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
    1.27 + *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    1.28 + *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    1.29 + *  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
    1.30 + *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
    1.31 + *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    1.32 + *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    1.33 + *
    1.34 + *
    1.35 + *  This library contains derived data from a modified version of the
    1.36 + *  Unicode data files.
    1.37 + *
    1.38 + *  The original data files are available at
    1.39 + *  http://www.unicode.org/Public/UNIDATA/
    1.40 + *
    1.41 + *  Please notice the copyright statement in the file "utf8proc_data.c".
    1.42 + *
    1.43 + */
    1.44 + 
    1.45 +
    1.46 +/*
    1.47 + *  File name:    utf8proc.h
    1.48 + *  Version:      0.1
    1.49 + *  Last changed: 2006-05-31
    1.50 + *
    1.51 + *  Description:
    1.52 + *  Header file for libutf8proc, which is a mapping tool for UTF-8 strings
    1.53 + *  with the following features:
    1.54 + *  - decomposing and composing of strings
    1.55 + *  - replacing compatibility characters with their equivalents
    1.56 + *  - stripping of "default ignorable characters"
    1.57 + *    like SOFT-HYPHEN or ZERO-WIDTH-SPACE
    1.58 + *  - optional rejection of strings containing non-assigned code points
    1.59 + *  - stripping of control characters
    1.60 + *  - transformation of LF, CRLF, CR and NEL to line-feed (LF)
    1.61 + *    or to the unicode characters for paragraph separation (PS)
    1.62 + *    or line separation (LS).
    1.63 + *  - unicode case folding (for case insensitive string comparisons)
    1.64 + *  - rejection of illegal UTF-8 data (i.e. UTF-8 encoded UTF-16 surrogates)
    1.65 + *  - support for korean hangul characters
    1.66 + *  Unicode Version 4.1.0 is supported.
    1.67 + */
    1.68 +
    1.69 +
    1.70 +#ifndef UTF8PROC_H
    1.71 +#define UTF8PROC_H
    1.72 +
    1.73 +
    1.74 +#include <stdlib.h>
    1.75 +#include <stdbool.h>
    1.76 +#include <sys/types.h>
    1.77 +#include <inttypes.h>
    1.78 +#include <limits.h>
    1.79 +
    1.80 +#ifndef SSIZE_MAX
    1.81 +#define SSIZE_MAX (SIZE_MAX/2)
    1.82 +#endif
    1.83 +
    1.84 +#define UTF8PROC_NULLTERM (1<<0)
    1.85 +#define UTF8PROC_STABLE   (1<<1)
    1.86 +#define UTF8PROC_COMPAT   (1<<2)
    1.87 +#define UTF8PROC_COMPOSE  (1<<3)
    1.88 +#define UTF8PROC_IGNORE   (1<<4)
    1.89 +#define UTF8PROC_REJECTNA (1<<5)
    1.90 +#define UTF8PROC_NLF2LS   (1<<6)
    1.91 +#define UTF8PROC_NLF2PS   (1<<7)
    1.92 +#define UTF8PROC_NLF2LF   (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS)
    1.93 +#define UTF8PROC_STRIPCC  (1<<8)
    1.94 +#define UTF8PROC_CASEFOLD (1<<9)
    1.95 +/*
    1.96 + *  Flags being regarded by several functions in the library:
    1.97 + *  NULLTERM: The given UTF-8 input is NULL terminated.
    1.98 + *  STABLE:   Unicode Versioning Stability has to be respected.
    1.99 + *  COMPAT:   Compatibility decomposition (i.e. formatting information is lost)
   1.100 + *  COMPOSE:  Return a result with composed characters, instead of decomposed.
   1.101 + *  IGNORE:   Strip "default ignorable characters"
   1.102 + *  REJECTNA: Return an error, if the input contains unassigned code points.
   1.103 + *  NLF2LS:   Indicating that NLF-sequences (LF, CRLF, CR, NEL) are
   1.104 + *            representing a line break, and should be converted to the
   1.105 + *            unicode character for line separation (LS).
   1.106 + *  NLF2PS:   Indicating that NLF-sequences are representing a paragraph
   1.107 + *            break, and should be converted to the unicode character for
   1.108 + *            paragraph separation (PS).
   1.109 + *  NLF2LF:   Indicating that the meaning of NLF-sequences is unknown.
   1.110 + *  STRIPCC:  Strips and/or converts control characters.
   1.111 + *            NLF-sequences are transformed into space, except if one of the
   1.112 + *            NLF2LS/PS/LF options is given.
   1.113 + *            HorizontalTab (HT) and FormFeed (FF) are treated as a
   1.114 + *            NLF-sequence in this case.
   1.115 + *            All other control characters are simply removed.
   1.116 + *  CASEFOLD: Performs unicode case folding, to be able to do a
   1.117 + *            case-insensitive string comparison.
   1.118 + */
   1.119 +
   1.120 +#define UTF8PROC_ERROR_NOMEM -1
   1.121 +#define UTF8PROC_ERROR_OVERFLOW -2
   1.122 +#define UTF8PROC_ERROR_INVALIDUTF8 -3
   1.123 +#define UTF8PROC_ERROR_NOTASSIGNED -4
   1.124 +/*
   1.125 + *  Error codes being returned by almost all functions:
   1.126 + *  ERROR_NOMEM:       Memory could not be allocated.
   1.127 + *  ERROR_OVERFLOW:    The given string is too long to be processed.
   1.128 + *  ERROR_INVALIDUTF8: The given string is not a legal UTF-8 string.
   1.129 + *  ERROR_NOTASSIGNED: The REJECTNA flag was set,
   1.130 + *                     and an unassigned code point was found.
   1.131 + */
   1.132 +
   1.133 +typedef struct utf8proc_property_struct { /* per-code-point property record, as returned by utf8proc_get_property() */
   1.134 +  const char *category; /* NULL in the special struct returned for non-existent characters */
   1.135 +  const int16_t combining_class;
   1.136 +  const char *bidi_class;
   1.137 +  const char *decomp_type;
   1.138 +  const int32_t *decomp_mapping; /* presumably a sequence of code points; termination/length convention not shown here -- confirm in utf8proc_data.c */
   1.139 +  const unsigned bidi_mirrored:1; /* single-bit flag */
   1.140 +  const int32_t uppercase_mapping;
   1.141 +  const int32_t lowercase_mapping;
   1.142 +  const int32_t titlecase_mapping;
   1.143 +  const int32_t comb1st_index; /* NOTE(review): looks like an index used for composition lookups -- confirm against utf8proc.c */
   1.144 +  const int32_t comb2nd_index; /* NOTE(review): see comb1st_index; exact meaning not visible in this header */
   1.145 +  const unsigned comp_exclusion:1; /* single-bit flag */
   1.146 +  const unsigned ignorable:1; /* single-bit flag; presumably marks "default ignorable" characters (IGNORE option) -- confirm */
   1.147 +  const int32_t *casefold_mapping; /* presumably a sequence of code points used for CASEFOLD -- confirm layout in utf8proc_data.c */
   1.148 +} utf8proc_property_t; /* all members are const: instances are read-only once initialized */
   1.149 +
   1.150 +extern const int8_t utf8proc_utf8class[256];
   1.151 +
   1.152 +const char *utf8proc_errmsg(ssize_t errcode);
   1.153 +/*
   1.154 + *  Returns a static error string for the given error code.
   1.155 + */
   1.156 +
   1.157 +ssize_t utf8proc_iterate(uint8_t *str, ssize_t strlen, int32_t *dst);
   1.158 +/*
   1.159 + *  Reads a single char from the UTF-8 sequence being pointed to by 'str'.
   1.160 + *  The maximum number of bytes read is 'strlen', unless 'strlen' is
   1.161 + *  negative.
   1.162 + *  If a valid unicode char could be read, it is stored in the variable
   1.163 + *  being pointed to by 'dst', otherwise that variable will be set to -1.
   1.164 + *  In case of success the number of bytes read is returned, otherwise a
   1.165 + *  negative error code is returned.
   1.166 + */
   1.167 +
   1.168 +ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
   1.169 +/*
   1.170 + *  Encodes the unicode char with the code point 'uc' as an UTF-8 string in
   1.171 + *  the byte array being pointed to by 'dst'. This array has to be at least
   1.172 + *  4 bytes long.
   1.173 + *  In case of success the number of bytes written is returned, otherwise 0.
   1.174 + *  This function does not check if 'uc' is a valid unicode code point.
   1.175 + */
   1.176 +
   1.177 +const utf8proc_property_t *utf8proc_get_property(int32_t uc);
   1.178 +/*
   1.179 + *  Returns a pointer to a (constant) struct containing information about
   1.180 + *  the unicode char with the given code point 'uc'.
   1.181 + *  If the character is not existent a pointer to a special struct is
   1.182 + *  returned, where 'category' is a NULL pointer.
   1.183 + *  WARNING: The parameter 'uc' has to be in the range of 0x0000 to
   1.184 + *           0x10FFFF, otherwise the program might crash!
   1.185 + */
   1.186 +
   1.187 +ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
   1.188 +  int options);
   1.189 +/*
   1.190 + *  Writes a decomposition of the unicode char 'uc' into the array being
   1.191 + *  pointed to by 'dst'.
   1.192 + *  Following flags in the 'options' field are regarded:
   1.193 + *  REJECTNA: an unassigned unicode code point leads to an error
   1.194 + *  IGNORE:   "default ignorable" chars are stripped
   1.195 + *  CASEFOLD: unicode casefolding is applied
   1.196 + *  COMPAT:   replace certain characters with their
   1.197 + *            compatibility decomposition
   1.198 + *  In case of success the number of chars written is returned,
   1.199 + *  in case of an error, a negative error code is returned.
   1.200 + *  If the number of written chars would be bigger than 'bufsize',
   1.201 + *  the buffer (up to 'bufsize') has unpredictable data, and the needed
   1.202 + *  buffer size is returned.
   1.203 + *  WARNING: The parameter 'uc' has to be in the range of 0x0000 to
   1.204 + *           0x10FFFF, otherwise the program might crash!
   1.205 + */
   1.206 +
   1.207 +ssize_t utf8proc_decompose(uint8_t *str, ssize_t strlen,
   1.208 +  int32_t *buffer, ssize_t bufsize, int options);
   1.209 +/*
   1.210 + *  Does the same as 'utf8proc_decompose_char', but acts on a whole UTF-8
   1.211 + *  string, and orders the decomposed sequences correctly.
   1.212 + *  If the NULLTERM flag in 'options' is set, processing will be stopped,
   1.213 + *  when a NULL byte is encountered, otherwise 'strlen' bytes are processed.
   1.214 + *  The result in form of unicode code points is written into the buffer
   1.215 + *  being pointed to by 'buffer', having the length of 'bufsize' entries.
   1.216 + *  In case of success the number of chars written is returned,
   1.217 + *  in case of an error, a negative error code is returned.
   1.218 + *  If the number of written chars would be bigger than 'bufsize',
   1.219 + *  the buffer (up to 'bufsize') has unpredictable data, and the needed
   1.220 + *  buffer size is returned.
   1.221 + */
   1.222 +
   1.223 +ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options);
   1.224 +/*
   1.225 + *  Reencodes the sequence of unicode characters given by the pointer
   1.226 + *  'buffer' and 'length' as UTF-8.
   1.227 + *  The result is stored in the same memory area where the data is read.
   1.228 + *  Following flags in the 'options' field are regarded:
   1.229 + *  NLF2LS:  converts LF, CRLF, CR and NEL into LS
   1.230 + *  NLF2PS:  converts LF, CRLF, CR and NEL into PS
   1.231 + *  NLF2LF:  converts LF, CRLF, CR and NEL into LF
   1.232 + *  STRIPCC: strips or converts all non-affected control characters
   1.233 + *  COMPOSE: tries to combine decomposed characters into composite characters
   1.234 + *  STABLE:  prohibits combining characters which would violate
   1.235 + *           the unicode versioning stability
   1.236 + *  In case of success the length of the resulting UTF-8 string is returned,
   1.237 + *  otherwise a negative error code is returned.
   1.238 + *  WARNING: The amount of free space being pointed to by 'buffer', has to
   1.239 + *           exceed the amount of the input data by one byte, and the
   1.240 + *           entries of the array pointed to by 'buffer' have to be in the
   1.241 + *           range of 0x0000 to 0x10FFFF, otherwise the program might crash!
   1.242 + */
   1.243 +
   1.244 +ssize_t utf8proc_map(uint8_t *str, ssize_t strlen, uint8_t **dstptr,
   1.245 +  int options);
   1.246 +/*
   1.247 + *  Maps the given UTF-8 string being pointed to by 'str' to a new UTF-8
   1.248 + *  string, which is allocated dynamically, and afterwards pointed to by the
   1.249 + *  pointer being pointed to by 'dstptr'.
   1.250 + *  If the NULLTERM flag in the 'options' field is set, the length is
   1.251 + *  determined by a NULL terminator, otherwise the parameter 'strlen' is
   1.252 + *  evaluated to determine the string length, but in any case the result
   1.253 + *  will be NULL terminated (though it might contain NULL characters before).
   1.254 + *  Other flags in the 'options' field are passed to the functions defined
   1.255 + *  above, and regarded as described.
   1.256 + *  In case of success the length of the new string is returned,
   1.257 + *  otherwise a negative error code is returned.
   1.258 + *  NOTICE: The memory of the new UTF-8 string will have been allocated with
   1.259 + *          'malloc', and has therefore to be freed with 'free'.
   1.260 + */
   1.261 +
   1.262 +uint8_t *utf8proc_NFD(uint8_t *str);
   1.263 +uint8_t *utf8proc_NFC(uint8_t *str);
   1.264 +uint8_t *utf8proc_NFKD(uint8_t *str);
   1.265 +uint8_t *utf8proc_NFKC(uint8_t *str);
   1.266 +/*
   1.267 + *  Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
   1.268 + *  normalized version of the null-terminated string 'str'.
   1.269 + */
   1.270 +
   1.271 +
   1.272 +#endif
   1.273 +
   1.274 +
author	jbe
date	Fri Jun 02 12:00:00 2006 +0200 (2006-06-02)
parents
children	61a89ecc2fb9