utf8proc

diff utf8proc.h @ 7:fcfd8c836c64
Version 1.1.1

- Added a new PostgreSQL function 'unistrip', which behaves like 'unifold', but also removes all character marks (e.g. accents).
- Changed license from BSD to MIT style.
- Added a new function 'utf8proc_codepoint_valid' to the C library.
- Changed compiler flags in Makefile from -g -O0 to -O2.
- The Ruby script, which was used to build the utf8proc_data.c file, is now included in the distribution.
author: jbe
date: Sun Jul 22 12:00:00 2007 +0200 (2007-07-22)
parents: 4ee0d5f54af1
children: 951e73a98021
     1.1 --- a/utf8proc.h	Fri Mar 16 12:00:00 2007 +0100
     1.2 +++ b/utf8proc.h	Sun Jul 22 12:00:00 2007 +0200
     1.3 @@ -1,49 +1,30 @@
     1.4  /*
     1.5 - *  Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany
     1.6 - *  Author: Jan Behrens <jan.behrens@flexiguided.de>
     1.7 - *  All rights reserved.
     1.8 - *
     1.9 - *  Redistribution and use in source and binary forms, with or without
    1.10 - *  modification, are permitted provided that the following conditions are
    1.11 - *  met:
    1.12 + *  Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin
    1.13   *
    1.14 - *  1. Redistributions of source code must retain the above copyright
    1.15 - *     notice, this list of conditions and the following disclaimer.
    1.16 - *  2. Redistributions in binary form must reproduce the above copyright
    1.17 - *     notice, this list of conditions and the following disclaimer in the
    1.18 - *     documentation and/or other materials provided with the distribution.
    1.19 - *  3. Neither the name of the FlexiGuided GmbH nor the names of its
    1.20 - *     contributors may be used to endorse or promote products derived from
    1.21 - *     this software without specific prior written permission.
    1.22 + *  Permission is hereby granted, free of charge, to any person obtaining a
    1.23 + *  copy of this software and associated documentation files (the "Software"),
    1.24 + *  to deal in the Software without restriction, including without limitation
    1.25 + *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
    1.26 + *  and/or sell copies of the Software, and to permit persons to whom the
    1.27 + *  Software is furnished to do so, subject to the following conditions:
    1.28   *
    1.29 - *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    1.30 - *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    1.31 - *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
    1.32 - *  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
    1.33 - *  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
    1.34 - *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    1.35 - *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    1.36 - *  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
    1.37 - *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
    1.38 - *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    1.39 - *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    1.40 + *  The above copyright notice and this permission notice shall be included in
    1.41 + *  all copies or substantial portions of the Software.
    1.42   *
    1.43 - *
    1.44 - *  This library contains derived data from a modified version of the
    1.45 - *  Unicode data files.
    1.46 - *
    1.47 - *  The original data files are available at
    1.48 - *  http://www.unicode.org/Public/UNIDATA/
    1.49 - *
    1.50 - *  Please notice the copyright statement in the file "utf8proc_data.c".
    1.51 - *
    1.52 + *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    1.53 + *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    1.54 + *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    1.55 + *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    1.56 + *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    1.57 + *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    1.58 + *  DEALINGS IN THE SOFTWARE.
    1.59   */
    1.60 - 
    1.61 +
    1.62  
    1.63  /*
    1.64   *  File name:    utf8proc.h
    1.65 - *  Version:      1.0
    1.66 - *  Last changed: 2006-09-17
    1.67 + *  Version:      1.1.1
    1.68 + *  Last changed: 2007-07-22
    1.69   *
    1.70   *  Description:
    1.71   *  Header files for libutf8proc, which is a mapping tool for UTF-8 strings
    1.72 @@ -62,7 +43,8 @@
    1.73   *    or to the unicode characters for paragraph separation (PS)
    1.74   *    or line separation (LS).
    1.75   *  - unicode case folding (for case insensitive string comparisons)
    1.76 - *  - rejection of illegal UTF-8 data (i.e. UTF-8 encoded UTF-16 surrogates)
    1.77 + *  - rejection of illegal UTF-8 data
    1.78 + *    (i.e. UTF-8 encoded UTF-16 surrogates)
    1.79   *  - support for korean hangul characters
    1.80   *  Unicode Version 5.0.0 is supported.
    1.81   */
    1.82 @@ -106,7 +88,8 @@
    1.83   *  COMPOSE:   Return a result with composed characters.
    1.84   *  DECOMPOSE: Return a result with decomposed characters.
    1.85   *  IGNORE:    Strip "default ignorable characters"
    1.86 - *  REJECTNA:  Return an error, if the input contains unassigned code points.
    1.87 + *  REJECTNA:  Return an error, if the input contains unassigned
    1.88 + *             code points.
    1.89   *  NLF2LS:    Indicating that NLF-sequences (LF, CRLF, CR, NEL) are
    1.90   *             representing a line break, and should be converted to the
    1.91   *             unicode character for line separation (LS).
    1.92 @@ -115,20 +98,20 @@
    1.93   *             paragraph separation (PS).
    1.94   *  NLF2LF:    Indicating that the meaning of NLF-sequences is unknown.
    1.95   *  STRIPCC:   Strips and/or converts control characters.
    1.96 - *             NLF-sequences are transformed into space, except if one of the
    1.97 - *             NLF2LS/PS/LF options is given.
    1.98 + *             NLF-sequences are transformed into space, except if one of
    1.99 + *             the NLF2LS/PS/LF options is given.
   1.100   *             HorizontalTab (HT) and FormFeed (FF) are treated as a
   1.101   *             NLF-sequence in this case.
   1.102   *             All other control characters are simply removed.
   1.103   *  CASEFOLD:  Performs unicode case folding, to be able to do a
   1.104   *             case-insensitive string comparison.
   1.105 - *  CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which is
   1.106 - *             representing a single grapheme cluster (a single character).
   1.107 + *  CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which
   1.108 + *             is representing a single grapheme cluster (see UAX#29).
   1.109   *  LUMP:      Lumps certain characters together
   1.110   *             (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-").
   1.111   *             (See lump.txt for details.)
   1.112 - *             If NLF2LF is set, this includes a transformation of paragraph
   1.113 - *             and line separators to ASCII line-feed (LF).
   1.114 + *             If NLF2LF is set, this includes a transformation of
   1.115 + *             paragraph and line separators to ASCII line-feed (LF).
   1.116   *  STRIPMARK: Strips all character markings
   1.117   *             (non-spacing, spacing and enclosing) (i.e. accents)
   1.118   *             NOTE: this option works only with COMPOSE or DECOMPOSE
   1.119 @@ -156,16 +139,16 @@
   1.120    utf8proc_propval_t bidi_class;
   1.121    utf8proc_propval_t decomp_type;
   1.122    const int32_t *decomp_mapping;
   1.123 -  const unsigned bidi_mirrored:1;
   1.124 -  const int32_t uppercase_mapping;
   1.125 -  const int32_t lowercase_mapping;
   1.126 -  const int32_t titlecase_mapping;
   1.127 -  const int32_t comb1st_index;
   1.128 -  const int32_t comb2nd_index;
   1.129 -  const unsigned comp_exclusion:1;
   1.130 -  const unsigned ignorable:1;
   1.131 -  const unsigned control_boundary:1;
   1.132 -  const unsigned extend:1;
   1.133 +  unsigned bidi_mirrored:1;
   1.134 +  int32_t uppercase_mapping;
   1.135 +  int32_t lowercase_mapping;
   1.136 +  int32_t titlecase_mapping;
   1.137 +  int32_t comb1st_index;
   1.138 +  int32_t comb2nd_index;
   1.139 +  unsigned comp_exclusion:1;
   1.140 +  unsigned ignorable:1;
   1.141 +  unsigned control_boundary:1;
   1.142 +  unsigned extend:1;
   1.143    const int32_t *casefold_mapping;
   1.144  } utf8proc_property_t;
   1.145  
   1.146 @@ -242,7 +225,7 @@
   1.147   *  Returns a static error string for the given error code.
   1.148   */
   1.149  
   1.150 -ssize_t utf8proc_iterate(uint8_t *str, ssize_t strlen, int32_t *dst);
   1.151 +ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst);
   1.152  /*
   1.153   *  Reads a single char from the UTF-8 sequence being pointed to by 'str'.
   1.154   *  The maximum number of bytes read is 'strlen', unless 'strlen' is
   1.155 @@ -253,12 +236,18 @@
   1.156   *  negative error code is returned.
   1.157   */
   1.158  
   1.159 +bool utf8proc_codepoint_valid(int32_t uc);
   1.160 +/*
   1.161 + *  Returns 1, if the given unicode code-point is valid, otherwise 0.
   1.162 + */
   1.163 +
   1.164  ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
   1.165  /*
   1.166   *  Encodes the unicode char with the code point 'uc' as an UTF-8 string in
   1.167   *  the byte array being pointed to by 'dst'. This array has to be at least
   1.168   *  4 bytes long.
   1.169 - *  In case of success the number of bytes written is returned, otherwise 0.
   1.170 + *  In case of success the number of bytes written is returned,
   1.171 + *  otherwise 0.
   1.172   *  This function does not check if 'uc' is a valid unicode code point.
   1.173   */
   1.174  
   1.175 @@ -272,8 +261,10 @@
   1.176   *           0x10FFFF, otherwise the program might crash!
   1.177   */
   1.178  
   1.179 -ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
   1.180 -  int options, int *last_boundclass);
   1.181 +ssize_t utf8proc_decompose_char(
   1.182 +  int32_t uc, int32_t *dst, ssize_t bufsize,
   1.183 +  int options, int *last_boundclass
   1.184 +);
   1.185  /*
   1.186   *  Writes a decomposition of the unicode char 'uc' into the array being
   1.187   *  pointed to by 'dst'.
   1.188 @@ -286,9 +277,9 @@
   1.189   *  CHARBOUND: Inserts 0xFF bytes before each grapheme cluster
   1.190   *  LUMP:      lumps certain different characters together
   1.191   *  STRIPMARK: removes all character marks
   1.192 - *  The pointer 'last_boundclass' has to point to an integer variable which is
   1.193 - *  storing the last character boundary class, if the CHARBOUND option is
   1.194 - *  used.
   1.195 + *  The pointer 'last_boundclass' has to point to an integer variable which
   1.196 + *  is storing the last character boundary class, if the CHARBOUND option
   1.197 + *  is used.
   1.198   *  In case of success the number of chars written is returned,
   1.199   *  in case of an error, a negative error code is returned.
   1.200   *  If the number of written chars would be bigger than 'bufsize',
   1.201 @@ -298,8 +289,10 @@
   1.202   *           0x10FFFF, otherwise the program might crash!
   1.203   */
   1.204  
   1.205 -ssize_t utf8proc_decompose(uint8_t *str, ssize_t strlen,
   1.206 -  int32_t *buffer, ssize_t bufsize, int options);
   1.207 +ssize_t utf8proc_decompose(
   1.208 +  const uint8_t *str, ssize_t strlen,
   1.209 +  int32_t *buffer, ssize_t bufsize, int options
   1.210 +);
   1.211  /*
   1.212   *  Does the same as 'utf8proc_decompose_char', but acts on a whole UTF-8
   1.213   *  string, and orders the decomposed sequences correctly.
   1.214 @@ -324,39 +317,42 @@
   1.215   *  NLF2PS:  converts LF, CRLF, CR and NEL into PS
   1.216   *  NLF2LF:  converts LF, CRLF, CR and NEL into LF
   1.217   *  STRIPCC: strips or converts all non-affected control characters
   1.218 - *  COMPOSE: tries to combine decomposed characters into composite characters
   1.219 + *  COMPOSE: tries to combine decomposed characters into composite
   1.220 + *           characters
   1.221   *  STABLE:  prohibits combining characters which would violate
   1.222   *           the unicode versioning stability
   1.223 - *  In case of success the length of the resulting UTF-8 string is returned,
   1.224 - *  otherwise a negative error code is returned.
   1.225 + *  In case of success the length of the resulting UTF-8 string is
   1.226 + *  returned, otherwise a negative error code is returned.
   1.227   *  WARNING: The amount of free space being pointed to by 'buffer', has to
   1.228   *           exceed the amount of the input data by one byte, and the
   1.229   *           entries of the array pointed to by 'str' have to be in the
   1.230 - *           range of 0x0000 to 0x10FFFF, otherwise the program might crash!
   1.231 + *           range of 0x0000 to 0x10FFFF, otherwise the program might
   1.232 + *           crash!
   1.233   */
   1.234  
   1.235 -ssize_t utf8proc_map(uint8_t *str, ssize_t strlen, uint8_t **dstptr,
   1.236 -  int options);
   1.237 +ssize_t utf8proc_map(
   1.238 +  const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
   1.239 +);
   1.240  /*
   1.241   *  Maps the given UTF-8 string being pointed to by 'str' to a new UTF-8
   1.242 - *  string, which is allocated dynamically, and afterwards pointed to by the
   1.243 - *  pointer being pointed to by 'dstptr'.
   1.244 + *  string, which is allocated dynamically, and afterwards pointed to by
   1.245 + *  the pointer being pointed to by 'dstptr'.
   1.246   *  If the NULLTERM flag in the 'options' field is set, the length is
   1.247   *  determined by a NULL terminator, otherwise the parameter 'strlen' is
   1.248   *  evaluated to determine the string length, but in any case the result
   1.249 - *  will be NULL terminated (though it might contain NULL characters before).
   1.250 - *  Other flags in the 'options' field are passed to the functions defined
   1.251 - *  above, and regarded as described.
   1.252 + *  will be NULL terminated (though it might contain NULL characters
   1.253 + *  before). Other flags in the 'options' field are passed to the functions
   1.254 + *  defined above, and regarded as described.
   1.255   *  In case of success the length of the new string is returned,
   1.256   *  otherwise a negative error code is returned.
   1.257   *  NOTICE: The memory of the new UTF-8 string will have been allocated with
   1.258   *          'malloc', and therefore has to be freed with 'free'.
   1.259   */
   1.260  
   1.261 -uint8_t *utf8proc_NFD(uint8_t *str);
   1.262 -uint8_t *utf8proc_NFC(uint8_t *str);
   1.263 -uint8_t *utf8proc_NFKD(uint8_t *str);
   1.264 -uint8_t *utf8proc_NFKC(uint8_t *str);
   1.265 +uint8_t *utf8proc_NFD(const uint8_t *str);
   1.266 +uint8_t *utf8proc_NFC(const uint8_t *str);
   1.267 +uint8_t *utf8proc_NFKD(const uint8_t *str);
   1.268 +uint8_t *utf8proc_NFKC(const uint8_t *str);
   1.269  /*
   1.270   *  Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
   1.271   *  normalized version of the null-terminated string 'str'.
   1.272 @@ -365,4 +361,3 @@
   1.273  
   1.274  #endif
   1.275  
   1.276 -
author	jbe
date	Sun Jul 22 12:00:00 2007 +0200 (2007-07-22)
parents	4ee0d5f54af1
children	951e73a98021