utf8proc

diff utf8proc.c @ 7:fcfd8c836c64

Version 1.1.1

- Added a new PostgreSQL function 'unistrip', which behaves like 'unifold', but also removes all character marks (e.g. accents).
- Changed license from BSD to MIT style.
- Added a new function 'utf8proc_codepoint_valid' to the C library.
- Changed compiler flags in the Makefile from -g -O0 to -O2.
- The Ruby script used to build the utf8proc_data.c file is now included in the distribution.
author jbe
date Sun Jul 22 12:00:00 2007 +0200 (2007-07-22)
parents 4ee0d5f54af1
children 951e73a98021
line diff
     1.1 --- a/utf8proc.c	Fri Mar 16 12:00:00 2007 +0100
     1.2 +++ b/utf8proc.c	Sun Jul 22 12:00:00 2007 +0200
     1.3 @@ -1,34 +1,26 @@
     1.4  /*
     1.5 - *  Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany
     1.6 - *  Author: Jan Behrens <jan.behrens@flexiguided.de>
     1.7 - *  All rights reserved.
     1.8 + *  Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin
     1.9   *
    1.10 - *  Redistribution and use in source and binary forms, with or without
    1.11 - *  modification, are permitted provided that the following conditions are
    1.12 - *  met:
    1.13 + *  Permission is hereby granted, free of charge, to any person obtaining a
    1.14 + *  copy of this software and associated documentation files (the "Software"),
    1.15 + *  to deal in the Software without restriction, including without limitation
    1.16 + *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
    1.17 + *  and/or sell copies of the Software, and to permit persons to whom the
    1.18 + *  Software is furnished to do so, subject to the following conditions:
    1.19   *
    1.20 - *  1. Redistributions of source code must retain the above copyright
    1.21 - *     notice, this list of conditions and the following disclaimer.
    1.22 - *  2. Redistributions in binary form must reproduce the above copyright
    1.23 - *     notice, this list of conditions and the following disclaimer in the
    1.24 - *     documentation and/or other materials provided with the distribution.
    1.25 - *  3. Neither the name of the FlexiGuided GmbH nor the names of its
    1.26 - *     contributors may be used to endorse or promote products derived from
    1.27 - *     this software without specific prior written permission.
    1.28 + *  The above copyright notice and this permission notice shall be included in
    1.29 + *  all copies or substantial portions of the Software.
    1.30   *
    1.31 - *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    1.32 - *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    1.33 - *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
    1.34 - *  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
    1.35 - *  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
    1.36 - *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    1.37 - *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    1.38 - *  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
    1.39 - *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
    1.40 - *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    1.41 - *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    1.42 - *
    1.43 - *
    1.44 + *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    1.45 + *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    1.46 + *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    1.47 + *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    1.48 + *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    1.49 + *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    1.50 + *  DEALINGS IN THE SOFTWARE.
    1.51 + */
    1.52 +
    1.53 +/*
    1.54   *  This library contains derived data from a modified version of the
    1.55   *  Unicode data files.
    1.56   *
    1.57 @@ -36,14 +28,13 @@
    1.58   *  http://www.unicode.org/Public/UNIDATA/
    1.59   *
    1.60   *  Please notice the copyright statement in the file "utf8proc_data.c".
    1.61 - *
    1.62   */
    1.63  
    1.64  
    1.65  /*
    1.66   *  File name:    utf8proc.c
    1.67 - *  Version:      1.0
    1.68 - *  Last changed: 2006-09-17
    1.69 + *  Version:      1.1.1
    1.70 + *  Last changed: 2007-07-22
    1.71   *
    1.72   *  Description:
    1.73   *  Implementation of libutf8proc.
    1.74 @@ -123,7 +114,9 @@
    1.75    }
    1.76  }
    1.77  
    1.78 -ssize_t utf8proc_iterate(uint8_t *str, ssize_t strlen, int32_t *dst) {
    1.79 +ssize_t utf8proc_iterate(
    1.80 +  const uint8_t *str, ssize_t strlen, int32_t *dst
    1.81 +) {
    1.82    int length;
    1.83    int i;
    1.84    int32_t uc = -1;
    1.85 @@ -155,11 +148,19 @@
    1.86      if (uc < 0x10000 || uc >= 0x110000) uc = -1;
    1.87      break;
    1.88    }
    1.89 -  if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) return UTF8PROC_ERROR_INVALIDUTF8;
    1.90 +  if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
    1.91 +    return UTF8PROC_ERROR_INVALIDUTF8;
    1.92    *dst = uc;
    1.93    return length;
    1.94  }
    1.95  
    1.96 +bool utf8proc_codepoint_valid(int32_t uc) {
    1.97 +  if (uc < 0 || uc >= 0x110000 ||
    1.98 +    ((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) ||
    1.99 +    (uc >= 0xFDD0 && uc < 0xFDF0)) return false;
   1.100 +  else return true;
   1.101 +}
   1.102 +
   1.103  ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
   1.104    if (uc < 0x00) {
   1.105      return 0;
   1.106 @@ -348,8 +349,10 @@
   1.107    return 1;
   1.108  }
   1.109  
   1.110 -ssize_t utf8proc_decompose(uint8_t *str, ssize_t strlen,
   1.111 -    int32_t *buffer, ssize_t bufsize, int options) {
   1.112 +ssize_t utf8proc_decompose(
   1.113 +  const uint8_t *str, ssize_t strlen,
   1.114 +  int32_t *buffer, ssize_t bufsize, int options
   1.115 +) {
   1.116    // strlen will be ignored, if UTF8PROC_NULLTERM is set in options
   1.117    ssize_t wpos = 0;
   1.118    if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
   1.119 @@ -523,8 +526,9 @@
   1.120    }
   1.121  }
   1.122  
   1.123 -ssize_t utf8proc_map(uint8_t *str, ssize_t strlen, uint8_t **dstptr,
   1.124 -    int options) {
   1.125 +ssize_t utf8proc_map(
   1.126 +  const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
   1.127 +) {
   1.128    int32_t *buffer;
   1.129    ssize_t result;
   1.130    *dstptr = NULL;
   1.131 @@ -551,32 +555,31 @@
   1.132    return result;
   1.133  }
   1.134  
   1.135 -uint8_t *utf8proc_NFD(uint8_t *str) {
   1.136 +uint8_t *utf8proc_NFD(const uint8_t *str) {
   1.137    uint8_t *retval;
   1.138    utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
   1.139      UTF8PROC_DECOMPOSE);
   1.140    return retval;
   1.141  }
   1.142  
   1.143 -uint8_t *utf8proc_NFC(uint8_t *str) {
   1.144 +uint8_t *utf8proc_NFC(const uint8_t *str) {
   1.145    uint8_t *retval;
   1.146    utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
   1.147      UTF8PROC_COMPOSE);
   1.148    return retval;
   1.149  }
   1.150  
   1.151 -uint8_t *utf8proc_NFKD(uint8_t *str) {
   1.152 +uint8_t *utf8proc_NFKD(const uint8_t *str) {
   1.153    uint8_t *retval;
   1.154    utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
   1.155      UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
   1.156    return retval;
   1.157  }
   1.158  
   1.159 -uint8_t *utf8proc_NFKC(uint8_t *str) {
   1.160 +uint8_t *utf8proc_NFKC(const uint8_t *str) {
   1.161    uint8_t *retval;
   1.162    utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
   1.163      UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
   1.164    return retval;
   1.165  }
   1.166  
   1.167 -

Impressum / About Us