utf8proc

diff pgsql/utf8proc_pgsql.c @ 7:fcfd8c836c64

Version 1.1.1

- Added a new PostgreSQL function 'unistrip', which behaves like 'unifold', but also removes all character marks (e.g. accents).
- Changed license from BSD to MIT style.
- Added a new function 'utf8proc_codepoint_valid' to the C library.
- Changed compiler flags in the Makefile from -g -O0 to -O2.
- The Ruby script that was used to build the utf8proc_data.c file is now included in the distribution.
author jbe
date Sun Jul 22 12:00:00 2007 +0200 (2007-07-22)
parents c18366878af9
children 951e73a98021
line diff
     1.1 --- a/pgsql/utf8proc_pgsql.c	Fri Mar 16 12:00:00 2007 +0100
     1.2 +++ b/pgsql/utf8proc_pgsql.c	Sun Jul 22 12:00:00 2007 +0200
     1.3 @@ -1,44 +1,35 @@
     1.4  /*
     1.5 - *  Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany
     1.6 - *  Author: Jan Behrens <jan.behrens@flexiguided.de>
     1.7 - *  All rights reserved.
     1.8 + *  Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin
     1.9   *
    1.10 - *  Redistribution and use in source and binary forms, with or without
    1.11 - *  modification, are permitted provided that the following conditions are
    1.12 - *  met:
    1.13 + *  Permission is hereby granted, free of charge, to any person obtaining a
    1.14 + *  copy of this software and associated documentation files (the "Software"),
    1.15 + *  to deal in the Software without restriction, including without limitation
    1.16 + *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
    1.17 + *  and/or sell copies of the Software, and to permit persons to whom the
    1.18 + *  Software is furnished to do so, subject to the following conditions:
    1.19   *
    1.20 - *  1. Redistributions of source code must retain the above copyright
    1.21 - *     notice, this list of conditions and the following disclaimer.
    1.22 - *  2. Redistributions in binary form must reproduce the above copyright
    1.23 - *     notice, this list of conditions and the following disclaimer in the
    1.24 - *     documentation and/or other materials provided with the distribution.
    1.25 - *  3. Neither the name of the FlexiGuided GmbH nor the names of its
    1.26 - *     contributors may be used to endorse or promote products derived from
    1.27 - *     this software without specific prior written permission.
    1.28 + *  The above copyright notice and this permission notice shall be included in
    1.29 + *  all copies or substantial portions of the Software.
    1.30   *
    1.31 - *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    1.32 - *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    1.33 - *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
    1.34 - *  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
    1.35 - *  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
    1.36 - *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    1.37 - *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    1.38 - *  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
    1.39 - *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
    1.40 - *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    1.41 - *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    1.42 - *
    1.43 + *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    1.44 + *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    1.45 + *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    1.46 + *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    1.47 + *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    1.48 + *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    1.49 + *  DEALINGS IN THE SOFTWARE.
    1.50   */
    1.51 - 
    1.52 +
    1.53  
    1.54  /*
    1.55   *  File name:    pgsql/utf8proc_pgsql.c
    1.56 - *  Version:      1.0
    1.57 - *  Last changed: 2006-09-17
    1.58 + *  Version:      1.1.1
    1.59 + *  Last changed: 2007-07-22
    1.60   *
    1.61   *  Description:
    1.62 - *  PostgreSQL extension to provide a function 'unifold', which can be used
    1.63 - *  to case-fold and normalize index fields.
    1.64 + *  PostgreSQL extension to provide two functions 'unifold' and 'unistrip',
    1.65 + *  which can be used to case-fold and normalize index fields and
    1.66 + *  optionally strip marks (e.g. accents) from strings.
    1.67   */
    1.68  
    1.69  
    1.70 @@ -55,40 +46,42 @@
    1.71  PG_MODULE_MAGIC;
    1.72  #endif
    1.73  
    1.74 -#define UTF8PROC_PGSQL_OPTS ( UTF8PROC_REJECTNA | UTF8PROC_COMPAT | \
    1.75 +#define UTF8PROC_PGSQL_FOLD_OPTS ( UTF8PROC_REJECTNA | UTF8PROC_COMPAT | \
    1.76    UTF8PROC_COMPOSE | UTF8PROC_STABLE | UTF8PROC_IGNORE | UTF8PROC_STRIPCC | \
    1.77    UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD | UTF8PROC_LUMP )
    1.78 +#define UTF8PROC_PGSQL_STRIP_OPTS ( UTF8PROC_REJECTNA | UTF8PROC_COMPAT | \
    1.79 +  UTF8PROC_COMPOSE | UTF8PROC_STABLE | UTF8PROC_IGNORE | UTF8PROC_STRIPCC | \
    1.80 +  UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD | UTF8PROC_LUMP | UTF8PROC_STRIPMARK )
    1.81  
    1.82 -PG_FUNCTION_INFO_V1(utf8proc_pgsql_unifold);
    1.83 -Datum utf8proc_pgsql_unifold(PG_FUNCTION_ARGS) {
    1.84 -  text *input_string;
    1.85 -  text *output_string = NULL;
    1.86 +ssize_t utf8proc_pgsql_utf8map(
    1.87 +  text *input_string, text **output_string_ptr, int options
    1.88 +) {
    1.89    ssize_t result;
    1.90 -  input_string = PG_GETARG_TEXT_P(0);
    1.91 -  do {
    1.92 -    result = utf8proc_decompose(
    1.93 -      VARDATA(input_string), VARSIZE(input_string) - VARHDRSZ,
    1.94 -      NULL, 0, UTF8PROC_PGSQL_OPTS
    1.95 -    );
    1.96 -    if (result < 0) break;
    1.97 -    if (result > (SIZE_MAX-1-VARHDRSZ)/sizeof(int32_t)) {
    1.98 -      result = UTF8PROC_ERROR_OVERFLOW;
    1.99 -      break;
   1.100 -    }
   1.101 -    output_string = palloc(result * sizeof(int32_t) + 1 + VARHDRSZ);
   1.102 -    // reserve one extra byte for termination
   1.103 -    if (!output_string) {
   1.104 -      result = UTF8PROC_ERROR_NOMEM;
   1.105 -      break;
   1.106 -    }
   1.107 -    result = utf8proc_decompose(
   1.108 -      VARDATA(input_string), VARSIZE(input_string) - VARHDRSZ,
   1.109 -      (int32_t *)VARDATA(output_string), result, UTF8PROC_PGSQL_OPTS);
   1.110 -    if (result < 0) break;
   1.111 -    result = utf8proc_reencode((int32_t *)VARDATA(output_string), result,
   1.112 -      UTF8PROC_PGSQL_OPTS);
   1.113 -  } while (0);
   1.114 -  PG_FREE_IF_COPY(input_string, 0);
   1.115 +  text *output_string;
   1.116 +  result = utf8proc_decompose(
   1.117 +    VARDATA(input_string), VARSIZE(input_string) - VARHDRSZ,
   1.118 +    NULL, 0, options
   1.119 +  );
   1.120 +  if (result < 0) return result;
   1.121 +  if (result > (SIZE_MAX-1-VARHDRSZ)/sizeof(int32_t))
   1.122 +    return UTF8PROC_ERROR_OVERFLOW;
   1.123 +  // reserve one extra byte for termination
   1.124 +  *output_string_ptr = palloc(result * sizeof(int32_t) + 1 + VARHDRSZ);
   1.125 +  output_string = *output_string_ptr;
   1.126 +  if (!output_string) return UTF8PROC_ERROR_NOMEM;
   1.127 +  result = utf8proc_decompose(
   1.128 +    VARDATA(input_string), VARSIZE(input_string) - VARHDRSZ,
   1.129 +    (int32_t *)VARDATA(output_string), result, options
   1.130 +  );
   1.131 +  if (result < 0) return result;
   1.132 +  result = utf8proc_reencode(
   1.133 +    (int32_t *)VARDATA(output_string), result, options
   1.134 +  );
   1.135 +  if (result >= 0) VARATT_SIZEP(output_string) = result + VARHDRSZ;
   1.136 +  return result;
   1.137 +}
   1.138 +
   1.139 +void utf8proc_pgsql_utf8map_errchk(ssize_t result, text *output_string) {
   1.140    if (result < 0) {
   1.141      int sqlerrcode;
   1.142      if (output_string) pfree(output_string);
   1.143 @@ -99,7 +92,7 @@
   1.144        sqlerrcode = ERRCODE_PROGRAM_LIMIT_EXCEEDED; break;
   1.145        case UTF8PROC_ERROR_INVALIDUTF8:
   1.146        case UTF8PROC_ERROR_NOTASSIGNED:
   1.147 -      PG_RETURN_NULL();
   1.148 +      return;
   1.149        default:
   1.150        sqlerrcode = ERRCODE_INTERNAL_ERROR;
   1.151      }
   1.152 @@ -107,11 +100,42 @@
   1.153        errcode(sqlerrcode),
   1.154        errmsg("%s", utf8proc_errmsg(result))
   1.155      ));
   1.156 -  } else {
   1.157 -    VARATT_SIZEP(output_string) = result + VARHDRSZ;
   1.158 -    PG_RETURN_TEXT_P(output_string);
   1.159    }
   1.160 -  PG_RETURN_NULL();  // prohibit compiler warning
   1.161  }
   1.162  
   1.163 +PG_FUNCTION_INFO_V1(utf8proc_pgsql_unifold);
   1.164 +Datum utf8proc_pgsql_unifold(PG_FUNCTION_ARGS) {
   1.165 +  text *input_string;
   1.166 +  text *output_string = NULL;
   1.167 +  ssize_t result;
   1.168 +  input_string = PG_GETARG_TEXT_P(0);
   1.169 +  result = utf8proc_pgsql_utf8map(
   1.170 +    input_string, &output_string, UTF8PROC_PGSQL_FOLD_OPTS
   1.171 +  );
   1.172 +  PG_FREE_IF_COPY(input_string, 0);
   1.173 +  utf8proc_pgsql_utf8map_errchk(result, output_string);
   1.174 +  if (result >= 0) {
   1.175 +    PG_RETURN_TEXT_P(output_string);
   1.176 +  } else {
   1.177 +    PG_RETURN_NULL();
   1.178 +  }
   1.179 +}
   1.180  
   1.181 +PG_FUNCTION_INFO_V1(utf8proc_pgsql_unistrip);
   1.182 +Datum utf8proc_pgsql_unistrip(PG_FUNCTION_ARGS) {
   1.183 +  text *input_string;
   1.184 +  text *output_string = NULL;
   1.185 +  ssize_t result;
   1.186 +  input_string = PG_GETARG_TEXT_P(0);
   1.187 +  result = utf8proc_pgsql_utf8map(
   1.188 +    input_string, &output_string, UTF8PROC_PGSQL_STRIP_OPTS
   1.189 +  );
   1.190 +  PG_FREE_IF_COPY(input_string, 0);
   1.191 +  utf8proc_pgsql_utf8map_errchk(result, output_string);
   1.192 +  if (result >= 0) {
   1.193 +    PG_RETURN_TEXT_P(output_string);
   1.194 +  } else {
   1.195 +    PG_RETURN_NULL();
   1.196 +  }
   1.197 +}
   1.198 +

Impressum / About Us