utf8proc
diff pgsql/utf8proc_pgsql.c @ 7:fcfd8c836c64
Version 1.1.1
- Added a new PostgreSQL function 'unistrip', which behaves like 'unifold' but also removes all character marks (e.g. accents).
- Changed the license from BSD style to MIT style.
- Added a new function 'utf8proc_codepoint_valid' to the C library.
- Changed the compiler flags in the Makefile from -g -O0 to -O2.
- The Ruby script that was used to build the utf8proc_data.c file is now included in the distribution.
- Added a new PostgreSQL function 'unistrip', which behaves like 'unifold' but also removes all character marks (e.g. accents).
- Changed the license from BSD style to MIT style.
- Added a new function 'utf8proc_codepoint_valid' to the C library.
- Changed the compiler flags in the Makefile from -g -O0 to -O2.
- The Ruby script that was used to build the utf8proc_data.c file is now included in the distribution.
author | jbe |
---|---|
date | Sun Jul 22 12:00:00 2007 +0200 (2007-07-22) |
parents | c18366878af9 |
children | 951e73a98021 |
line diff
1.1 --- a/pgsql/utf8proc_pgsql.c Fri Mar 16 12:00:00 2007 +0100 1.2 +++ b/pgsql/utf8proc_pgsql.c Sun Jul 22 12:00:00 2007 +0200 1.3 @@ -1,44 +1,35 @@ 1.4 /* 1.5 - * Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany 1.6 - * Author: Jan Behrens <jan.behrens@flexiguided.de> 1.7 - * All rights reserved. 1.8 + * Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin 1.9 * 1.10 - * Redistribution and use in source and binary forms, with or without 1.11 - * modification, are permitted provided that the following conditions are 1.12 - * met: 1.13 + * Permission is hereby granted, free of charge, to any person obtaining a 1.14 + * copy of this software and associated documentation files (the "Software"), 1.15 + * to deal in the Software without restriction, including without limitation 1.16 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 1.17 + * and/or sell copies of the Software, and to permit persons to whom the 1.18 + * Software is furnished to do so, subject to the following conditions: 1.19 * 1.20 - * 1. Redistributions of source code must retain the above copyright 1.21 - * notice, this list of conditions and the following disclaimer. 1.22 - * 2. Redistributions in binary form must reproduce the above copyright 1.23 - * notice, this list of conditions and the following disclaimer in the 1.24 - * documentation and/or other materials provided with the distribution. 1.25 - * 3. Neither the name of the FlexiGuided GmbH nor the names of its 1.26 - * contributors may be used to endorse or promote products derived from 1.27 - * this software without specific prior written permission. 1.28 + * The above copyright notice and this permission notice shall be included in 1.29 + * all copies or substantial portions of the Software. 
1.30 * 1.31 - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 1.32 - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 1.33 - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 1.34 - * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 1.35 - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 1.36 - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 1.37 - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 1.38 - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 1.39 - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 1.40 - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 1.41 - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1.42 - * 1.43 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1.44 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1.45 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 1.46 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1.47 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 1.48 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 1.49 + * DEALINGS IN THE SOFTWARE. 1.50 */ 1.51 - 1.52 + 1.53 1.54 /* 1.55 * File name: pgsql/utf8proc_pgsql.c 1.56 - * Version: 1.0 1.57 - * Last changed: 2006-09-17 1.58 + * Version: 1.1.1 1.59 + * Last changed: 2007-07-22 1.60 * 1.61 * Description: 1.62 - * PostgreSQL extension to provide a function 'unifold', which can be used 1.63 - * to case-fold and normalize index fields. 1.64 + * PostgreSQL extension to provide two functions 'unifold' and 'unistrip', 1.65 + * which can be used to case-fold and normalize index fields and 1.66 + * optionally strip marks (e.g. accents) from strings. 
1.67 */ 1.68 1.69 1.70 @@ -55,40 +46,42 @@ 1.71 PG_MODULE_MAGIC; 1.72 #endif 1.73 1.74 -#define UTF8PROC_PGSQL_OPTS ( UTF8PROC_REJECTNA | UTF8PROC_COMPAT | \ 1.75 +#define UTF8PROC_PGSQL_FOLD_OPTS ( UTF8PROC_REJECTNA | UTF8PROC_COMPAT | \ 1.76 UTF8PROC_COMPOSE | UTF8PROC_STABLE | UTF8PROC_IGNORE | UTF8PROC_STRIPCC | \ 1.77 UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD | UTF8PROC_LUMP ) 1.78 +#define UTF8PROC_PGSQL_STRIP_OPTS ( UTF8PROC_REJECTNA | UTF8PROC_COMPAT | \ 1.79 + UTF8PROC_COMPOSE | UTF8PROC_STABLE | UTF8PROC_IGNORE | UTF8PROC_STRIPCC | \ 1.80 + UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD | UTF8PROC_LUMP | UTF8PROC_STRIPMARK ) 1.81 1.82 -PG_FUNCTION_INFO_V1(utf8proc_pgsql_unifold); 1.83 -Datum utf8proc_pgsql_unifold(PG_FUNCTION_ARGS) { 1.84 - text *input_string; 1.85 - text *output_string = NULL; 1.86 +ssize_t utf8proc_pgsql_utf8map( 1.87 + text *input_string, text **output_string_ptr, int options 1.88 +) { 1.89 ssize_t result; 1.90 - input_string = PG_GETARG_TEXT_P(0); 1.91 - do { 1.92 - result = utf8proc_decompose( 1.93 - VARDATA(input_string), VARSIZE(input_string) - VARHDRSZ, 1.94 - NULL, 0, UTF8PROC_PGSQL_OPTS 1.95 - ); 1.96 - if (result < 0) break; 1.97 - if (result > (SIZE_MAX-1-VARHDRSZ)/sizeof(int32_t)) { 1.98 - result = UTF8PROC_ERROR_OVERFLOW; 1.99 - break; 1.100 - } 1.101 - output_string = palloc(result * sizeof(int32_t) + 1 + VARHDRSZ); 1.102 - // reserve one extra byte for termination 1.103 - if (!output_string) { 1.104 - result = UTF8PROC_ERROR_NOMEM; 1.105 - break; 1.106 - } 1.107 - result = utf8proc_decompose( 1.108 - VARDATA(input_string), VARSIZE(input_string) - VARHDRSZ, 1.109 - (int32_t *)VARDATA(output_string), result, UTF8PROC_PGSQL_OPTS); 1.110 - if (result < 0) break; 1.111 - result = utf8proc_reencode((int32_t *)VARDATA(output_string), result, 1.112 - UTF8PROC_PGSQL_OPTS); 1.113 - } while (0); 1.114 - PG_FREE_IF_COPY(input_string, 0); 1.115 + text *output_string; 1.116 + result = utf8proc_decompose( 1.117 + VARDATA(input_string), 
VARSIZE(input_string) - VARHDRSZ, 1.118 + NULL, 0, options 1.119 + ); 1.120 + if (result < 0) return result; 1.121 + if (result > (SIZE_MAX-1-VARHDRSZ)/sizeof(int32_t)) 1.122 + return UTF8PROC_ERROR_OVERFLOW; 1.123 + // reserve one extra byte for termination 1.124 + *output_string_ptr = palloc(result * sizeof(int32_t) + 1 + VARHDRSZ); 1.125 + output_string = *output_string_ptr; 1.126 + if (!output_string) return UTF8PROC_ERROR_NOMEM; 1.127 + result = utf8proc_decompose( 1.128 + VARDATA(input_string), VARSIZE(input_string) - VARHDRSZ, 1.129 + (int32_t *)VARDATA(output_string), result, options 1.130 + ); 1.131 + if (result < 0) return result; 1.132 + result = utf8proc_reencode( 1.133 + (int32_t *)VARDATA(output_string), result, options 1.134 + ); 1.135 + if (result >= 0) VARATT_SIZEP(output_string) = result + VARHDRSZ; 1.136 + return result; 1.137 +} 1.138 + 1.139 +void utf8proc_pgsql_utf8map_errchk(ssize_t result, text *output_string) { 1.140 if (result < 0) { 1.141 int sqlerrcode; 1.142 if (output_string) pfree(output_string); 1.143 @@ -99,7 +92,7 @@ 1.144 sqlerrcode = ERRCODE_PROGRAM_LIMIT_EXCEEDED; break; 1.145 case UTF8PROC_ERROR_INVALIDUTF8: 1.146 case UTF8PROC_ERROR_NOTASSIGNED: 1.147 - PG_RETURN_NULL(); 1.148 + return; 1.149 default: 1.150 sqlerrcode = ERRCODE_INTERNAL_ERROR; 1.151 } 1.152 @@ -107,11 +100,42 @@ 1.153 errcode(sqlerrcode), 1.154 errmsg("%s", utf8proc_errmsg(result)) 1.155 )); 1.156 - } else { 1.157 - VARATT_SIZEP(output_string) = result + VARHDRSZ; 1.158 - PG_RETURN_TEXT_P(output_string); 1.159 } 1.160 - PG_RETURN_NULL(); // prohibit compiler warning 1.161 } 1.162 1.163 +PG_FUNCTION_INFO_V1(utf8proc_pgsql_unifold); 1.164 +Datum utf8proc_pgsql_unifold(PG_FUNCTION_ARGS) { 1.165 + text *input_string; 1.166 + text *output_string = NULL; 1.167 + ssize_t result; 1.168 + input_string = PG_GETARG_TEXT_P(0); 1.169 + result = utf8proc_pgsql_utf8map( 1.170 + input_string, &output_string, UTF8PROC_PGSQL_FOLD_OPTS 1.171 + ); 1.172 + 
PG_FREE_IF_COPY(input_string, 0); 1.173 + utf8proc_pgsql_utf8map_errchk(result, output_string); 1.174 + if (result >= 0) { 1.175 + PG_RETURN_TEXT_P(output_string); 1.176 + } else { 1.177 + PG_RETURN_NULL(); 1.178 + } 1.179 +} 1.180 1.181 +PG_FUNCTION_INFO_V1(utf8proc_pgsql_unistrip); 1.182 +Datum utf8proc_pgsql_unistrip(PG_FUNCTION_ARGS) { 1.183 + text *input_string; 1.184 + text *output_string = NULL; 1.185 + ssize_t result; 1.186 + input_string = PG_GETARG_TEXT_P(0); 1.187 + result = utf8proc_pgsql_utf8map( 1.188 + input_string, &output_string, UTF8PROC_PGSQL_STRIP_OPTS 1.189 + ); 1.190 + PG_FREE_IF_COPY(input_string, 0); 1.191 + utf8proc_pgsql_utf8map_errchk(result, output_string); 1.192 + if (result >= 0) { 1.193 + PG_RETURN_TEXT_P(output_string); 1.194 + } else { 1.195 + PG_RETURN_NULL(); 1.196 + } 1.197 +} 1.198 +