utf8proc
changeset 3:4ee0d5f54af1 v1.0
Version 1.0
- added the LUMP option, which lumps certain characters together (see lump.txt) (also used for the PostgreSQL "unifold" function)
- added the STRIPMARK option, which strips marking characters (or marks of composed characters)
- deprecated ruby method String#char_ary in favour of String#utf8chars
author | jbe |
---|---|
date | Sun Sep 17 12:00:00 2006 +0200 (2006-09-17) |
parents | aaad485d5335 |
children | a49e32490aac |
files | Changelog lump.txt pgsql/utf8proc_pgsql.c ruby/utf8proc.rb ruby/utf8proc_native.c utf8proc.c utf8proc.h |
line diff
1.1 --- a/Changelog Fri Aug 04 12:00:00 2006 +0200 1.2 +++ b/Changelog Sun Sep 17 12:00:00 2006 +0200 1.3 @@ -31,3 +31,12 @@ 1.4 1.5 Release of version 0.3 1.6 1.7 +2006-09-17: 1.8 +- added the LUMP option, which lumps certain characters together 1.9 + (see lump.txt) (also used for the PostgreSQL "unifold" function) 1.10 +- added the STRIPMARK option, which strips marking characters 1.11 + (or marks of composed characters) 1.12 +- deprecated ruby method String#char_ary in favour of String#utf8chars 1.13 + 1.14 +Release of version 1.0 1.15 +
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/lump.txt Sun Sep 17 12:00:00 2006 +0200 2.3 @@ -0,0 +1,26 @@ 2.4 +U+0020 <-- all space characters (general category Zs) 2.5 +U+0027 ' <-- left/right single quotation mark U+2018..2019, 2.6 + modifier letter apostrophe U+02BC, 2.7 + modifier letter vertical line U+02C8 2.8 +U+002D - <-- all dash characters (general category Pd), 2.9 + minus U+2212 2.10 +U+002F / <-- fraction slash U+2044, 2.11 + division slash U+2215 2.12 +U+003A : <-- ratio U+2236 2.13 +U+003C < <-- single left-pointing angle quotation mark U+2039, 2.14 + left-pointing angle bracket U+2329, 2.15 + left angle bracket U+3008 2.16 +U+003E > <-- single right-pointing angle quotation mark U+203A, 2.17 + right-pointing angle bracket U+232A, 2.18 + right angle bracket U+3009 2.19 +U+005C \ <-- set minus U+2216 2.20 +U+005E ^ <-- modifier letter up arrowhead U+02C4, 2.21 + modifier letter circumflex accent U+02C6, 2.22 + caret U+2038, 2.23 + up arrowhead U+2303 2.24 +U+005F _ <-- all connector characters (general category Pc), 2.25 + modifier letter low macron U+02CD 2.26 +U+0060 ` <-- modifier letter grave accent U+02CB 2.27 +U+007C | <-- divides U+2223 2.28 +U+007E ~ <-- tilde operator U+223C 2.29 +
3.1 --- a/pgsql/utf8proc_pgsql.c Fri Aug 04 12:00:00 2006 +0200 3.2 +++ b/pgsql/utf8proc_pgsql.c Sun Sep 17 12:00:00 2006 +0200 3.3 @@ -33,8 +33,8 @@ 3.4 3.5 /* 3.6 * File name: pgsql/utf8proc_pgsql.c 3.7 - * Version: 0.3 3.8 - * Last changed: 2006-08-04 3.9 + * Version: 1.0 3.10 + * Last changed: 2006-09-17 3.11 * 3.12 * Description: 3.13 * PostgreSQL extension to provide a function 'unifold', which can be used 3.14 @@ -53,7 +53,7 @@ 3.15 3.16 #define UTF8PROC_PGSQL_OPTS ( UTF8PROC_REJECTNA | UTF8PROC_COMPAT | \ 3.17 UTF8PROC_COMPOSE | UTF8PROC_STABLE | UTF8PROC_IGNORE | UTF8PROC_STRIPCC | \ 3.18 - UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD ) 3.19 + UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD | UTF8PROC_LUMP ) 3.20 3.21 PG_FUNCTION_INFO_V1(utf8proc_pgsql_unifold); 3.22 Datum utf8proc_pgsql_unifold(PG_FUNCTION_ARGS) {
4.1 --- a/ruby/utf8proc.rb Fri Aug 04 12:00:00 2006 +0200 4.2 +++ b/ruby/utf8proc.rb Sun Sep 17 12:00:00 2006 +0200 4.3 @@ -33,8 +33,8 @@ 4.4 4.5 ## 4.6 # File name: ruby/utf8proc.rb 4.7 - # Version: 0.3 4.8 - # Last changed: 2006-08-04 4.9 + # Version: 1.0 4.10 + # Last changed: 2006-09-17 4.11 # 4.12 # Description: 4.13 # Part of the ruby wrapper for libutf8proc, which is written in ruby. 4.14 @@ -82,10 +82,14 @@ 4.15 def utf8nfkd!; utf8map!(:stable, :decompose, :compat); end 4.16 def utf8nfkc; utf8map( :stable, :compose, :compat); end 4.17 def utf8nfkc!; utf8map!(:stable, :compose, :compat); end 4.18 + def utf8chars 4.19 + result = self.utf8map(:charbound).split("\377") 4.20 + result.shift if result.first.empty? 4.21 + result 4.22 + end 4.23 def char_ary 4.24 - char_ary = self.utf8map(:charbound).split("\377") 4.25 - char_ary.shift if char_ary.first == '' 4.26 - char_ary 4.27 + # depecated, use String#utf8chars instead 4.28 + utf8chars 4.29 end 4.30 end 4.31
5.1 --- a/ruby/utf8proc_native.c Fri Aug 04 12:00:00 2006 +0200 5.2 +++ b/ruby/utf8proc_native.c Sun Sep 17 12:00:00 2006 +0200 5.3 @@ -33,8 +33,8 @@ 5.4 5.5 /* 5.6 * File name: ruby/utf8proc_native.c 5.7 - * Version: 0.3 5.8 - * Last changed: 2006-08-04 5.9 + * Version: 1.0 5.10 + * Last changed: 2006-09-17 5.11 * 5.12 * Description: 5.13 * Native part of the ruby wrapper for libutf8proc. 5.14 @@ -65,6 +65,7 @@ 5.15 case UTF8PROC_ERROR_NOMEM: 5.16 excpt_class = rb_eNoMemError; break; 5.17 case UTF8PROC_ERROR_OVERFLOW: 5.18 + case UTF8PROC_ERROR_INVALIDOPTS: 5.19 excpt_class = rb_eArgError; break; 5.20 case UTF8PROC_ERROR_INVALIDUTF8: 5.21 excpt_class = utf8proc_ruby_eInvalidUtf8Error; break; 5.22 @@ -157,6 +158,8 @@ 5.23 register_utf8proc_option("stripcc", UTF8PROC_STRIPCC); 5.24 register_utf8proc_option("casefold", UTF8PROC_CASEFOLD); 5.25 register_utf8proc_option("charbound", UTF8PROC_CHARBOUND); 5.26 + register_utf8proc_option("lump", UTF8PROC_LUMP); 5.27 + register_utf8proc_option("stripmark", UTF8PROC_STRIPMARK); 5.28 OBJ_FREEZE(utf8proc_ruby_options); 5.29 rb_define_const(utf8proc_ruby_module, "Options", utf8proc_ruby_options); 5.30 }
6.1 --- a/utf8proc.c Fri Aug 04 12:00:00 2006 +0200 6.2 +++ b/utf8proc.c Sun Sep 17 12:00:00 2006 +0200 6.3 @@ -42,8 +42,8 @@ 6.4 6.5 /* 6.6 * File name: utf8proc.c 6.7 - * Version: 0.3 6.8 - * Last changed: 2006-08-04 6.9 + * Version: 1.0 6.10 + * Last changed: 2006-09-17 6.11 * 6.12 * Description: 6.13 * Implementation of libutf8proc. 6.14 @@ -116,6 +116,8 @@ 6.15 return "Invalid UTF-8 string"; 6.16 case UTF8PROC_ERROR_NOTASSIGNED: 6.17 return "Unassigned Unicode code point found in UTF-8 string."; 6.18 + case UTF8PROC_ERROR_INVALIDOPTS: 6.19 + return "Invalid options for UTF-8 processing chosen."; 6.20 default: 6.21 return "An unknown error occured while processing UTF-8 data."; 6.22 } 6.23 @@ -197,59 +199,103 @@ 6.24 ); 6.25 } 6.26 6.27 +#define utf8proc_decompose_lump(replacement_uc) \ 6.28 + return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ 6.29 + options & ~UTF8PROC_LUMP, last_boundclass) 6.30 + 6.31 ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, 6.32 int options, int *last_boundclass) { 6.33 // ASSERT: uc >= 0 && uc < 0x110000 6.34 const utf8proc_property_t *property; 6.35 + utf8proc_propval_t category; 6.36 int32_t hangul_sindex; 6.37 property = utf8proc_get_property(uc); 6.38 + category = property->category; 6.39 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; 6.40 - if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && 6.41 - hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { 6.42 - int32_t hangul_tindex; 6.43 - if (bufsize >= 1) { 6.44 - dst[0] = UTF8PROC_HANGUL_LBASE + 6.45 - hangul_sindex / UTF8PROC_HANGUL_NCOUNT; 6.46 - if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + 6.47 - (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; 6.48 + if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { 6.49 + if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { 6.50 + int32_t hangul_tindex; 6.51 + if (bufsize >= 1) { 6.52 + dst[0] = UTF8PROC_HANGUL_LBASE + 6.53 + hangul_sindex / 
UTF8PROC_HANGUL_NCOUNT; 6.54 + if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + 6.55 + (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; 6.56 + } 6.57 + hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; 6.58 + if (!hangul_tindex) return 2; 6.59 + if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; 6.60 + return 3; 6.61 } 6.62 - hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; 6.63 - if (!hangul_tindex) return 2; 6.64 - if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; 6.65 - return 3; 6.66 - } else if ((options & UTF8PROC_REJECTNA) && !property->category) { 6.67 - return UTF8PROC_ERROR_NOTASSIGNED; 6.68 - } else if ((options & UTF8PROC_IGNORE) && property->ignorable) { 6.69 - return 0; 6.70 - } else if ((options & UTF8PROC_CASEFOLD) && property->casefold_mapping) { 6.71 - const int32_t *casefold_entry; 6.72 - ssize_t written = 0; 6.73 - for (casefold_entry = property->casefold_mapping; 6.74 - *casefold_entry >= 0; casefold_entry++) { 6.75 - written += utf8proc_decompose_char(*casefold_entry, dst+written, 6.76 - (bufsize > written) ? 
(bufsize - written) : 0, options, 6.77 + } 6.78 + if (options & UTF8PROC_REJECTNA) { 6.79 + if (!category) return UTF8PROC_ERROR_NOTASSIGNED; 6.80 + } 6.81 + if (options & UTF8PROC_IGNORE) { 6.82 + if (property->ignorable) return 0; 6.83 + } 6.84 + if (options & UTF8PROC_LUMP) { 6.85 + if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); 6.86 + if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) 6.87 + utf8proc_decompose_lump(0x0027); 6.88 + if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212) 6.89 + utf8proc_decompose_lump(0x002D); 6.90 + if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F); 6.91 + if (uc == 0x2236) utf8proc_decompose_lump(0x003A); 6.92 + if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008) 6.93 + utf8proc_decompose_lump(0x003C); 6.94 + if (uc == 0x203A || uc == 0x232A || uc == 0x3009) 6.95 + utf8proc_decompose_lump(0x003E); 6.96 + if (uc == 0x2216) utf8proc_decompose_lump(0x005C); 6.97 + if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303) 6.98 + utf8proc_decompose_lump(0x005E); 6.99 + if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD) 6.100 + utf8proc_decompose_lump(0x005F); 6.101 + if (uc == 0x02CB) utf8proc_decompose_lump(0x0060); 6.102 + if (uc == 0x2223) utf8proc_decompose_lump(0x007C); 6.103 + if (uc == 0x223C) utf8proc_decompose_lump(0x007E); 6.104 + if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) { 6.105 + if (category == UTF8PROC_CATEGORY_ZL || 6.106 + category == UTF8PROC_CATEGORY_ZP) 6.107 + utf8proc_decompose_lump(0x000A); 6.108 + } 6.109 + } 6.110 + if (options & UTF8PROC_STRIPMARK) { 6.111 + if (category == UTF8PROC_CATEGORY_MN || 6.112 + category == UTF8PROC_CATEGORY_MC || 6.113 + category == UTF8PROC_CATEGORY_ME) return 0; 6.114 + } 6.115 + if (options & UTF8PROC_CASEFOLD) { 6.116 + if (property->casefold_mapping) { 6.117 + const int32_t *casefold_entry; 6.118 + ssize_t written = 0; 6.119 + for (casefold_entry = property->casefold_mapping; 6.120 + 
*casefold_entry >= 0; casefold_entry++) { 6.121 + written += utf8proc_decompose_char(*casefold_entry, dst+written, 6.122 + (bufsize > written) ? (bufsize - written) : 0, options, 6.123 + last_boundclass); 6.124 + if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 6.125 + } 6.126 + return written; 6.127 + } 6.128 + } 6.129 + if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { 6.130 + if (property->decomp_mapping && 6.131 + (!property->decomp_type || (options & UTF8PROC_COMPAT))) { 6.132 + const int32_t *decomp_entry; 6.133 + ssize_t written = 0; 6.134 + for (decomp_entry = property->decomp_mapping; 6.135 + *decomp_entry >= 0; decomp_entry++) { 6.136 + written += utf8proc_decompose_char(*decomp_entry, dst+written, 6.137 + (bufsize > written) ? (bufsize - written) : 0, options, 6.138 last_boundclass); 6.139 - if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 6.140 + if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 6.141 + } 6.142 + return written; 6.143 } 6.144 - return written; 6.145 - } else if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && 6.146 - property->decomp_mapping && 6.147 - (!property->decomp_type || (options & UTF8PROC_COMPAT))) { 6.148 - const int32_t *decomp_entry; 6.149 - ssize_t written = 0; 6.150 - for (decomp_entry = property->decomp_mapping; 6.151 - *decomp_entry >= 0; decomp_entry++) { 6.152 - written += utf8proc_decompose_char(*decomp_entry, dst+written, 6.153 - (bufsize > written) ? (bufsize - written) : 0, options, 6.154 - last_boundclass); 6.155 - if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 6.156 - } 6.157 - return written; 6.158 - } else if (options & UTF8PROC_CHARBOUND) { 6.159 + } 6.160 + if (options & UTF8PROC_CHARBOUND) { 6.161 bool boundary; 6.162 int tbc, lbc; 6.163 - int category; 6.164 - category = property->category; 6.165 tbc = 6.166 (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR : 6.167 (uc == 0x000A) ? 
UTF8PROC_BOUNDCLASS_LF : 6.168 @@ -306,6 +352,11 @@ 6.169 int32_t *buffer, ssize_t bufsize, int options) { 6.170 // strlen will be ignored, if UTF8PROC_NULLTERM is set in options 6.171 ssize_t wpos = 0; 6.172 + if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) 6.173 + return UTF8PROC_ERROR_INVALIDOPTS; 6.174 + if ((options & UTF8PROC_STRIPMARK) && 6.175 + !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) 6.176 + return UTF8PROC_ERROR_INVALIDOPTS; 6.177 { 6.178 int32_t uc; 6.179 ssize_t rpos = 0; 6.180 @@ -395,7 +446,7 @@ 6.181 int32_t *starter = NULL; 6.182 int32_t current_char; 6.183 const utf8proc_property_t *starter_property = NULL, *current_property; 6.184 - int16_t max_combining_class = -1; 6.185 + utf8proc_propval_t max_combining_class = -1; 6.186 ssize_t rpos; 6.187 ssize_t wpos = 0; 6.188 int32_t composition;
7.1 --- a/utf8proc.h Fri Aug 04 12:00:00 2006 +0200 7.2 +++ b/utf8proc.h Sun Sep 17 12:00:00 2006 +0200 7.3 @@ -42,8 +42,8 @@ 7.4 7.5 /* 7.6 * File name: utf8proc.h 7.7 - * Version: 0.3 7.8 - * Last changed: 2006-08-04 7.9 + * Version: 1.0 7.10 + * Last changed: 2006-09-17 7.11 * 7.12 * Description: 7.13 * Header files for libutf8proc, which is a mapping tool for UTF-8 strings 7.14 @@ -52,8 +52,12 @@ 7.15 * - replacing compatibility characters with their equivalents 7.16 * - stripping of "default ignorable characters" 7.17 * like SOFT-HYPHEN or ZERO-WIDTH-SPACE 7.18 + * - folding of certain characters for string comparison 7.19 + * (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-") 7.20 + * (see "LUMP" option) 7.21 * - optional rejection of strings containing non-assigned code points 7.22 * - stripping of control characters 7.23 + * - stripping of character marks (accents, etc.) 7.24 * - transformation of LF, CRLF, CR and NEL to line-feed (LF) 7.25 * or to the unicode chararacters for paragraph separation (PS) 7.26 * or line separation (LS). 7.27 @@ -91,6 +95,8 @@ 7.28 #define UTF8PROC_STRIPCC (1<<9) 7.29 #define UTF8PROC_CASEFOLD (1<<10) 7.30 #define UTF8PROC_CHARBOUND (1<<11) 7.31 +#define UTF8PROC_LUMP (1<<12) 7.32 +#define UTF8PROC_STRIPMARK (1<<13) 7.33 /* 7.34 * Flags being regarded by several functions in the library: 7.35 * NULLTERM: The given UTF-8 input is NULL terminated. 7.36 @@ -118,12 +124,21 @@ 7.37 * case-insensitive string comparison. 7.38 * CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which is 7.39 * representing a single grapheme cluster (a single character). 7.40 + * LUMP: Lumps certain characters together 7.41 + * (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-"). 7.42 + * (See lump.txt for details.) 7.43 + * If NLF2LF is set, this includes a transformation of paragraph 7.44 + * and line separators to ASCII line-feed (LF). 7.45 + * STRIPMARK: Strips all character markings 7.46 + * (non-spacing, spacing and enclosing) (i.e. 
accents) 7.47 + * NOTE: this option works only with COMPOSE or DECOMPOSE 7.48 */ 7.49 7.50 #define UTF8PROC_ERROR_NOMEM -1 7.51 #define UTF8PROC_ERROR_OVERFLOW -2 7.52 #define UTF8PROC_ERROR_INVALIDUTF8 -3 7.53 #define UTF8PROC_ERROR_NOTASSIGNED -4 7.54 +#define UTF8PROC_ERROR_INVALIDOPTS -5 7.55 /* 7.56 * Error codes being returned by almost all functions: 7.57 * ERROR_NOMEM: Memory could not be allocated. 7.58 @@ -131,13 +146,15 @@ 7.59 * ERROR_INVALIDUTF8: The given string is not a legal UTF-8 string. 7.60 * ERROR_NOTASSIGNED: The REJECTNA flag was set, 7.61 * and an unassigned code point was found. 7.62 + * ERROR_INVALIDOPTS: Invalid options have been used. 7.63 */ 7.64 7.65 +typedef int16_t utf8proc_propval_t; 7.66 typedef struct utf8proc_property_struct { 7.67 - int16_t category; 7.68 - int16_t combining_class; 7.69 - int16_t bidi_class; 7.70 - int16_t decomp_type; 7.71 + utf8proc_propval_t category; 7.72 + utf8proc_propval_t combining_class; 7.73 + utf8proc_propval_t bidi_class; 7.74 + utf8proc_propval_t decomp_type; 7.75 const int32_t *decomp_mapping; 7.76 const unsigned bidi_mirrored:1; 7.77 const int32_t uppercase_mapping; 7.78 @@ -267,6 +284,8 @@ 7.79 * COMPAT: replace certain characters with their 7.80 * compatibility decomposition 7.81 * CHARBOUND: Inserts 0xFF bytes before each grapheme cluster 7.82 + * LUMP: lumps certain different characters together 7.83 + * STRIPMARK: removes all character marks 7.84 * The pointer 'last_boundclass' has to point to an integer variable which is 7.85 * storing the last character boundary class, if the CHARBOUND option is 7.86 * used.