# HG changeset patch # User jbe # Date 1158487200 -7200 # Node ID 4ee0d5f54af1c55af281ec6cc73ffb7017c81d98 # Parent aaad485d5335bfff54db6579a8cf11cda89174b8 Version 1.0 - added the LUMP option, which lumps certain characters together (see lump.txt) (also used for the PostgreSQL "unifold" function) - added the STRIPMARK option, which strips marking characters (or marks of composed characters) - deprecated ruby method String#char_ary in favour of String#utf8chars diff -r aaad485d5335 -r 4ee0d5f54af1 Changelog --- a/Changelog Fri Aug 04 12:00:00 2006 +0200 +++ b/Changelog Sun Sep 17 12:00:00 2006 +0200 @@ -31,3 +31,12 @@ Release of version 0.3 +2006-09-17: +- added the LUMP option, which lumps certain characters together + (see lump.txt) (also used for the PostgreSQL "unifold" function) +- added the STRIPMARK option, which strips marking characters + (or marks of composed characters) +- deprecated ruby method String#char_ary in favour of String#utf8chars + +Release of version 1.0 + diff -r aaad485d5335 -r 4ee0d5f54af1 lump.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lump.txt Sun Sep 17 12:00:00 2006 +0200 @@ -0,0 +1,26 @@ +U+0020 <-- all space characters (general category Zs) +U+0027 ' <-- left/right single quotation mark U+2018..2019, + modifier letter apostrophe U+02BC, + modifier letter vertical line U+02C8 +U+002D - <-- all dash characters (general category Pd), + minus U+2212 +U+002F / <-- fraction slash U+2044, + division slash U+2215 +U+003A : <-- ratio U+2236 +U+003C < <-- single left-pointing angle quotation mark U+2039, + left-pointing angle bracket U+2329, + left angle bracket U+3008 +U+003E > <-- single right-pointing angle quotation mark U+203A, + right-pointing angle bracket U+232A, + right angle bracket U+3009 +U+005C \ <-- set minus U+2216 +U+005E ^ <-- modifier letter up arrowhead U+02C4, + modifier letter circumflex accent U+02C6, + caret U+2038, + up arrowhead U+2303 +U+005F _ <-- all connector characters (general category Pc), + modifier 
letter low macron U+02CD +U+0060 ` <-- modifier letter grave accent U+02CB +U+007C | <-- divides U+2223 +U+007E ~ <-- tilde operator U+223C + diff -r aaad485d5335 -r 4ee0d5f54af1 pgsql/utf8proc_pgsql.c --- a/pgsql/utf8proc_pgsql.c Fri Aug 04 12:00:00 2006 +0200 +++ b/pgsql/utf8proc_pgsql.c Sun Sep 17 12:00:00 2006 +0200 @@ -33,8 +33,8 @@ /* * File name: pgsql/utf8proc_pgsql.c - * Version: 0.3 - * Last changed: 2006-08-04 + * Version: 1.0 + * Last changed: 2006-09-17 * * Description: * PostgreSQL extension to provide a function 'unifold', which can be used @@ -53,7 +53,7 @@ #define UTF8PROC_PGSQL_OPTS ( UTF8PROC_REJECTNA | UTF8PROC_COMPAT | \ UTF8PROC_COMPOSE | UTF8PROC_STABLE | UTF8PROC_IGNORE | UTF8PROC_STRIPCC | \ - UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD ) + UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD | UTF8PROC_LUMP ) PG_FUNCTION_INFO_V1(utf8proc_pgsql_unifold); Datum utf8proc_pgsql_unifold(PG_FUNCTION_ARGS) { diff -r aaad485d5335 -r 4ee0d5f54af1 ruby/utf8proc.rb --- a/ruby/utf8proc.rb Fri Aug 04 12:00:00 2006 +0200 +++ b/ruby/utf8proc.rb Sun Sep 17 12:00:00 2006 +0200 @@ -33,8 +33,8 @@ ## # File name: ruby/utf8proc.rb - # Version: 0.3 - # Last changed: 2006-08-04 + # Version: 1.0 + # Last changed: 2006-09-17 # # Description: # Part of the ruby wrapper for libutf8proc, which is written in ruby. @@ -82,10 +82,14 @@ def utf8nfkd!; utf8map!(:stable, :decompose, :compat); end def utf8nfkc; utf8map( :stable, :compose, :compat); end def utf8nfkc!; utf8map!(:stable, :compose, :compat); end + def utf8chars + result = self.utf8map(:charbound).split("\377") + result.shift if result.first.empty? 
+ result + end def char_ary - char_ary = self.utf8map(:charbound).split("\377") - char_ary.shift if char_ary.first == '' - char_ary + # deprecated, use String#utf8chars instead + utf8chars end end diff -r aaad485d5335 -r 4ee0d5f54af1 ruby/utf8proc_native.c --- a/ruby/utf8proc_native.c Fri Aug 04 12:00:00 2006 +0200 +++ b/ruby/utf8proc_native.c Sun Sep 17 12:00:00 2006 +0200 @@ -33,8 +33,8 @@ /* * File name: ruby/utf8proc_native.c - * Version: 0.3 - * Last changed: 2006-08-04 + * Version: 1.0 + * Last changed: 2006-09-17 * * Description: * Native part of the ruby wrapper for libutf8proc. @@ -65,6 +65,7 @@ case UTF8PROC_ERROR_NOMEM: excpt_class = rb_eNoMemError; break; case UTF8PROC_ERROR_OVERFLOW: + case UTF8PROC_ERROR_INVALIDOPTS: excpt_class = rb_eArgError; break; case UTF8PROC_ERROR_INVALIDUTF8: excpt_class = utf8proc_ruby_eInvalidUtf8Error; break; @@ -157,6 +158,8 @@ register_utf8proc_option("stripcc", UTF8PROC_STRIPCC); register_utf8proc_option("casefold", UTF8PROC_CASEFOLD); register_utf8proc_option("charbound", UTF8PROC_CHARBOUND); + register_utf8proc_option("lump", UTF8PROC_LUMP); + register_utf8proc_option("stripmark", UTF8PROC_STRIPMARK); OBJ_FREEZE(utf8proc_ruby_options); rb_define_const(utf8proc_ruby_module, "Options", utf8proc_ruby_options); } diff -r aaad485d5335 -r 4ee0d5f54af1 utf8proc.c --- a/utf8proc.c Fri Aug 04 12:00:00 2006 +0200 +++ b/utf8proc.c Sun Sep 17 12:00:00 2006 +0200 @@ -42,8 +42,8 @@ /* * File name: utf8proc.c - * Version: 0.3 - * Last changed: 2006-08-04 + * Version: 1.0 + * Last changed: 2006-09-17 * * Description: * Implementation of libutf8proc. 
@@ -116,6 +116,8 @@ return "Invalid UTF-8 string"; case UTF8PROC_ERROR_NOTASSIGNED: return "Unassigned Unicode code point found in UTF-8 string."; + case UTF8PROC_ERROR_INVALIDOPTS: + return "Invalid options for UTF-8 processing chosen."; default: return "An unknown error occured while processing UTF-8 data."; } @@ -197,59 +199,103 @@ ); } +#define utf8proc_decompose_lump(replacement_uc) \ + return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ + options & ~UTF8PROC_LUMP, last_boundclass) + ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, int options, int *last_boundclass) { // ASSERT: uc >= 0 && uc < 0x110000 const utf8proc_property_t *property; + utf8proc_propval_t category; int32_t hangul_sindex; property = utf8proc_get_property(uc); + category = property->category; hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; - if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && - hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { - int32_t hangul_tindex; - if (bufsize >= 1) { - dst[0] = UTF8PROC_HANGUL_LBASE + - hangul_sindex / UTF8PROC_HANGUL_NCOUNT; - if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + - (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; + if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { + if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { + int32_t hangul_tindex; + if (bufsize >= 1) { + dst[0] = UTF8PROC_HANGUL_LBASE + + hangul_sindex / UTF8PROC_HANGUL_NCOUNT; + if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + + (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; + } + hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; + if (!hangul_tindex) return 2; + if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; + return 3; } - hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; - if (!hangul_tindex) return 2; - if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; - return 3; - } else if ((options & UTF8PROC_REJECTNA) && 
!property->category) { - return UTF8PROC_ERROR_NOTASSIGNED; - } else if ((options & UTF8PROC_IGNORE) && property->ignorable) { - return 0; - } else if ((options & UTF8PROC_CASEFOLD) && property->casefold_mapping) { - const int32_t *casefold_entry; - ssize_t written = 0; - for (casefold_entry = property->casefold_mapping; - *casefold_entry >= 0; casefold_entry++) { - written += utf8proc_decompose_char(*casefold_entry, dst+written, - (bufsize > written) ? (bufsize - written) : 0, options, + } + if (options & UTF8PROC_REJECTNA) { + if (!category) return UTF8PROC_ERROR_NOTASSIGNED; + } + if (options & UTF8PROC_IGNORE) { + if (property->ignorable) return 0; + } + if (options & UTF8PROC_LUMP) { + if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); + if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) + utf8proc_decompose_lump(0x0027); + if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212) + utf8proc_decompose_lump(0x002D); + if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F); + if (uc == 0x2236) utf8proc_decompose_lump(0x003A); + if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008) + utf8proc_decompose_lump(0x003C); + if (uc == 0x203A || uc == 0x232A || uc == 0x3009) + utf8proc_decompose_lump(0x003E); + if (uc == 0x2216) utf8proc_decompose_lump(0x005C); + if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303) + utf8proc_decompose_lump(0x005E); + if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD) + utf8proc_decompose_lump(0x005F); + if (uc == 0x02CB) utf8proc_decompose_lump(0x0060); + if (uc == 0x2223) utf8proc_decompose_lump(0x007C); + if (uc == 0x223C) utf8proc_decompose_lump(0x007E); + if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) { + if (category == UTF8PROC_CATEGORY_ZL || + category == UTF8PROC_CATEGORY_ZP) + utf8proc_decompose_lump(0x000A); + } + } + if (options & UTF8PROC_STRIPMARK) { + if (category == UTF8PROC_CATEGORY_MN || + category == UTF8PROC_CATEGORY_MC || + category == 
UTF8PROC_CATEGORY_ME) return 0; + } + if (options & UTF8PROC_CASEFOLD) { + if (property->casefold_mapping) { + const int32_t *casefold_entry; + ssize_t written = 0; + for (casefold_entry = property->casefold_mapping; + *casefold_entry >= 0; casefold_entry++) { + written += utf8proc_decompose_char(*casefold_entry, dst+written, + (bufsize > written) ? (bufsize - written) : 0, options, + last_boundclass); + if (written < 0) return UTF8PROC_ERROR_OVERFLOW; + } + return written; + } + } + if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { + if (property->decomp_mapping && + (!property->decomp_type || (options & UTF8PROC_COMPAT))) { + const int32_t *decomp_entry; + ssize_t written = 0; + for (decomp_entry = property->decomp_mapping; + *decomp_entry >= 0; decomp_entry++) { + written += utf8proc_decompose_char(*decomp_entry, dst+written, + (bufsize > written) ? (bufsize - written) : 0, options, last_boundclass); - if (written < 0) return UTF8PROC_ERROR_OVERFLOW; + if (written < 0) return UTF8PROC_ERROR_OVERFLOW; + } + return written; } - return written; - } else if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && - property->decomp_mapping && - (!property->decomp_type || (options & UTF8PROC_COMPAT))) { - const int32_t *decomp_entry; - ssize_t written = 0; - for (decomp_entry = property->decomp_mapping; - *decomp_entry >= 0; decomp_entry++) { - written += utf8proc_decompose_char(*decomp_entry, dst+written, - (bufsize > written) ? (bufsize - written) : 0, options, - last_boundclass); - if (written < 0) return UTF8PROC_ERROR_OVERFLOW; - } - return written; - } else if (options & UTF8PROC_CHARBOUND) { + } + if (options & UTF8PROC_CHARBOUND) { bool boundary; int tbc, lbc; - int category; - category = property->category; tbc = (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR : (uc == 0x000A) ? 
UTF8PROC_BOUNDCLASS_LF : @@ -306,6 +352,11 @@ int32_t *buffer, ssize_t bufsize, int options) { // strlen will be ignored, if UTF8PROC_NULLTERM is set in options ssize_t wpos = 0; + if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) + return UTF8PROC_ERROR_INVALIDOPTS; + if ((options & UTF8PROC_STRIPMARK) && + !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) + return UTF8PROC_ERROR_INVALIDOPTS; { int32_t uc; ssize_t rpos = 0; @@ -395,7 +446,7 @@ int32_t *starter = NULL; int32_t current_char; const utf8proc_property_t *starter_property = NULL, *current_property; - int16_t max_combining_class = -1; + utf8proc_propval_t max_combining_class = -1; ssize_t rpos; ssize_t wpos = 0; int32_t composition; diff -r aaad485d5335 -r 4ee0d5f54af1 utf8proc.h --- a/utf8proc.h Fri Aug 04 12:00:00 2006 +0200 +++ b/utf8proc.h Sun Sep 17 12:00:00 2006 +0200 @@ -42,8 +42,8 @@ /* * File name: utf8proc.h - * Version: 0.3 - * Last changed: 2006-08-04 + * Version: 1.0 + * Last changed: 2006-09-17 * * Description: * Header files for libutf8proc, which is a mapping tool for UTF-8 strings @@ -52,8 +52,12 @@ * - replacing compatibility characters with their equivalents * - stripping of "default ignorable characters" * like SOFT-HYPHEN or ZERO-WIDTH-SPACE + * - folding of certain characters for string comparison + * (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-") + * (see "LUMP" option) * - optional rejection of strings containing non-assigned code points * - stripping of control characters + * - stripping of character marks (accents, etc.) * - transformation of LF, CRLF, CR and NEL to line-feed (LF) * or to the unicode chararacters for paragraph separation (PS) * or line separation (LS). 
@@ -91,6 +95,8 @@ #define UTF8PROC_STRIPCC (1<<9) #define UTF8PROC_CASEFOLD (1<<10) #define UTF8PROC_CHARBOUND (1<<11) +#define UTF8PROC_LUMP (1<<12) +#define UTF8PROC_STRIPMARK (1<<13) /* * Flags being regarded by several functions in the library: * NULLTERM: The given UTF-8 input is NULL terminated. @@ -118,12 +124,21 @@ * case-insensitive string comparison. * CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which is * representing a single grapheme cluster (a single character). + * LUMP: Lumps certain characters together + * (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-"). + * (See lump.txt for details.) + * If NLF2LF is set, this includes a transformation of paragraph + * and line separators to ASCII line-feed (LF). + * STRIPMARK: Strips all character markings + * (non-spacing, spacing and enclosing) (i.e. accents) + * NOTE: this option works only with COMPOSE or DECOMPOSE */ #define UTF8PROC_ERROR_NOMEM -1 #define UTF8PROC_ERROR_OVERFLOW -2 #define UTF8PROC_ERROR_INVALIDUTF8 -3 #define UTF8PROC_ERROR_NOTASSIGNED -4 +#define UTF8PROC_ERROR_INVALIDOPTS -5 /* * Error codes being returned by almost all functions: * ERROR_NOMEM: Memory could not be allocated. @@ -131,13 +146,15 @@ * ERROR_INVALIDUTF8: The given string is not a legal UTF-8 string. * ERROR_NOTASSIGNED: The REJECTNA flag was set, * and an unassigned code point was found. + * ERROR_INVALIDOPTS: Invalid options have been used. 
*/ +typedef int16_t utf8proc_propval_t; typedef struct utf8proc_property_struct { - int16_t category; - int16_t combining_class; - int16_t bidi_class; - int16_t decomp_type; + utf8proc_propval_t category; + utf8proc_propval_t combining_class; + utf8proc_propval_t bidi_class; + utf8proc_propval_t decomp_type; const int32_t *decomp_mapping; const unsigned bidi_mirrored:1; const int32_t uppercase_mapping; @@ -267,6 +284,8 @@ * COMPAT: replace certain characters with their * compatibility decomposition * CHARBOUND: Inserts 0xFF bytes before each grapheme cluster + * LUMP: lumps certain different characters together + * STRIPMARK: removes all character marks * The pointer 'last_boundclass' has to point to an integer variable which is * storing the last character boundary class, if the CHARBOUND option is * used.