# HG changeset patch
# User jbe
# Date 1158487200 -7200
# Node ID 4ee0d5f54af1c55af281ec6cc73ffb7017c81d98
# Parent  aaad485d5335bfff54db6579a8cf11cda89174b8
Version 1.0

- added the LUMP option, which lumps certain characters together (see lump.txt) (also used for the PostgreSQL "unifold" function)
- added the STRIPMARK option, which strips marking characters (or marks of composed characters)
- deprecated ruby method String#char_ary in favour of String#utf8chars

diff -r aaad485d5335 -r 4ee0d5f54af1 Changelog
--- a/Changelog	Fri Aug 04 12:00:00 2006 +0200
+++ b/Changelog	Sun Sep 17 12:00:00 2006 +0200
@@ -31,3 +31,12 @@
 
 Release of version 0.3
 
+2006-09-17:
+- added the LUMP option, which lumps certain characters together
+  (see lump.txt) (also used for the PostgreSQL "unifold" function)
+- added the STRIPMARK option, which strips marking characters
+  (or marks of composed characters)
+- deprecated ruby method String#char_ary in favour of String#utf8chars
+
+Release of version 1.0
+
diff -r aaad485d5335 -r 4ee0d5f54af1 lump.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lump.txt	Sun Sep 17 12:00:00 2006 +0200
@@ -0,0 +1,26 @@
+U+0020      <-- all space characters (general category Zs)
+U+0027  '   <-- left/right single quotation mark U+2018..2019,
+                modifier letter apostrophe U+02BC,
+                modifier letter vertical line U+02C8
+U+002D  -   <-- all dash characters (general category Pd),
+                minus U+2212
+U+002F  /   <-- fraction slash U+2044,
+                division slash U+2215
+U+003A  :   <-- ratio U+2236
+U+003C  <   <-- single left-pointing angle quotation mark U+2039,
+                left-pointing angle bracket U+2329,
+                left angle bracket U+3008
+U+003E  >   <-- single right-pointing angle quotation mark U+203A,
+                right-pointing angle bracket U+232A,
+                right angle bracket U+3009
+U+005C  \   <-- set minus U+2216
+U+005E  ^   <-- modifier letter up arrowhead U+02C4,
+                modifier letter circumflex accent U+02C6,
+                caret U+2038,
+                up arrowhead U+2303
+U+005F  _   <-- all connector characters (general category Pc),
+                modifier letter low macron U+02CD
+U+0060  `   <-- modifier letter grave accent U+02CB
+U+007C  |   <-- divides U+2223
+U+007E  ~   <-- tilde operator U+223C
+
diff -r aaad485d5335 -r 4ee0d5f54af1 pgsql/utf8proc_pgsql.c
--- a/pgsql/utf8proc_pgsql.c	Fri Aug 04 12:00:00 2006 +0200
+++ b/pgsql/utf8proc_pgsql.c	Sun Sep 17 12:00:00 2006 +0200
@@ -33,8 +33,8 @@
 
 /*
  *  File name:    pgsql/utf8proc_pgsql.c
- *  Version:      0.3
- *  Last changed: 2006-08-04
+ *  Version:      1.0
+ *  Last changed: 2006-09-17
  *
  *  Description:
  *  PostgreSQL extension to provide a function 'unifold', which can be used
@@ -53,7 +53,7 @@
 
 #define UTF8PROC_PGSQL_OPTS ( UTF8PROC_REJECTNA | UTF8PROC_COMPAT | \
   UTF8PROC_COMPOSE | UTF8PROC_STABLE | UTF8PROC_IGNORE | UTF8PROC_STRIPCC | \
-  UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD )
+  UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD | UTF8PROC_LUMP )
 
 PG_FUNCTION_INFO_V1(utf8proc_pgsql_unifold);
 Datum utf8proc_pgsql_unifold(PG_FUNCTION_ARGS) {
diff -r aaad485d5335 -r 4ee0d5f54af1 ruby/utf8proc.rb
--- a/ruby/utf8proc.rb	Fri Aug 04 12:00:00 2006 +0200
+++ b/ruby/utf8proc.rb	Sun Sep 17 12:00:00 2006 +0200
@@ -33,8 +33,8 @@
 
 ##
  #  File name:    ruby/utf8proc.rb
- #  Version:      0.3
- #  Last changed: 2006-08-04
+ #  Version:      1.0
+ #  Last changed: 2006-09-17
  #
  #  Description:
  #  Part of the ruby wrapper for libutf8proc, which is written in ruby.
@@ -82,10 +82,14 @@
     def utf8nfkd!; utf8map!(:stable, :decompose, :compat); end
     def utf8nfkc;  utf8map( :stable, :compose, :compat); end
     def utf8nfkc!; utf8map!(:stable, :compose, :compat); end
+    def utf8chars # array of grapheme clusters, split at the 0xFF marks that :charbound inserts
+      result = self.utf8map(:charbound).split("\377")
+      result.shift if result.first == '' # nil-safe: an empty string splits to [], so first is nil
+      result
+    end
     def char_ary
-      char_ary = self.utf8map(:charbound).split("\377")
-      char_ary.shift if char_ary.first == ''
-      char_ary
+      # deprecated, use String#utf8chars instead
+      utf8chars
     end
   end
 
diff -r aaad485d5335 -r 4ee0d5f54af1 ruby/utf8proc_native.c
--- a/ruby/utf8proc_native.c	Fri Aug 04 12:00:00 2006 +0200
+++ b/ruby/utf8proc_native.c	Sun Sep 17 12:00:00 2006 +0200
@@ -33,8 +33,8 @@
 
 /*
  *  File name:    ruby/utf8proc_native.c
- *  Version:      0.3
- *  Last changed: 2006-08-04
+ *  Version:      1.0
+ *  Last changed: 2006-09-17
  *
  *  Description:
  *  Native part of the ruby wrapper for libutf8proc.
@@ -65,6 +65,7 @@
     case UTF8PROC_ERROR_NOMEM:
     excpt_class = rb_eNoMemError; break;
     case UTF8PROC_ERROR_OVERFLOW:
+    case UTF8PROC_ERROR_INVALIDOPTS:
     excpt_class = rb_eArgError; break;
     case UTF8PROC_ERROR_INVALIDUTF8:
     excpt_class = utf8proc_ruby_eInvalidUtf8Error; break;
@@ -157,6 +158,8 @@
   register_utf8proc_option("stripcc",   UTF8PROC_STRIPCC);
   register_utf8proc_option("casefold",  UTF8PROC_CASEFOLD);
   register_utf8proc_option("charbound", UTF8PROC_CHARBOUND);
+  register_utf8proc_option("lump",      UTF8PROC_LUMP);
+  register_utf8proc_option("stripmark", UTF8PROC_STRIPMARK);
   OBJ_FREEZE(utf8proc_ruby_options);
   rb_define_const(utf8proc_ruby_module, "Options", utf8proc_ruby_options);
 }
diff -r aaad485d5335 -r 4ee0d5f54af1 utf8proc.c
--- a/utf8proc.c	Fri Aug 04 12:00:00 2006 +0200
+++ b/utf8proc.c	Sun Sep 17 12:00:00 2006 +0200
@@ -42,8 +42,8 @@
 
 /*
  *  File name:    utf8proc.c
- *  Version:      0.3
- *  Last changed: 2006-08-04
+ *  Version:      1.0
+ *  Last changed: 2006-09-17
  *
  *  Description:
  *  Implementation of libutf8proc.
@@ -116,6 +116,8 @@
     return "Invalid UTF-8 string";
     case UTF8PROC_ERROR_NOTASSIGNED:
     return "Unassigned Unicode code point found in UTF-8 string.";
+    case UTF8PROC_ERROR_INVALIDOPTS:
+    return "Invalid options for UTF-8 processing chosen.";
     default:
     return "An unknown error occured while processing UTF-8 data.";
   }
@@ -197,59 +199,103 @@
   );
 }
 
+#define utf8proc_decompose_lump(replacement_uc) \
+  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
+  options & ~UTF8PROC_LUMP, last_boundclass)
+
 ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
     int options, int *last_boundclass) {
   // ASSERT: uc >= 0 && uc < 0x110000
   const utf8proc_property_t *property;
+  utf8proc_propval_t category;
   int32_t hangul_sindex;
   property = utf8proc_get_property(uc);
+  category = property->category;
   hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
-  if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) &&
-      hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
-    int32_t hangul_tindex;
-    if (bufsize >= 1) {
-      dst[0] = UTF8PROC_HANGUL_LBASE +
-        hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
-      if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
-        (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
+  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
+    if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
+      int32_t hangul_tindex;
+      if (bufsize >= 1) {
+        dst[0] = UTF8PROC_HANGUL_LBASE +
+          hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
+        if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
+          (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
+      }
+      hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
+      if (!hangul_tindex) return 2;
+      if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
+      return 3;
     }
-    hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
-    if (!hangul_tindex) return 2;
-    if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
-    return 3;
-  } else if ((options & UTF8PROC_REJECTNA) && !property->category) {
-    return UTF8PROC_ERROR_NOTASSIGNED;
-  } else if ((options & UTF8PROC_IGNORE) && property->ignorable) {
-    return 0;
-  } else if ((options & UTF8PROC_CASEFOLD) && property->casefold_mapping) {
-    const int32_t *casefold_entry;
-    ssize_t written = 0;
-    for (casefold_entry = property->casefold_mapping;
-        *casefold_entry >= 0; casefold_entry++) {
-      written += utf8proc_decompose_char(*casefold_entry, dst+written,
-        (bufsize > written) ? (bufsize - written) : 0, options,
+  }
+  if (options & UTF8PROC_REJECTNA) {
+    if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
+  }
+  if (options & UTF8PROC_IGNORE) {
+    if (property->ignorable) return 0;
+  }
+  if (options & UTF8PROC_LUMP) {
+    if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
+    if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
+      utf8proc_decompose_lump(0x0027);
+    if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
+      utf8proc_decompose_lump(0x002D);
+    if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
+    if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
+    if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
+      utf8proc_decompose_lump(0x003C);
+    if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
+      utf8proc_decompose_lump(0x003E);
+    if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
+    if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
+      utf8proc_decompose_lump(0x005E);
+    if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
+      utf8proc_decompose_lump(0x005F);
+    if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
+    if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
+    if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
+    if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
+      if (category == UTF8PROC_CATEGORY_ZL ||
+          category == UTF8PROC_CATEGORY_ZP)
+        utf8proc_decompose_lump(0x000A);
+    }
+  }
+  if (options & UTF8PROC_STRIPMARK) {
+    if (category == UTF8PROC_CATEGORY_MN ||
+      category == UTF8PROC_CATEGORY_MC ||
+      category == UTF8PROC_CATEGORY_ME) return 0;
+  }
+  if (options & UTF8PROC_CASEFOLD) {
+    if (property->casefold_mapping) {
+      const int32_t *casefold_entry;
+      ssize_t written = 0;
+      for (casefold_entry = property->casefold_mapping;
+          *casefold_entry >= 0; casefold_entry++) {
+        written += utf8proc_decompose_char(*casefold_entry, dst+written,
+          (bufsize > written) ? (bufsize - written) : 0, options,
+          last_boundclass);
+        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
+      }
+      return written;
+    }
+  }
+  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
+    if (property->decomp_mapping &&
+        (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
+      const int32_t *decomp_entry;
+      ssize_t written = 0;
+      for (decomp_entry = property->decomp_mapping;
+          *decomp_entry >= 0; decomp_entry++) {
+        written += utf8proc_decompose_char(*decomp_entry, dst+written,
+          (bufsize > written) ? (bufsize - written) : 0, options,
         last_boundclass);
-      if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
+        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
+      }
+      return written;
     }
-    return written;
-  } else if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) &&
-      property->decomp_mapping &&
-      (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
-    const int32_t *decomp_entry;
-    ssize_t written = 0;
-    for (decomp_entry = property->decomp_mapping;
-        *decomp_entry >= 0; decomp_entry++) {
-      written += utf8proc_decompose_char(*decomp_entry, dst+written,
-        (bufsize > written) ? (bufsize - written) : 0, options,
-        last_boundclass);
-      if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
-    }
-    return written;
-  } else if (options & UTF8PROC_CHARBOUND) {
+  }
+  if (options & UTF8PROC_CHARBOUND) {
     bool boundary;
     int tbc, lbc;
-    int category;
-    category = property->category;
     tbc =
       (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
       (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
@@ -306,6 +352,11 @@
     int32_t *buffer, ssize_t bufsize, int options) {
   // strlen will be ignored, if UTF8PROC_NULLTERM is set in options
   ssize_t wpos = 0;
+  if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
+    return UTF8PROC_ERROR_INVALIDOPTS;
+  if ((options & UTF8PROC_STRIPMARK) &&
+      !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
+    return UTF8PROC_ERROR_INVALIDOPTS;
   {
     int32_t uc;
     ssize_t rpos = 0;
@@ -395,7 +446,7 @@
     int32_t *starter = NULL;
     int32_t current_char;
     const utf8proc_property_t *starter_property = NULL, *current_property;
-    int16_t max_combining_class = -1;
+    utf8proc_propval_t max_combining_class = -1;
     ssize_t rpos;
     ssize_t wpos = 0;
     int32_t composition;
diff -r aaad485d5335 -r 4ee0d5f54af1 utf8proc.h
--- a/utf8proc.h	Fri Aug 04 12:00:00 2006 +0200
+++ b/utf8proc.h	Sun Sep 17 12:00:00 2006 +0200
@@ -42,8 +42,8 @@
 
 /*
  *  File name:    utf8proc.h
- *  Version:      0.3
- *  Last changed: 2006-08-04
+ *  Version:      1.0
+ *  Last changed: 2006-09-17
  *
  *  Description:
  *  Header files for libutf8proc, which is a mapping tool for UTF-8 strings
@@ -52,8 +52,12 @@
  *  - replacing compatibility characters with their equivalents
  *  - stripping of "default ignorable characters"
  *    like SOFT-HYPHEN or ZERO-WIDTH-SPACE
+ *  - folding of certain characters for string comparison
+ *    (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-")
+ *    (see "LUMP" option)
  *  - optional rejection of strings containing non-assigned code points
  *  - stripping of control characters
+ *  - stripping of character marks (accents, etc.)
  *  - transformation of LF, CRLF, CR and NEL to line-feed (LF)
  *    or to the unicode chararacters for paragraph separation (PS)
  *    or line separation (LS).
@@ -91,6 +95,8 @@
 #define UTF8PROC_STRIPCC   (1<<9)
 #define UTF8PROC_CASEFOLD  (1<<10)
 #define UTF8PROC_CHARBOUND (1<<11)
+#define UTF8PROC_LUMP      (1<<12)
+#define UTF8PROC_STRIPMARK (1<<13)
 /*
  *  Flags being regarded by several functions in the library:
  *  NULLTERM:  The given UTF-8 input is NULL terminated.
@@ -118,12 +124,21 @@
  *             case-insensitive string comparison.
  *  CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which is
  *             representing a single grapheme cluster (a single character).
+ *  LUMP:      Lumps certain characters together
+ *             (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-").
+ *             (See lump.txt for details.)
+ *             If NLF2LF is set, this includes a transformation of paragraph
+ *             and line separators to ASCII line-feed (LF).
+ *  STRIPMARK: Strips all character markings
+ *             (non-spacing, spacing and enclosing) (e.g. accents)
+ *             NOTE: this option works only with COMPOSE or DECOMPOSE
  */
 
 #define UTF8PROC_ERROR_NOMEM -1
 #define UTF8PROC_ERROR_OVERFLOW -2
 #define UTF8PROC_ERROR_INVALIDUTF8 -3
 #define UTF8PROC_ERROR_NOTASSIGNED -4
+#define UTF8PROC_ERROR_INVALIDOPTS -5
 /*
  *  Error codes being returned by almost all functions:
  *  ERROR_NOMEM:       Memory could not be allocated.
@@ -131,13 +146,15 @@
  *  ERROR_INVALIDUTF8: The given string is not a legal UTF-8 string.
  *  ERROR_NOTASSIGNED: The REJECTNA flag was set,
  *                     and an unassigned code point was found.
+ *  ERROR_INVALIDOPTS: Invalid options have been used.
  */
 
+typedef int16_t utf8proc_propval_t;
 typedef struct utf8proc_property_struct {
-  int16_t category;
-  int16_t combining_class;
-  int16_t bidi_class;
-  int16_t decomp_type;
+  utf8proc_propval_t category;
+  utf8proc_propval_t combining_class;
+  utf8proc_propval_t bidi_class;
+  utf8proc_propval_t decomp_type;
   const int32_t *decomp_mapping;
   const unsigned bidi_mirrored:1;
   const int32_t uppercase_mapping;
@@ -267,6 +284,8 @@
  *  COMPAT:    replace certain characters with their
  *             compatibility decomposition
  *  CHARBOUND: Inserts 0xFF bytes before each grapheme cluster
+ *  LUMP:      lumps certain different characters together
+ *  STRIPMARK: removes all character marks
  *  The pointer 'last_boundclass' has to point to an integer variable which is
  *  storing the last character boundary class, if the CHARBOUND option is
  *  used.