utf8proc

diff utf8proc.c @ 2:aaad485d5335

Version 0.3

- changed normalization from NFC to NFKC for postgresql unifold function
- added support to mark the beginning of a grapheme cluster with 0xFF (option: CHARBOUND)
- added the Ruby method String#chars, which returns an array of UTF-8 encoded grapheme clusters
- added NLF2LF transformation in postgresql unifold function
- added the DECOMPOSE option; if you use neither COMPOSE nor DECOMPOSE, no normalization will be performed (different from previous versions)
- using integer constants rather than C-strings for character properties
- fixed (hopefully) a problem with the Ruby library on Mac OS X, which occurred when compiler optimization was switched on
author jbe
date Fri Aug 04 12:00:00 2006 +0200 (2006-08-04)
parents 61a89ecc2fb9
children 4ee0d5f54af1
line diff
     1.1 --- a/utf8proc.c	Tue Jun 20 12:00:00 2006 +0200
     1.2 +++ b/utf8proc.c	Fri Aug 04 12:00:00 2006 +0200
     1.3 @@ -42,8 +42,8 @@
     1.4  
     1.5  /*
     1.6   *  File name:    utf8proc.c
     1.7 - *  Version:      0.2
     1.8 - *  Last changed: 2006-05-31
     1.9 + *  Version:      0.3
    1.10 + *  Last changed: 2006-08-04
    1.11   *
    1.12   *  Description:
    1.13   *  Implementation of libutf8proc.
    1.14 @@ -81,6 +81,29 @@
    1.15  #define UTF8PROC_HANGUL_TCOUNT 28
    1.16  #define UTF8PROC_HANGUL_NCOUNT 588
    1.17  #define UTF8PROC_HANGUL_SCOUNT 11172
    1.18 +// END is exclusive
    1.19 +#define UTF8PROC_HANGUL_L_START  0x1100
    1.20 +#define UTF8PROC_HANGUL_L_END    0x115A
    1.21 +#define UTF8PROC_HANGUL_L_FILLER 0x115F
    1.22 +#define UTF8PROC_HANGUL_V_START  0x1160
    1.23 +#define UTF8PROC_HANGUL_V_END    0x11A3
    1.24 +#define UTF8PROC_HANGUL_T_START  0x11A8
    1.25 +#define UTF8PROC_HANGUL_T_END    0x11FA
    1.26 +#define UTF8PROC_HANGUL_S_START  0xAC00
    1.27 +#define UTF8PROC_HANGUL_S_END    0xD7A4
    1.28 +
    1.29 +
    1.30 +#define UTF8PROC_BOUNDCLASS_START    0
    1.31 +#define UTF8PROC_BOUNDCLASS_OTHER    1
    1.32 +#define UTF8PROC_BOUNDCLASS_CR       2
    1.33 +#define UTF8PROC_BOUNDCLASS_LF       3
    1.34 +#define UTF8PROC_BOUNDCLASS_CONTROL  4
    1.35 +#define UTF8PROC_BOUNDCLASS_EXTEND   5
    1.36 +#define UTF8PROC_BOUNDCLASS_L        6
    1.37 +#define UTF8PROC_BOUNDCLASS_V        7
    1.38 +#define UTF8PROC_BOUNDCLASS_T        8
    1.39 +#define UTF8PROC_BOUNDCLASS_LV       9
    1.40 +#define UTF8PROC_BOUNDCLASS_LVT     10
    1.41  
    1.42  
    1.43  const char *utf8proc_errmsg(ssize_t errcode) {
    1.44 @@ -145,6 +168,12 @@
    1.45      dst[0] = 0xC0 + (uc >> 6);
    1.46      dst[1] = 0x80 + (uc & 0x3F);
    1.47      return 2;
    1.48 +  } else if (uc == 0xFFFF) {
    1.49 +    dst[0] = 0xFF;
    1.50 +    return 1;
    1.51 +  } else if (uc == 0xFFFE) {
    1.52 +    dst[0] = 0xFE;
    1.53 +    return 1;
    1.54    } else if (uc < 0x10000) {
    1.55      dst[0] = 0xE0 + (uc >> 12);
    1.56      dst[1] = 0x80 + ((uc >> 6) & 0x3F);
    1.57 @@ -169,13 +198,14 @@
    1.58  }
    1.59  
    1.60  ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
    1.61 -    int options) {
    1.62 +    int options, int *last_boundclass) {
    1.63    // ASSERT: uc >= 0 && uc < 0x110000
    1.64    const utf8proc_property_t *property;
    1.65    int32_t hangul_sindex;
    1.66    property = utf8proc_get_property(uc);
    1.67    hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
    1.68 -  if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
    1.69 +  if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) &&
    1.70 +      hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
    1.71      int32_t hangul_tindex;
    1.72      if (bufsize >= 1) {
    1.73        dst[0] = UTF8PROC_HANGUL_LBASE +
    1.74 @@ -197,25 +227,79 @@
    1.75      for (casefold_entry = property->casefold_mapping;
    1.76          *casefold_entry >= 0; casefold_entry++) {
    1.77        written += utf8proc_decompose_char(*casefold_entry, dst+written,
    1.78 -        (bufsize > written) ? (bufsize - written) : 0, options);
    1.79 +        (bufsize > written) ? (bufsize - written) : 0, options,
    1.80 +        last_boundclass);
    1.81        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
    1.82      }
    1.83      return written;
    1.84 -  } else if (property->decomp_mapping &&
    1.85 +  } else if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) &&
    1.86 +      property->decomp_mapping &&
    1.87        (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
    1.88      const int32_t *decomp_entry;
    1.89      ssize_t written = 0;
    1.90      for (decomp_entry = property->decomp_mapping;
    1.91          *decomp_entry >= 0; decomp_entry++) {
    1.92        written += utf8proc_decompose_char(*decomp_entry, dst+written,
    1.93 -        (bufsize > written) ? (bufsize - written) : 0, options);
    1.94 +        (bufsize > written) ? (bufsize - written) : 0, options,
    1.95 +        last_boundclass);
    1.96        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
    1.97      }
    1.98      return written;
    1.99 -  } else {
   1.100 -    if (bufsize >= 1) *dst = uc;
   1.101 -    return 1;
   1.102 +  } else if (options & UTF8PROC_CHARBOUND) {
   1.103 +    bool boundary;
   1.104 +    int tbc, lbc;
   1.105 +    int category;
   1.106 +    category = property->category;
   1.107 +    tbc =
   1.108 +      (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
   1.109 +      (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
   1.110 +      ((category == UTF8PROC_CATEGORY_ZL ||
   1.111 +        category == UTF8PROC_CATEGORY_ZP ||
   1.112 +        category == UTF8PROC_CATEGORY_CC ||
   1.113 +        category == UTF8PROC_CATEGORY_CF) &&
   1.114 +        !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
   1.115 +      property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
   1.116 +      ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
   1.117 +        uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
   1.118 +      (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
   1.119 +        UTF8PROC_BOUNDCLASS_V :
   1.120 +      (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
   1.121 +        UTF8PROC_BOUNDCLASS_T :
   1.122 +      (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
   1.123 +        ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
   1.124 +          UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
   1.125 +      ) :
   1.126 +      UTF8PROC_BOUNDCLASS_OTHER;
   1.127 +    lbc = *last_boundclass;
   1.128 +    boundary =
   1.129 +      (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
   1.130 +      (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
   1.131 +      (lbc == UTF8PROC_BOUNDCLASS_CR &&
   1.132 +       tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
   1.133 +      (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
   1.134 +      (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
   1.135 +      (lbc == UTF8PROC_BOUNDCLASS_L &&
   1.136 +       (tbc == UTF8PROC_BOUNDCLASS_L ||
   1.137 +        tbc == UTF8PROC_BOUNDCLASS_V ||
   1.138 +        tbc == UTF8PROC_BOUNDCLASS_LV ||
   1.139 +        tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
   1.140 +      ((lbc == UTF8PROC_BOUNDCLASS_LV ||
   1.141 +        lbc == UTF8PROC_BOUNDCLASS_V) &&
   1.142 +       (tbc == UTF8PROC_BOUNDCLASS_V ||
   1.143 +        tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
   1.144 +      ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
   1.145 +        lbc == UTF8PROC_BOUNDCLASS_T) &&
   1.146 +       tbc == UTF8PROC_BOUNDCLASS_T) ? false :
   1.147 +       true;
   1.148 +    *last_boundclass = tbc;
   1.149 +    if (boundary) {
   1.150 +      if (bufsize >= 1) dst[0] = 0xFFFF;
   1.151 +      if (bufsize >= 2) dst[1] = uc;
   1.152 +      return 2;
   1.153 +    }
   1.154    }
   1.155 +  if (bufsize >= 1) *dst = uc;
   1.156 +  return 1;
   1.157  }
   1.158  
   1.159  ssize_t utf8proc_decompose(uint8_t *str, ssize_t strlen,
   1.160 @@ -226,6 +310,7 @@
   1.161      int32_t uc;
   1.162      ssize_t rpos = 0;
   1.163      ssize_t decomp_result;
   1.164 +    int boundclass = UTF8PROC_BOUNDCLASS_START;
   1.165      while (1) {
   1.166        if (options & UTF8PROC_NULLTERM) {
   1.167          rpos += utf8proc_iterate(str + rpos, -1, &uc);
   1.168 @@ -240,7 +325,8 @@
   1.169          if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
   1.170        }
   1.171        decomp_result = utf8proc_decompose_char(
   1.172 -        uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options
   1.173 +        uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
   1.174 +        &boundclass
   1.175        );
   1.176        if (decomp_result < 0) return decomp_result;
   1.177        wpos += decomp_result;
   1.178 @@ -249,7 +335,7 @@
   1.179          return UTF8PROC_ERROR_OVERFLOW;
   1.180      }
   1.181    }
   1.182 -  if (bufsize >= wpos) {
   1.183 +  if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
   1.184      ssize_t pos = 0;
   1.185      while (pos < wpos-1) {
   1.186        int32_t uc1, uc2;
   1.187 @@ -416,7 +502,8 @@
   1.188  
   1.189  uint8_t *utf8proc_NFD(uint8_t *str) {
   1.190    uint8_t *retval;
   1.191 -  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE);
   1.192 +  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
   1.193 +    UTF8PROC_DECOMPOSE);
   1.194    return retval;
   1.195  }
   1.196  
   1.197 @@ -430,7 +517,7 @@
   1.198  uint8_t *utf8proc_NFKD(uint8_t *str) {
   1.199    uint8_t *retval;
   1.200    utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
   1.201 -    UTF8PROC_COMPAT);
   1.202 +    UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
   1.203    return retval;
   1.204  }
   1.205  

Impressum / About Us