utf8proc

diff utf8proc.c @ 3:4ee0d5f54af1

Version 1.0

- added the LUMP option, which lumps certain characters together (see lump.txt; also used for the PostgreSQL "unifold" function)
- added the STRIPMARK option, which strips marking characters (or marks of composed characters)
- deprecated the Ruby method String#char_ary in favour of String#utf8chars
author jbe
date Sun Sep 17 12:00:00 2006 +0200 (2006-09-17)
parents aaad485d5335
children fcfd8c836c64
line diff
     1.1 --- a/utf8proc.c	Fri Aug 04 12:00:00 2006 +0200
     1.2 +++ b/utf8proc.c	Sun Sep 17 12:00:00 2006 +0200
     1.3 @@ -42,8 +42,8 @@
     1.4  
     1.5  /*
     1.6   *  File name:    utf8proc.c
     1.7 - *  Version:      0.3
     1.8 - *  Last changed: 2006-08-04
     1.9 + *  Version:      1.0
    1.10 + *  Last changed: 2006-09-17
    1.11   *
    1.12   *  Description:
    1.13   *  Implementation of libutf8proc.
    1.14 @@ -116,6 +116,8 @@
    1.15      return "Invalid UTF-8 string";
    1.16      case UTF8PROC_ERROR_NOTASSIGNED:
    1.17      return "Unassigned Unicode code point found in UTF-8 string.";
    1.18 +    case UTF8PROC_ERROR_INVALIDOPTS:
    1.19 +    return "Invalid options for UTF-8 processing chosen.";
    1.20      default:
    1.21      return "An unknown error occured while processing UTF-8 data.";
    1.22    }
    1.23 @@ -197,59 +199,103 @@
    1.24    );
    1.25  }
    1.26  
    1.27 +#define utf8proc_decompose_lump(replacement_uc) \
    1.28 +  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
    1.29 +  options & ~UTF8PROC_LUMP, last_boundclass)
    1.30 +
    1.31  ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
    1.32      int options, int *last_boundclass) {
    1.33    // ASSERT: uc >= 0 && uc < 0x110000
    1.34    const utf8proc_property_t *property;
    1.35 +  utf8proc_propval_t category;
    1.36    int32_t hangul_sindex;
    1.37    property = utf8proc_get_property(uc);
    1.38 +  category = property->category;
    1.39    hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
    1.40 -  if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) &&
    1.41 -      hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
    1.42 -    int32_t hangul_tindex;
    1.43 -    if (bufsize >= 1) {
    1.44 -      dst[0] = UTF8PROC_HANGUL_LBASE +
    1.45 -        hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
    1.46 -      if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
    1.47 -        (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
    1.48 +  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
    1.49 +    if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
    1.50 +      int32_t hangul_tindex;
    1.51 +      if (bufsize >= 1) {
    1.52 +        dst[0] = UTF8PROC_HANGUL_LBASE +
    1.53 +          hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
    1.54 +        if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
    1.55 +          (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
    1.56 +      }
    1.57 +      hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
    1.58 +      if (!hangul_tindex) return 2;
    1.59 +      if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
    1.60 +      return 3;
    1.61      }
    1.62 -    hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
    1.63 -    if (!hangul_tindex) return 2;
    1.64 -    if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
    1.65 -    return 3;
    1.66 -  } else if ((options & UTF8PROC_REJECTNA) && !property->category) {
    1.67 -    return UTF8PROC_ERROR_NOTASSIGNED;
    1.68 -  } else if ((options & UTF8PROC_IGNORE) && property->ignorable) {
    1.69 -    return 0;
    1.70 -  } else if ((options & UTF8PROC_CASEFOLD) && property->casefold_mapping) {
    1.71 -    const int32_t *casefold_entry;
    1.72 -    ssize_t written = 0;
    1.73 -    for (casefold_entry = property->casefold_mapping;
    1.74 -        *casefold_entry >= 0; casefold_entry++) {
    1.75 -      written += utf8proc_decompose_char(*casefold_entry, dst+written,
    1.76 -        (bufsize > written) ? (bufsize - written) : 0, options,
    1.77 +  }
    1.78 +  if (options & UTF8PROC_REJECTNA) {
    1.79 +    if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
    1.80 +  }
    1.81 +  if (options & UTF8PROC_IGNORE) {
    1.82 +    if (property->ignorable) return 0;
    1.83 +  }
    1.84 +  if (options & UTF8PROC_LUMP) {
    1.85 +    if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
    1.86 +    if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
    1.87 +      utf8proc_decompose_lump(0x0027);
    1.88 +    if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
    1.89 +      utf8proc_decompose_lump(0x002D);
    1.90 +    if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
    1.91 +    if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
    1.92 +    if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
    1.93 +      utf8proc_decompose_lump(0x003C);
    1.94 +    if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
    1.95 +      utf8proc_decompose_lump(0x003E);
    1.96 +    if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
    1.97 +    if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
    1.98 +      utf8proc_decompose_lump(0x005E);
    1.99 +    if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
   1.100 +      utf8proc_decompose_lump(0x005F);
   1.101 +    if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
   1.102 +    if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
   1.103 +    if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
   1.104 +    if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
   1.105 +      if (category == UTF8PROC_CATEGORY_ZL ||
   1.106 +          category == UTF8PROC_CATEGORY_ZP)
   1.107 +        utf8proc_decompose_lump(0x000A);
   1.108 +    }
   1.109 +  }
   1.110 +  if (options & UTF8PROC_STRIPMARK) {
   1.111 +    if (category == UTF8PROC_CATEGORY_MN ||
   1.112 +      category == UTF8PROC_CATEGORY_MC ||
   1.113 +      category == UTF8PROC_CATEGORY_ME) return 0;
   1.114 +  }
   1.115 +  if (options & UTF8PROC_CASEFOLD) {
   1.116 +    if (property->casefold_mapping) {
   1.117 +      const int32_t *casefold_entry;
   1.118 +      ssize_t written = 0;
   1.119 +      for (casefold_entry = property->casefold_mapping;
   1.120 +          *casefold_entry >= 0; casefold_entry++) {
   1.121 +        written += utf8proc_decompose_char(*casefold_entry, dst+written,
   1.122 +          (bufsize > written) ? (bufsize - written) : 0, options,
   1.123 +          last_boundclass);
   1.124 +        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
   1.125 +      }
   1.126 +      return written;
   1.127 +    }
   1.128 +  }
   1.129 +  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
   1.130 +    if (property->decomp_mapping &&
   1.131 +        (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
   1.132 +      const int32_t *decomp_entry;
   1.133 +      ssize_t written = 0;
   1.134 +      for (decomp_entry = property->decomp_mapping;
   1.135 +          *decomp_entry >= 0; decomp_entry++) {
   1.136 +        written += utf8proc_decompose_char(*decomp_entry, dst+written,
   1.137 +          (bufsize > written) ? (bufsize - written) : 0, options,
   1.138          last_boundclass);
   1.139 -      if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
   1.140 +        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
   1.141 +      }
   1.142 +      return written;
   1.143      }
   1.144 -    return written;
   1.145 -  } else if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) &&
   1.146 -      property->decomp_mapping &&
   1.147 -      (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
   1.148 -    const int32_t *decomp_entry;
   1.149 -    ssize_t written = 0;
   1.150 -    for (decomp_entry = property->decomp_mapping;
   1.151 -        *decomp_entry >= 0; decomp_entry++) {
   1.152 -      written += utf8proc_decompose_char(*decomp_entry, dst+written,
   1.153 -        (bufsize > written) ? (bufsize - written) : 0, options,
   1.154 -        last_boundclass);
   1.155 -      if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
   1.156 -    }
   1.157 -    return written;
   1.158 -  } else if (options & UTF8PROC_CHARBOUND) {
   1.159 +  }
   1.160 +  if (options & UTF8PROC_CHARBOUND) {
   1.161      bool boundary;
   1.162      int tbc, lbc;
   1.163 -    int category;
   1.164 -    category = property->category;
   1.165      tbc =
   1.166        (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
   1.167        (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
   1.168 @@ -306,6 +352,11 @@
   1.169      int32_t *buffer, ssize_t bufsize, int options) {
   1.170    // strlen will be ignored, if UTF8PROC_NULLTERM is set in options
   1.171    ssize_t wpos = 0;
   1.172 +  if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
   1.173 +    return UTF8PROC_ERROR_INVALIDOPTS;
   1.174 +  if ((options & UTF8PROC_STRIPMARK) &&
   1.175 +      !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
   1.176 +    return UTF8PROC_ERROR_INVALIDOPTS;
   1.177    {
   1.178      int32_t uc;
   1.179      ssize_t rpos = 0;
   1.180 @@ -395,7 +446,7 @@
   1.181      int32_t *starter = NULL;
   1.182      int32_t current_char;
   1.183      const utf8proc_property_t *starter_property = NULL, *current_property;
   1.184 -    int16_t max_combining_class = -1;
   1.185 +    utf8proc_propval_t max_combining_class = -1;
   1.186      ssize_t rpos;
   1.187      ssize_t wpos = 0;
   1.188      int32_t composition;

Impressum / About Us