utf8proc
view ruby/utf8proc_native.c @ 2:aaad485d5335
Version 0.3
- changed normalization from NFC to NFKC for postgresql unifold function
- added support to mark the beginning of a grapheme cluster with 0xFF (option: CHARBOUND)
- added the ruby method String#chars, which returns an array of UTF-8 encoded grapheme clusters
- added NLF2LF transformation in postgresql unifold function
- added the DECOMPOSE option; if you use neither COMPOSE nor DECOMPOSE, no normalization will be performed (different from previous versions)
- using integer constants rather than C-strings for character properties
- fixed (hopefully) a problem with the ruby library on Mac OS X, which occurred when compiler optimization was switched on
- changed normalization from NFC to NFKC for postgresql unifold function
- added support to mark the beginning of a grapheme cluster with 0xFF (option: CHARBOUND)
- added the ruby method String#chars, which returns an array of UTF-8 encoded grapheme clusters
- added NLF2LF transformation in postgresql unifold function
- added the DECOMPOSE option; if you use neither COMPOSE nor DECOMPOSE, no normalization will be performed (different from previous versions)
- using integer constants rather than C-strings for character properties
- fixed (hopefully) a problem with the ruby library on Mac OS X, which occurred when compiler optimization was switched on
| author | jbe | 
|---|---|
| date | Fri Aug 04 12:00:00 2006 +0200 (2006-08-04) | 
| parents | 61a89ecc2fb9 | 
| children | 4ee0d5f54af1 | 
 line source
     1 /*
     2  *  Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany
     3  *  Author: Jan Behrens <jan.behrens@flexiguided.de>
     4  *  All rights reserved.
     5  *
     6  *  Redistribution and use in source and binary forms, with or without
     7  *  modification, are permitted provided that the following conditions are
     8  *  met:
     9  *
    10  *  1. Redistributions of source code must retain the above copyright
    11  *     notice, this list of conditions and the following disclaimer.
    12  *  2. Redistributions in binary form must reproduce the above copyright
    13  *     notice, this list of conditions and the following disclaimer in the
    14  *     documentation and/or other materials provided with the distribution.
    15  *  3. Neither the name of the FlexiGuided GmbH nor the names of its
    16  *     contributors may be used to endorse or promote products derived from
    17  *     this software without specific prior written permission.
    18  *
    19  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    20  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    21  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
    22  *  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
    23  *  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
    24  *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    25  *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    26  *  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
    27  *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
    28  *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    29  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    30  *
    31  */
    34 /*
    35  *  File name:    ruby/utf8proc_native.c
    36  *  Version:      0.3
    37  *  Last changed: 2006-08-04
    38  *
    39  *  Description:
    40  *  Native part of the ruby wrapper for libutf8proc.
    41  */
    44 #include "../utf8proc.c"
    45 #include "ruby.h"
    47 typedef struct utf8proc_ruby_mapenv_struct {
    48   int32_t *buffer;
    49 } utf8proc_ruby_mapenv_t;
    51 void utf8proc_ruby_mapenv_free(utf8proc_ruby_mapenv_t *env) {
    52   free(env->buffer);
    53   free(env);
    54 }
    56 VALUE utf8proc_ruby_module;
    57 VALUE utf8proc_ruby_options;
    58 VALUE utf8proc_ruby_eUnicodeError;
    59 VALUE utf8proc_ruby_eInvalidUtf8Error;
    60 VALUE utf8proc_ruby_eCodeNotAssignedError;
    62 VALUE utf8proc_ruby_map_error(ssize_t result) {
    63   VALUE excpt_class;
    64   switch (result) {
    65     case UTF8PROC_ERROR_NOMEM:
    66     excpt_class = rb_eNoMemError; break;
    67     case UTF8PROC_ERROR_OVERFLOW:
    68     excpt_class = rb_eArgError; break;
    69     case UTF8PROC_ERROR_INVALIDUTF8:
    70     excpt_class = utf8proc_ruby_eInvalidUtf8Error; break;
    71     case UTF8PROC_ERROR_NOTASSIGNED:
    72     excpt_class = utf8proc_ruby_eCodeNotAssignedError; break;
    73     default:
    74     excpt_class = rb_eRuntimeError;
    75   }
    76   rb_raise(excpt_class, "%s", utf8proc_errmsg(result));
    77   return Qnil;
    78 }
    80 VALUE utf8proc_ruby_map(VALUE self, VALUE str_param, VALUE options_param) {
    81   VALUE str;
    82   int options;
    83   VALUE env_obj;
    84   utf8proc_ruby_mapenv_t *env;
    85   ssize_t result;
    86   VALUE retval;
    87   str = StringValue(str_param);
    88   options = NUM2INT(options_param) & ~UTF8PROC_NULLTERM;
    89   env_obj = Data_Make_Struct(rb_cObject, utf8proc_ruby_mapenv_t, NULL,
    90     utf8proc_ruby_mapenv_free, env);
    91   result = utf8proc_decompose(RSTRING(str)->ptr, RSTRING(str)->len,
    92     NULL, 0, options);
    93   if (result < 0) {
    94     utf8proc_ruby_map_error(result);
    95     return Qnil;  // needed to prevent problems with optimization
    96   }
    97   env->buffer = ALLOC_N(int32_t, result+1);
    98   result = utf8proc_decompose(RSTRING(str)->ptr, RSTRING(str)->len,
    99     env->buffer, result, options);
   100   if (result < 0) {
   101     free(env->buffer);
   102     env->buffer = 0;
   103     utf8proc_ruby_map_error(result);
   104     return Qnil;  // needed to prevent problems with optimization
   105   }
   106   result = utf8proc_reencode(env->buffer, result, options);
   107   if (result < 0) {
   108     free(env->buffer);
   109     env->buffer = 0;
   110     utf8proc_ruby_map_error(result);
   111     return Qnil;  // needed to prevent problems with optimization
   112   }
   113   retval = rb_str_new((char *)env->buffer, result);
   114   free(env->buffer);
   115   env->buffer = 0;
   116   return retval;
   117 }
   119 static VALUE utf8proc_ruby_char(VALUE self, VALUE code_param) {
   120   char buffer[4];
   121   ssize_t result;
   122   int uc;
   123   uc = NUM2INT(code_param);
   124   if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) ||
   125       (uc >= 0xFDD0 && uc < 0xFDF0))
   126     rb_raise(rb_eArgError, "Invalid Unicode code point");
   127   result = utf8proc_encode_char(uc, buffer);
   128   return rb_str_new(buffer, result);
   129 }
   131 #define register_utf8proc_option(sym, field) \
   132   rb_hash_aset(utf8proc_ruby_options, ID2SYM(rb_intern(sym)), INT2FIX(field))
   134 void Init_utf8proc_native() {
   135   utf8proc_ruby_module = rb_define_module("Utf8Proc");
   136   rb_define_module_function(utf8proc_ruby_module, "utf8map",
   137     utf8proc_ruby_map, 2);
   138   rb_define_module_function(utf8proc_ruby_module, "utf8char",
   139     utf8proc_ruby_char, 1);
   140   utf8proc_ruby_eUnicodeError = rb_define_class_under(utf8proc_ruby_module,
   141     "UnicodeError", rb_eStandardError);
   142   utf8proc_ruby_eInvalidUtf8Error = rb_define_class_under(
   143     utf8proc_ruby_module, "InvalidUtf8Error", utf8proc_ruby_eUnicodeError);
   144   utf8proc_ruby_eCodeNotAssignedError = rb_define_class_under(
   145     utf8proc_ruby_module, "CodeNotAssignedError",
   146     utf8proc_ruby_eUnicodeError);
   147   utf8proc_ruby_options = rb_hash_new();
   148   register_utf8proc_option("stable",    UTF8PROC_STABLE);
   149   register_utf8proc_option("compat",    UTF8PROC_COMPAT);
   150   register_utf8proc_option("compose",   UTF8PROC_COMPOSE);
   151   register_utf8proc_option("decompose", UTF8PROC_DECOMPOSE);
   152   register_utf8proc_option("ignore",    UTF8PROC_IGNORE);
   153   register_utf8proc_option("rejectna",  UTF8PROC_REJECTNA);
   154   register_utf8proc_option("nlf2ls",    UTF8PROC_NLF2LS);
   155   register_utf8proc_option("nlf2ps",    UTF8PROC_NLF2PS);
   156   register_utf8proc_option("nlf2lf",    UTF8PROC_NLF2LF);
   157   register_utf8proc_option("stripcc",   UTF8PROC_STRIPCC);
   158   register_utf8proc_option("casefold",  UTF8PROC_CASEFOLD);
   159   register_utf8proc_option("charbound", UTF8PROC_CHARBOUND);
   160   OBJ_FREEZE(utf8proc_ruby_options);
   161   rb_define_const(utf8proc_ruby_module, "Options", utf8proc_ruby_options);
   162 }
