utf8proc
view ruby/utf8proc.rb @ 2:aaad485d5335
Version 0.3
- changed normalization from NFC to NFKC for postgresql unifold function
- added support to mark the beginning of a grapheme cluster with 0xFF (option: CHARBOUND)
- added the ruby method String#chars, which returns an array of UTF-8 encoded grapheme clusters
- added NLF2LF transformation in postgresql unifold function
- added the DECOMPOSE option; if you use neither COMPOSE nor DECOMPOSE, no normalization will be performed (different from previous versions)
- using integer constants rather than C-strings for character properties
- fixed (hopefully) a problem with the ruby library on Mac OS X, which occurred when compiler optimization was switched on
- changed normalization from NFC to NFKC for postgresql unifold function
- added support to mark the beginning of a grapheme cluster with 0xFF (option: CHARBOUND)
- added the ruby method String#chars, which returns an array of UTF-8 encoded grapheme clusters
- added NLF2LF transformation in postgresql unifold function
- added the DECOMPOSE option; if you use neither COMPOSE nor DECOMPOSE, no normalization will be performed (different from previous versions)
- using integer constants rather than C-strings for character properties
- fixed (hopefully) a problem with the ruby library on Mac OS X, which occurred when compiler optimization was switched on
| author | jbe | 
|---|---|
| date | Fri Aug 04 12:00:00 2006 +0200 (2006-08-04) | 
| parents | 61a89ecc2fb9 | 
| children | 4ee0d5f54af1 | 
 line source
     1 ##
     2  #  Copyright (c) 2006, FlexiGuided GmbH, Berlin, Germany
     3  #  Author: Jan Behrens <jan.behrens@flexiguided.de>
     4  #  All rights reserved.
     5  #
     6  #  Redistribution and use in source and binary forms, with or without
     7  #  modification, are permitted provided that the following conditions are
     8  #  met:
     9  #
    10  #  1. Redistributions of source code must retain the above copyright
    11  #     notice, this list of conditions and the following disclaimer.
    12  #  2. Redistributions in binary form must reproduce the above copyright
    13  #     notice, this list of conditions and the following disclaimer in the
    14  #     documentation and/or other materials provided with the distribution.
    15  #  3. Neither the name of the FlexiGuided GmbH nor the names of its
    16  #     contributors may be used to endorse or promote products derived from
    17  #     this software without specific prior written permission.
    18  #
    19  #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    20  #  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    21  #  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
    22  #  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
    23  #  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
    24  #  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    25  #  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    26  #  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
    27  #  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
    28  #  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    29  #  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    30  #
    31  ##
    34 ##
    35  #  File name:    ruby/utf8proc.rb
    36  #  Version:      0.3
    37  #  Last changed: 2006-08-04
    38  #
    39  #  Description:
    40  #  Part of the ruby wrapper for libutf8proc, which is written in ruby.
    41  ##
    44 require 'utf8proc_native'
    # Ruby-level convenience layer for the utf8proc bindings.
    #
    # The code below relies on Utf8Proc::Options, Utf8Proc.utf8map and
    # Utf8Proc.utf8char, none of which are defined here (presumably they are
    # provided by the 'utf8proc_native' extension -- confirm).
    module Utf8Proc

      # Byte sequences of selected control and separator characters, keyed by
      # their customary abbreviations. :LS and :PS hold the UTF-8 encodings of
      # U+2028 and U+2029; all other entries are single bytes.
      SpecialChars = {
        :HT => "\x09",
        :LF => "\x0A",
        :VT => "\x0B",
        :FF => "\x0C",
        :CR => "\x0D",
        :FS => "\x1C",
        :GS => "\x1D",
        :RS => "\x1E",
        :US => "\x1F",
        :LS => "\xE2\x80\xA8",
        :PS => "\xE2\x80\xA9",
      }

      # Methods that get mixed into String.
      module StringExtensions
        # Maps the receiver through Utf8Proc.utf8map.
        #
        # Each element of +option_array+ is looked up in Utf8Proc::Options and
        # the resulting flags are OR-ed together into one integer.
        #
        # Returns whatever Utf8Proc.utf8map returns for (self, flags).
        # Raises ArgumentError when an option has no entry in
        # Utf8Proc::Options.
        def utf8map(*option_array)
          flags = option_array.inject(0) do |combined, option|
            flag = Utf8Proc::Options[option]
            unless flag
              raise ArgumentError, "Unknown argument given to String#utf8map."
            end
            combined | flag
          end
          Utf8Proc.utf8map(self, flags)
        end

        # In-place variant of #utf8map: replaces the receiver's content with
        # the mapped string and returns the receiver.
        def utf8map!(*option_array)
          replace(utf8map(*option_array))
        end

        # Shorthand for utf8map(:stable, :decompose).
        def utf8nfd
          utf8map(:stable, :decompose)
        end

        # Shorthand for utf8map!(:stable, :decompose).
        def utf8nfd!
          utf8map!(:stable, :decompose)
        end

        # Shorthand for utf8map(:stable, :compose).
        def utf8nfc
          utf8map(:stable, :compose)
        end

        # Shorthand for utf8map!(:stable, :compose).
        def utf8nfc!
          utf8map!(:stable, :compose)
        end

        # Shorthand for utf8map(:stable, :decompose, :compat).
        def utf8nfkd
          utf8map(:stable, :decompose, :compat)
        end

        # Shorthand for utf8map!(:stable, :decompose, :compat).
        def utf8nfkd!
          utf8map!(:stable, :decompose, :compat)
        end

        # Shorthand for utf8map(:stable, :compose, :compat).
        def utf8nfkc
          utf8map(:stable, :compose, :compat)
        end

        # Shorthand for utf8map!(:stable, :compose, :compat).
        def utf8nfkc!
          utf8map!(:stable, :compose, :compat)
        end

        # Splits the receiver into an array of strings, one per segment that
        # the :charbound mapping delimits with a 0xFF byte.
        #
        # The split is done on raw bytes: a lone 0xFF is never valid UTF-8, and
        # on Ruby 1.9+ String#split rejects invalidly encoded receivers and
        # patterns. The pieces are tagged as UTF-8 afterwards. On Rubies
        # without String#force_encoding the plain byte split is used as is.
        def char_ary
          marked = utf8map(:charbound)
          boundary = [0xFF].pack('C')
          encoding_aware = marked.respond_to?(:force_encoding)
          # dup so that the string handed back by utf8map is left untouched
          marked = marked.dup.force_encoding(boundary.encoding) if encoding_aware
          clusters = marked.split(boundary)
          # a leading marker produces an empty first element; drop it
          clusters.shift if clusters.first == ''
          clusters.each { |cluster| cluster.force_encoding('UTF-8') } if encoding_aware
          clusters
        end
      end

      # Methods that get mixed into Integer.
      module IntegerExtensions
        # Returns the result of Utf8Proc.utf8char for the receiver
        # (presumably the UTF-8 encoding of that code point -- the helper is
        # not defined in this file).
        def utf8
          Utf8Proc.utf8char(self)
        end
      end

    end
   # Make the Utf8Proc string helpers available on every String.
   String.class_eval do
     include Utf8Proc::StringExtensions
   end
   # Make the Utf8Proc integer helpers available on every Integer.
   Integer.class_eval do
     include Utf8Proc::IntegerExtensions
   end
