#!/usr/bin/env ruby

# This file was used to generate the 'unicode_data.c' file by parsing the
# Unicode data file 'UnicodeData.txt' of the Unicode Character Database.
# It is included for informational purposes only and not intended for
# production use.


# Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.


# This file contains derived data from a modified version of the
# Unicode data files. The following license applies to that data:
#
# COPYRIGHT AND PERMISSION NOTICE
#
# Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed
# under the Terms of Use in http://www.unicode.org/copyright.html.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of the Unicode data files and any associated documentation (the "Data
# Files") or Unicode software and any associated documentation (the
# "Software") to deal in the Data Files or Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, and/or sell copies of the Data Files or Software, and
# to permit persons to whom the Data Files or Software are furnished to do
# so, provided that (a) the above copyright notice(s) and this permission
# notice appear with all copies of the Data Files or Software, (b) both the
# above copyright notice(s) and this permission notice appear in associated
# documentation, and (c) there is clear notice in each modified Data File or
# in the Software as well as in the documentation associated with the Data
# File(s) or Software that the data or software has been modified.
#
# THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
# THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
# INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
# CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
# USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THE DATA FILES OR SOFTWARE.
#
# Except as contained in this notice, the name of a copyright holder shall
# not be used in advertising or otherwise to promote the sale, use or other
# dealings in these Data Files or Software without prior written
# authorization of the copyright holder.


# Usage: run from a directory containing DerivedCoreProperties.txt,
# CompositionExclusions.txt and CaseFolding.txt, feeding UnicodeData.txt via
# ARGV or standard input. The generated C tables are written to stdout.
#
# NOTE(review): two defects of the earlier revision of this script are fixed
# here, so regenerated tables differ from previously generated ones for
# U+0000 (no longer flagged as composition-excluded) and for U+200C/U+200D
# (no longer flagged as control boundaries). See the comments at
# `code_points_in` and `UnicodeChar#control_boundary?`.

require 'set'

# Returns the part of +text+ matched by the +section+ regexp.
#
# text    - String contents of a Unicode data file.
# section - Regexp delimiting the wanted section.
# path    - String file name, used only for the error message.
#
# Raises RuntimeError when the section is not present (previously this
# surfaced as a NoMethodError on nil further down).
def extract_section(text, section, path)
  text[section] or raise "Section #{section.inspect} not found in #{path}."
end

# Collects the code points listed at the start of each line of
# +section_text+ into a Set.
#
# A line contributes when it begins with upper-case hex digits, optionally
# followed by "..HHHH" to denote an inclusive range. All other lines
# (comments, blank lines) are ignored. The earlier revision parsed the
# CompositionExclusions sections with String#hex on every line, which turned
# each comment/blank line into code point 0.
#
# Returns a Set of Integers.
def code_points_in(section_text)
  points = Set.new
  section_text.each_line do |line|
    next unless line =~ /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?/
    first = $1.hex
    last = $2 ? $2.hex : first
    points.merge(first..last)
  end
  points
end

# Each data file is read exactly once; File.read closes the handle itself.
derived_core_properties = File.read("DerivedCoreProperties.txt")
composition_exclusions = File.read("CompositionExclusions.txt")

# Code points with the Default_Ignorable_Code_Point property.
$ignorable = code_points_in(extract_section(
  derived_core_properties,
  /# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m,
  "DerivedCoreProperties.txt"))

# Code points with the Grapheme_Extend property.
$grapheme_extend = code_points_in(extract_section(
  derived_core_properties,
  /# Derived Property: Grapheme_Extend.*?# Total code points:/m,
  "DerivedCoreProperties.txt"))

# Composition exclusions, section (1): script specifics.
$exclusions = code_points_in(extract_section(
  composition_exclusions,
  /# \(1\) Script Specifics.*?# Total code points:/m,
  "CompositionExclusions.txt"))

# Composition exclusions, section (2): post composition version characters.
$excl_version = code_points_in(extract_section(
  composition_exclusions,
  /# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m,
  "CompositionExclusions.txt"))

# Maps a code point to the Array of code points it case-folds to.
# Lines with status C, F or S are taken; when a code point has several such
# lines the last one in the file wins.
# NOTE(review): that means an S line following an F line replaces the full
# folding with the simple one - confirm this is intended.
$case_folding = {}
File.read("CaseFolding.txt").each_line do |line|
  next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
  $case_folding[$1.hex] = $2.split(" ").map { |element| element.hex }
end

# Flat pool of -1 terminated integer sequences (emitted as
# utf8proc_sequences) and, for every sequence already stored, its start
# offset within the pool.
$int_array = []
$int_array_indicies = {}

# Renders a symbolic value as a C constant name.
#
# string - String value (e.g. a category such as "Lu"), or nil.
# prefix - String inserted between "UTF8PROC_" and the upper-cased value.
#
# Returns "0" for nil, otherwise "UTF8PROC_<prefix>_<STRING>".
def str2c(string, prefix)
  return "0" if string.nil?
  "UTF8PROC_#{prefix}_#{string.upcase}"
end

# Renders an integer sequence as a C pointer expression into
# utf8proc_sequences, appending the sequence (plus a -1 terminator) to the
# shared pool the first time it is seen.
#
# array - Array of Integers, or nil.
#
# Returns "NULL" for nil, otherwise "utf8proc_sequences + <offset>".
def ary2c(array)
  return "NULL" if array.nil?
  unless $int_array_indicies[array]
    $int_array_indicies[array] = $int_array.length
    $int_array.concat(array)
    $int_array << -1
  end
  "utf8proc_sequences + #{$int_array_indicies[array]}"
end

# Writes a C array of integers to stdout.
#
# type   - String C element type.
# name   - String C array name.
# values - Array of Integers.
#
# The layout matches the earlier hand-written loops: the first row holds
# seven values, every following row eight.
def emit_int_table(type, name, values)
  $stdout << "const #{type} #{name}[] = {\n "
  values.each_with_index do |value, index|
    $stdout << "\n " if (index + 1) % 8 == 0
    $stdout << value << ", "
  end
  $stdout << "};\n\n"
end

# One record of UnicodeData.txt.
class UnicodeChar
  attr_accessor :code, :name, :category, :combining_class, :bidi_class,
                :decomp_type, :decomp_mapping,
                :bidi_mirrored,
                :uppercase_mapping, :lowercase_mapping, :titlecase_mapping

  # Parses a single semicolon-separated line of UnicodeData.txt.
  #
  # line - String line (a trailing newline is fine), or nil.
  #
  # Raises RuntimeError ("Could not parse input.") when the line does not
  # have the expected shape, including when it is nil.
  def initialize(line)
    raise "Could not parse input." unless line =~ /^
      ([0-9A-F]+);        # code
      ([^;]+);            # name
      ([A-Z]+);           # general category
      ([0-9]+);           # canonical combining class
      ([A-Z]+);           # bidi class
      (<([A-Z]*)>)?       # decomposition type
      ((\ ?[0-9A-F]+)*);  # decomposition mapping
      ([0-9]*);           # decimal digit
      ([0-9]*);           # digit
      ([^;]*);            # numeric
      ([YN]*);            # bidi mirrored
      ([^;]*);            # unicode 1.0 name
      ([^;]*);            # iso comment
      ([0-9A-F]*);        # simple uppercase mapping
      ([0-9A-F]*);        # simple lowercase mapping
      ([0-9A-F]*)$/ix     # simple titlecase mapping
    @code = $1.hex
    @name = $2
    @category = $3
    # Explicit base 10 so that a zero-padded class is not read as octal.
    @combining_class = Integer($4, 10)
    @bidi_class = $5
    @decomp_type = $7
    @decomp_mapping = $8.empty? ? nil : $8.split.map { |element| element.hex }
    @bidi_mirrored = ($13 == 'Y')
    @uppercase_mapping = $16.empty? ? nil : $16.hex
    @lowercase_mapping = $17.empty? ? nil : $17.hex
    @titlecase_mapping = $18.empty? ? nil : $18.hex
  end

  # Returns the case folding of this character as an Array of code points,
  # or nil when CaseFolding.txt has no usable entry for it.
  def case_folding
    $case_folding[code]
  end

  # Returns true for characters of category Zl, Zp, Cc or Cf, except
  # U+200C and U+200D.
  #
  # The earlier revision tested `[0x200C, 0x200D].include?(category)`, i.e.
  # it looked for a category String in a list of Integers, so the exception
  # could never apply. The exemption is now checked against the code point.
  def control_boundary?
    %w[Zl Zp Cc Cf].include?(category) && ![0x200C, 0x200D].include?(code)
  end

  # Renders this character as one C initializer row of utf8proc_properties.
  #
  # comb1_indicies - Hash mapping the first code point of a canonical pair
  #                  to its row index in the combination table.
  # comb2_indicies - Hash mapping the second code point of a canonical pair
  #                  to its column index in the combination table.
  #
  # Side effect: decomposition and case-folding sequences are registered in
  # the shared sequence pool through ary2c (decomposition first).
  #
  # Returns a String ending in ",\n".
  def c_entry(comb1_indicies, comb2_indicies)
    comb1st_index = comb1_indicies[code]
    fields = [
      str2c(category, 'CATEGORY'),
      combining_class,
      str2c(bidi_class, 'BIDI_CLASS'),
      str2c(decomp_type, 'DECOMP_TYPE'),
      ary2c(decomp_mapping),
      bidi_mirrored,
      uppercase_mapping || -1,
      lowercase_mapping || -1,
      titlecase_mapping || -1,
      # Row offset into the flattened combination table, not the row index.
      comb1st_index ? comb1st_index * comb2_indicies.length : -1,
      comb2_indicies[code] || -1,
      $exclusions.include?(code) || $excl_version.include?(code),
      $ignorable.include?(code),
      control_boundary?,
      $grapheme_extend.include?(code),
      ary2c(case_folding)
    ]
    " {#{fields.join(', ')}},\n"
  end
end

# --- Read UnicodeData.txt (from ARGV files or stdin) -----------------------

chars = []
char_hash = {}

while (line = gets)
  if line =~ /^([0-9A-F]+);<[^;>,]+, First>;/i
    # "<Name, First>" / "<Name, Last>" line pairs describe a whole range
    # whose members all share the properties given on the "Last" line.
    first = $1.hex
    last_line = gets
    template = UnicodeChar.new(last_line)
    raise "No last character of sequence found." unless
      last_line =~ /^([0-9A-F]+);<([^;>,]+), Last>;/i
    last = $1.hex
    name = "<#{$2}>"
    (first..last).each do |code|
      member = template.clone
      member.code = code
      member.name = name
      char_hash[code] = member
      chars << member
    end
  else
    char = UnicodeChar.new(line)
    char_hash[char.code] = char
    chars << char
  end
end

# --- Canonical composition table --------------------------------------------

comb1st_indicies = {}
comb2nd_indicies = {}
comb_array = []

chars.each do |char|
  mapping = char.decomp_mapping
  # Only canonical (untyped) decompositions into exactly two code points.
  next unless char.decomp_type.nil? && mapping && mapping.length == 2
  starter = char_hash[mapping[0]]
  next unless starter && starter.combining_class == 0
  next if $exclusions.include?(char.code)

  row_index = (comb1st_indicies[mapping[0]] ||= comb1st_indicies.length)
  column_index = (comb2nd_indicies[mapping[1]] ||= comb2nd_indicies.length)
  row = (comb_array[row_index] ||= [])
  raise "Duplicate canonical mapping" if row[column_index]
  row[column_index] = char.code
end

# --- Deduplicated property rows ---------------------------------------------

properties_indicies = {}
properties = []
# Code point => 1-based index of its property row (0 is the default row).
# Remembering it here avoids rendering every character a second time while
# building the stage tables.
property_index_for = {}
chars.each do |char|
  entry = char.c_entry(comb1st_indicies, comb2nd_indicies)
  unless properties_indicies[entry]
    properties_indicies[entry] = properties.length
    properties << entry
  end
  property_index_for[char.code] = properties_indicies[entry] + 1
end

# --- Two-stage lookup tables --------------------------------------------------

stage1 = []
stage2 = []
0.step(0x10FFFF, 0x100) do |block_start|
  block = (block_start...(block_start + 0x100)).map do |code|
    property_index_for.fetch(code, 0)
  end
  # Identical 256-entry blocks are stored once and shared through stage1.
  existing = stage2.index(block)
  if existing
    stage1 << existing * 0x100
  else
    stage1 << stage2.length * 0x100
    stage2 << block
  end
end

# --- Output ------------------------------------------------------------------

emit_int_table("int32_t", "utf8proc_sequences", $int_array)
emit_int_table("uint16_t", "utf8proc_stage1table", stage1)
emit_int_table("uint16_t", "utf8proc_stage2table", stage2.flatten)

$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
# Default row for unassigned code points. It spells out 12 initializers
# while c_entry emits 16; presumably the remaining members rely on C's
# implicit zero-initialisation - verify against utf8proc_property_t.
$stdout << " {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n"
properties.each { |entry| $stdout << entry }
$stdout << "};\n\n"

combinations = []
comb1st_indicies.length.times do |row_index|
  comb2nd_indicies.length.times do |column_index|
    combinations << (comb_array[row_index][column_index] || -1)
  end
end
emit_int_table("int32_t", "utf8proc_combinations", combinations)