jbe@7: #!/usr/pkg/bin/ruby jbe@7: jbe@7: # This file was used to generate the 'unicode_data.c' file by parsing the jbe@7: # Unicode data file 'UnicodeData.txt' of the Unicode Character Database. jbe@7: # It is included for informational purposes only and not intended for jbe@7: # production use. jbe@7: jbe@7: jbe@7: # Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin jbe@7: # jbe@7: # Permission is hereby granted, free of charge, to any person obtaining a jbe@7: # copy of this software and associated documentation files (the "Software"), jbe@7: # to deal in the Software without restriction, including without limitation jbe@7: # the rights to use, copy, modify, merge, publish, distribute, sublicense, jbe@7: # and/or sell copies of the Software, and to permit persons to whom the jbe@7: # Software is furnished to do so, subject to the following conditions: jbe@7: # jbe@7: # The above copyright notice and this permission notice shall be included in jbe@7: # all copies or substantial portions of the Software. jbe@7: # jbe@7: # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR jbe@7: # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, jbe@7: # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE jbe@7: # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER jbe@7: # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING jbe@7: # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER jbe@7: # DEALINGS IN THE SOFTWARE. jbe@7: jbe@7: jbe@7: # This file contains derived data from a modified version of the jbe@7: # Unicode data files. The following license applies to that data: jbe@7: # jbe@7: # COPYRIGHT AND PERMISSION NOTICE jbe@7: # jbe@7: # Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed jbe@7: # under the Terms of Use in http://www.unicode.org/copyright.html. 
jbe@7: # jbe@7: # Permission is hereby granted, free of charge, to any person obtaining a jbe@7: # copy of the Unicode data files and any associated documentation (the "Data jbe@7: # Files") or Unicode software and any associated documentation (the jbe@7: # "Software") to deal in the Data Files or Software without restriction, jbe@7: # including without limitation the rights to use, copy, modify, merge, jbe@7: # publish, distribute, and/or sell copies of the Data Files or Software, and jbe@7: # to permit persons to whom the Data Files or Software are furnished to do jbe@7: # so, provided that (a) the above copyright notice(s) and this permission jbe@7: # notice appear with all copies of the Data Files or Software, (b) both the jbe@7: # above copyright notice(s) and this permission notice appear in associated jbe@7: # documentation, and (c) there is clear notice in each modified Data File or jbe@7: # in the Software as well as in the documentation associated with the Data jbe@7: # File(s) or Software that the data or software has been modified. jbe@7: # jbe@7: # THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY jbe@7: # KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF jbe@7: # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF jbe@7: # THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS jbe@7: # INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR jbe@7: # CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF jbe@7: # USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER jbe@7: # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR jbe@7: # PERFORMANCE OF THE DATA FILES OR SOFTWARE. 
jbe@7: # jbe@7: # Except as contained in this notice, the name of a copyright holder shall jbe@7: # not be used in advertising or otherwise to promote the sale, use or other jbe@7: # dealings in these Data Files or Software without prior written jbe@7: # authorization of the copyright holder. jbe@7: jbe@7: jbe@7: jbe@7: $ignorable_list = <.. jbe@7: 000E..001F ; Default_Ignorable_Code_Point # Cc [18] .. jbe@7: 007F..0084 ; Default_Ignorable_Code_Point # Cc [6] .. jbe@7: 0086..009F ; Default_Ignorable_Code_Point # Cc [26] .. jbe@7: 00AD ; Default_Ignorable_Code_Point # Cf SOFT HYPHEN jbe@7: 034F ; Default_Ignorable_Code_Point # Mn COMBINING GRAPHEME JOINER jbe@7: 0600..0603 ; Default_Ignorable_Code_Point # Cf [4] ARABIC NUMBER SIGN..ARABIC SIGN SAFHA jbe@7: 06DD ; Default_Ignorable_Code_Point # Cf ARABIC END OF AYAH jbe@7: 070F ; Default_Ignorable_Code_Point # Cf SYRIAC ABBREVIATION MARK jbe@7: 115F..1160 ; Default_Ignorable_Code_Point # Lo [2] HANGUL CHOSEONG FILLER..HANGUL JUNGSEONG FILLER jbe@7: 17B4..17B5 ; Default_Ignorable_Code_Point # Cf [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA jbe@7: 180B..180D ; Default_Ignorable_Code_Point # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE jbe@7: 200B..200F ; Default_Ignorable_Code_Point # Cf [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK jbe@7: 202A..202E ; Default_Ignorable_Code_Point # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE jbe@7: 2060..2063 ; Default_Ignorable_Code_Point # Cf [4] WORD JOINER..INVISIBLE SEPARATOR jbe@7: 2064..2069 ; Default_Ignorable_Code_Point # Cn [6] .. jbe@7: 206A..206F ; Default_Ignorable_Code_Point # Cf [6] INHIBIT SYMMETRIC SWAPPING..NOMINAL DIGIT SHAPES jbe@7: 3164 ; Default_Ignorable_Code_Point # Lo HANGUL FILLER jbe@7: D800..DFFF ; Default_Ignorable_Code_Point # Cs [2048] .. 
jbe@7: FE00..FE0F ; Default_Ignorable_Code_Point # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 jbe@7: FEFF ; Default_Ignorable_Code_Point # Cf ZERO WIDTH NO-BREAK SPACE jbe@7: FFA0 ; Default_Ignorable_Code_Point # Lo HALFWIDTH HANGUL FILLER jbe@7: FFF0..FFF8 ; Default_Ignorable_Code_Point # Cn [9] .. jbe@7: 1D173..1D17A ; Default_Ignorable_Code_Point # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE jbe@7: E0001 ; Default_Ignorable_Code_Point # Cf LANGUAGE TAG jbe@7: E0002..E001F ; Default_Ignorable_Code_Point # Cn [30] .. jbe@7: E0020..E007F ; Default_Ignorable_Code_Point # Cf [96] TAG SPACE..CANCEL TAG jbe@7: E0080..E00FF ; Default_Ignorable_Code_Point # Cn [128] .. jbe@7: E0100..E01EF ; Default_Ignorable_Code_Point # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 jbe@7: E01F0..E0FFF ; Default_Ignorable_Code_Point # Cn [3600] .. jbe@7: END_OF_LIST jbe@7: jbe@7: $ignorable = [] jbe@7: $ignorable_list.each do |entry| jbe@7: if entry =~ /^([0-9A-F]+)..([0-9A-F]+)/ jbe@7: $1.hex.upto($2.hex) { |e2| $ignorable << e2 } jbe@7: elsif entry =~ /^[0-9A-F]+/ jbe@7: $ignorable << $&.hex jbe@7: end jbe@7: end jbe@7: jbe@7: $grapheme_extend_list = <)? # decomposition type jbe@7: ((\ ?[0-9A-F]+)*); # decompomposition mapping jbe@7: ([0-9]*); # decimal digit jbe@7: ([0-9]*); # digit jbe@7: ([^;]*); # numeric jbe@7: ([YN]*); # bidi mirrored jbe@7: ([^;]*); # unicode 1.0 name jbe@7: ([^;]*); # iso comment jbe@7: ([0-9A-F]*); # simple uppercase mapping jbe@7: ([0-9A-F]*); # simple lowercase mapping jbe@7: ([0-9A-F]*)$/ix # simple titlecase mapping jbe@7: @code = $1.hex jbe@7: @name = $2 jbe@7: @category = $3 jbe@7: @combining_class = Integer($4) jbe@7: @bidi_class = $5 jbe@7: @decomp_type = $7 jbe@7: @decomp_mapping = ($8=='') ? nil : jbe@7: $8.split.collect { |element| element.hex } jbe@7: @bidi_mirrored = ($13=='Y') ? true : false jbe@7: @uppercase_mapping = ($16=='') ? nil : $16.hex jbe@7: @lowercase_mapping = ($17=='') ? 
nil : $17.hex jbe@7: @titlecase_mapping = ($18=='') ? nil : $18.hex jbe@7: end jbe@7: def case_folding jbe@7: $case_folding[code] jbe@7: end jbe@7: def c_entry(comb1_indicies, comb2_indicies) jbe@7: " " << jbe@7: "{#{str2c category, 'CATEGORY'}, #{combining_class}, " << jbe@7: "#{str2c bidi_class, 'BIDI_CLASS'}, " << jbe@7: "#{str2c decomp_type, 'DECOMP_TYPE'}, " << jbe@7: "#{ary2c decomp_mapping}, " << jbe@7: "#{bidi_mirrored}, " << jbe@7: "#{uppercase_mapping or -1}, " << jbe@7: "#{lowercase_mapping or -1}, " << jbe@7: "#{titlecase_mapping or -1}, " << jbe@7: "#{comb1_indicies[code] ? jbe@7: (comb1_indicies[code]*comb2_indicies.keys.length) : -1 jbe@7: }, #{comb2_indicies[code] or -1}, " << jbe@7: "#{$exclusions.include?(code) or $excl_version.include?(code)}, " << jbe@7: "#{$ignorable.include?(code)}, " << jbe@7: "#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " << jbe@7: "#{$grapheme_extend.include?(code)}, " << jbe@7: "#{ary2c case_folding}},\n" jbe@7: end jbe@7: end jbe@7: jbe@7: chars = [] jbe@7: char_hash = {} jbe@7: jbe@7: while gets jbe@7: if $_ =~ /^([0-9A-F]+);<[^;>,]+, First>;/i jbe@7: first = $1.hex jbe@7: gets jbe@7: char = UnicodeChar.new($_) jbe@7: raise "No last character of sequence found." unless jbe@7: $_ =~ /^([0-9A-F]+);<([^;>,]+), Last>;/i jbe@7: last = $1.hex jbe@7: name = "<#{$2}>" jbe@7: for i in first..last jbe@7: char_clone = char.clone jbe@7: char_clone.code = i jbe@7: char_clone.name = name jbe@7: char_hash[char_clone.code] = char_clone jbe@7: chars << char_clone jbe@7: end jbe@7: else jbe@7: char = UnicodeChar.new($_) jbe@7: char_hash[char.code] = char jbe@7: chars << char jbe@7: end jbe@7: end jbe@7: jbe@7: comb1st_indicies = {} jbe@7: comb2nd_indicies = {} jbe@7: comb_array = [] jbe@7: jbe@7: chars.each do |char| jbe@7: if char.decomp_type.nil? 
and char.decomp_mapping and jbe@7: char.decomp_mapping.length == 2 and jbe@7: char_hash[char.decomp_mapping[0]].combining_class == 0 and jbe@7: not $exclusions.include?(char.code) jbe@7: unless comb1st_indicies[char.decomp_mapping[0]] jbe@7: comb1st_indicies[char.decomp_mapping[0]] = comb1st_indicies.keys.length jbe@7: end jbe@7: unless comb2nd_indicies[char.decomp_mapping[1]] jbe@7: comb2nd_indicies[char.decomp_mapping[1]] = comb2nd_indicies.keys.length jbe@7: end jbe@7: comb_array[comb1st_indicies[char.decomp_mapping[0]]] ||= [] jbe@7: raise "Duplicate canonical mapping" if jbe@7: comb_array[comb1st_indicies[char.decomp_mapping[0]]][ jbe@7: comb2nd_indicies[char.decomp_mapping[1]]] jbe@7: comb_array[comb1st_indicies[char.decomp_mapping[0]]][ jbe@7: comb2nd_indicies[char.decomp_mapping[1]]] = char.code jbe@7: end jbe@7: end jbe@7: jbe@7: properties_indicies = {} jbe@7: properties = [] jbe@7: chars.each do |char| jbe@7: c_entry = char.c_entry(comb1st_indicies, comb2nd_indicies) jbe@7: unless properties_indicies[c_entry] jbe@7: properties_indicies[c_entry] = properties.length jbe@7: properties << c_entry jbe@7: end jbe@7: end jbe@7: jbe@7: stage1 = [] jbe@7: stage2 = [] jbe@7: for code in 0...0x110000 jbe@7: next unless code % 0x100 == 0 jbe@7: stage2_entry = [] jbe@7: for code2 in code...(code+0x100) jbe@7: if char_hash[code2] jbe@7: stage2_entry << (properties_indicies[char_hash[code2].c_entry( jbe@7: comb1st_indicies, comb2nd_indicies)] + 1) jbe@7: else jbe@7: stage2_entry << 0 jbe@7: end jbe@7: end jbe@7: old_index = stage2.index(stage2_entry) jbe@7: if old_index jbe@7: stage1 << (old_index * 0x100) jbe@7: else jbe@7: stage1 << (stage2.length * 0x100) jbe@7: stage2 << stage2_entry jbe@7: end jbe@7: end jbe@7: jbe@7: $stdout << "const int32_t utf8proc_sequences[] = {\n " jbe@7: i = 0 jbe@7: $int_array.each do |entry| jbe@7: i += 1 jbe@7: if i == 8 jbe@7: i = 0 jbe@7: $stdout << "\n " jbe@7: end jbe@7: $stdout << entry << ", " jbe@7: end jbe@7: $stdout << "};\n\n" 

# Emit the remaining lookup tables of the generated C source on standard
# output: the two stage tables, the property records and the composition
# table.

# Writes one C integer array: +header+ first, then every value followed by
# ", ", then the closing "};" and an empty line. A line break is inserted
# before every eighth value, so the first row carries seven values and each
# later row carries eight.
write_int_table = lambda do |header, values|
  $stdout << header
  values.each_with_index do |value, position|
    $stdout << "\n " if (position + 1) % 8 == 0
    $stdout << value << ", "
  end
  $stdout << "};\n\n"
end

write_int_table.call("const uint16_t utf8proc_stage1table[] = {\n ", stage1)
write_int_table.call("const uint16_t utf8proc_stage2table[] = {\n ",
                     stage2.flatten)

# Property records. The fixed first row occupies index 0, which is the value
# stage 2 stores for code points that have no record of their own; every real
# record is referenced by its position in +properties+ plus one.
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n"
properties.each do |record|
  $stdout << record
end
$stdout << "};\n\n"

# Composition table, flattened row by row: one row per first-character index,
# one column per second-character index, -1 where no composition exists.
combination_cells = []
comb1st_indicies.keys.each_index do |first|
  comb2nd_indicies.keys.each_index do |second|
    combination_cells << (comb_array[first][second] || -1)
  end
end
write_int_table.call("const int32_t utf8proc_combinations[] = {\n ",
                     combination_cells)
