utf8proc

annotate data_generator.rb @ 16:1711af85df6f

Added tags for versions 0.1 through 1.1.6
author jbe
date Fri Nov 28 01:56:31 2014 +0100 (2014-11-28)
parents 15450ff3d454
children
rev   line source
Jiahao@15 1 #!/usr/bin/env ruby
jbe@7 2
jbe@7 3 # This file was used to generate the 'unicode_data.c' file by parsing the
jbe@7 4 # Unicode data file 'UnicodeData.txt' of the Unicode Character Database.
jbe@7 5 # It is included for informational purposes only and not intended for
jbe@7 6 # production use.
jbe@7 7
jbe@7 8
jbe@10 9 # Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
jbe@7 10 #
jbe@7 11 # Permission is hereby granted, free of charge, to any person obtaining a
jbe@7 12 # copy of this software and associated documentation files (the "Software"),
jbe@7 13 # to deal in the Software without restriction, including without limitation
jbe@7 14 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
jbe@7 15 # and/or sell copies of the Software, and to permit persons to whom the
jbe@7 16 # Software is furnished to do so, subject to the following conditions:
jbe@7 17 #
jbe@7 18 # The above copyright notice and this permission notice shall be included in
jbe@7 19 # all copies or substantial portions of the Software.
jbe@7 20 #
jbe@7 21 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jbe@7 22 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jbe@7 23 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
jbe@7 24 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jbe@7 25 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
jbe@7 26 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
jbe@7 27 # DEALINGS IN THE SOFTWARE.
jbe@7 28
jbe@7 29
jbe@7 30 # This file contains derived data from a modified version of the
jbe@7 31 # Unicode data files. The following license applies to that data:
jbe@7 32 #
jbe@7 33 # COPYRIGHT AND PERMISSION NOTICE
jbe@7 34 #
jbe@7 35 # Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed
jbe@7 36 # under the Terms of Use in http://www.unicode.org/copyright.html.
jbe@7 37 #
jbe@7 38 # Permission is hereby granted, free of charge, to any person obtaining a
jbe@7 39 # copy of the Unicode data files and any associated documentation (the "Data
jbe@7 40 # Files") or Unicode software and any associated documentation (the
jbe@7 41 # "Software") to deal in the Data Files or Software without restriction,
jbe@7 42 # including without limitation the rights to use, copy, modify, merge,
jbe@7 43 # publish, distribute, and/or sell copies of the Data Files or Software, and
jbe@7 44 # to permit persons to whom the Data Files or Software are furnished to do
jbe@7 45 # so, provided that (a) the above copyright notice(s) and this permission
jbe@7 46 # notice appear with all copies of the Data Files or Software, (b) both the
jbe@7 47 # above copyright notice(s) and this permission notice appear in associated
jbe@7 48 # documentation, and (c) there is clear notice in each modified Data File or
jbe@7 49 # in the Software as well as in the documentation associated with the Data
jbe@7 50 # File(s) or Software that the data or software has been modified.
jbe@7 51 #
jbe@7 52 # THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
jbe@7 53 # KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
jbe@7 54 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
jbe@7 55 # THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
jbe@7 56 # INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
jbe@7 57 # CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
jbe@7 58 # USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
jbe@7 59 # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
jbe@7 60 # PERFORMANCE OF THE DATA FILES OR SOFTWARE.
jbe@7 61 #
jbe@7 62 # Except as contained in this notice, the name of a copyright holder shall
jbe@7 63 # not be used in advertising or otherwise to promote the sale, use or other
jbe@7 64 # dealings in these Data Files or Software without prior written
jbe@7 65 # authorization of the copyright holder.
jbe@7 66
jbe@7 67
# Returns the first part of +text+ matched by +pattern+.
# Raises a RuntimeError naming +label+ when the section is absent, instead of
# letting a nil propagate into an unrelated NoMethodError further down.
def ucd_section(text, pattern, label)
  text[pattern] || raise("Could not find the #{label} section.")
end

# Collects the code points listed at the start of the lines of +section+.
# A line starting with "XXXX..YYYY" contributes the whole inclusive range, a
# line starting with "XXXX" contributes that single code point, and every
# other line (comments, blank lines) is ignored.
def ucd_code_points(section)
  code_points = []
  section.each_line do |entry|
    case entry
    when /^([0-9A-F]+)\.\.([0-9A-F]+)/
      first = Regexp.last_match(1).hex
      last = Regexp.last_match(2).hex
      first.upto(last) { |code_point| code_points << code_point }
    when /^[0-9A-F]+/
      code_points << Regexp.last_match(0).hex
    end
  end
  code_points
end

# Each data file is read once; File.read closes the handle for us.
derived_core_properties = File.read("DerivedCoreProperties.txt")

$ignorable_list = ucd_section(
  derived_core_properties,
  /# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m,
  "Default_Ignorable_Code_Point"
)
$ignorable = ucd_code_points($ignorable_list)

$grapheme_extend_list = ucd_section(
  derived_core_properties,
  /# Derived Property: Grapheme_Extend.*?# Total code points:/m,
  "Grapheme_Extend"
)
$grapheme_extend = ucd_code_points($grapheme_extend_list)

composition_exclusions = File.read("CompositionExclusions.txt")

# Only lines that really start with a code point are taken. Calling
# String#hex on every line (comments and blank lines included) yields 0 for
# those lines, which used to mark U+0000 as a composition exclusion.
$exclusions = ucd_code_points(
  ucd_section(
    composition_exclusions,
    /# \(1\) Script Specifics.*?# Total code points:/m,
    "(1) Script Specifics"
  )
)

$excl_version = ucd_code_points(
  ucd_section(
    composition_exclusions,
    /# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m,
    "(2) Post Composition Version precomposed characters"
  )
)

$case_folding_string = File.read("CaseFolding.txt")

# Maps a code point to the list of code points it folds to.
# NOTE(review): lines with status C, F and S are all accepted and a later line
# for the same code point overwrites an earlier one -- confirm that letting an
# S entry replace an F entry is intended.
$case_folding = {}
$case_folding_string.each_line do |line|
  next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
  source = Regexp.last_match(1).hex
  $case_folding[source] = Regexp.last_match(2).split(" ").map(&:hex)
end

# Flat pool of -1 terminated integer sequences, filled by ary2c.
$int_array = []
# Maps an already stored sequence (an Array) to its offset in $int_array.
$int_array_indicies = {}
jbe@7 104
# Builds the name of a C constant of the form UTF8PROC_<prefix>_<STRING>.
# A nil +string+ yields the literal "0" instead.
def str2c(string, prefix)
  string.nil? ? "0" : "UTF8PROC_#{prefix}_#{string.upcase}"
end
# Returns a C expression that points at +array+ inside utf8proc_sequences.
# A nil +array+ yields "NULL". The first time a given array is seen its
# elements are appended to $int_array followed by a -1 terminator, and the
# start offset is remembered in $int_array_indicies so that equal arrays
# share one copy.
def ary2c(array)
  if array.nil?
    "NULL"
  else
    offset = $int_array_indicies[array]
    unless offset
      offset = $int_array.length
      $int_array_indicies[array] = offset
      array.each { |value| $int_array.push(value) }
      $int_array.push(-1)
    end
    "utf8proc_sequences + #{offset}"
  end
end
jbe@7 118
# One character record parsed from a line of UnicodeData.txt, able to render
# itself as one initializer row of the generated C property table.
class UnicodeChar
  attr_accessor :code, :name, :category, :combining_class, :bidi_class,
                :decomp_type, :decomp_mapping,
                :bidi_mirrored,
                :uppercase_mapping, :lowercase_mapping, :titlecase_mapping

  # Semicolon separated field layout of one UnicodeData.txt line.
  LINE_FORMAT = /^
    ([0-9A-F]+);        # code
    ([^;]+);            # name
    ([A-Z]+);           # general category
    ([0-9]+);           # canonical combining class
    ([A-Z]+);           # bidi class
    (<([A-Z]*)>)?       # decomposition type
    ((\ ?[0-9A-F]+)*);  # decomposition mapping
    ([0-9]*);           # decimal digit
    ([0-9]*);           # digit
    ([^;]*);            # numeric
    ([YN]*);            # bidi mirrored
    ([^;]*);            # unicode 1.0 name
    ([^;]*);            # iso comment
    ([0-9A-F]*);        # simple uppercase mapping
    ([0-9A-F]*);        # simple lowercase mapping
    ([0-9A-F]*)$/ix     # simple titlecase mapping

  # General categories that set the control flag emitted by #c_entry.
  CONTROL_BOUNDARY_CATEGORIES = %w[Zl Zp Cc Cf].freeze

  # U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER are category Cf
  # but are exempt from the control flag.
  ZERO_WIDTH_JOINERS = [0x200C, 0x200D].freeze

  # Parses +line+ (one UnicodeData.txt record).
  # Raises RuntimeError "Could not parse input." when the line does not have
  # the expected field layout.
  def initialize(line)
    fields = LINE_FORMAT.match(line)
    raise "Could not parse input." unless fields

    @code = fields[1].hex
    @name = fields[2]
    @category = fields[3]
    @combining_class = Integer(fields[4])
    @bidi_class = fields[5]
    # Group 7 is nil when the record has no <type> tag.
    @decomp_type = fields[7]
    @decomp_mapping = fields[8] == '' ? nil : fields[8].split.map(&:hex)
    @bidi_mirrored = fields[13] == 'Y'
    @uppercase_mapping = fields[16] == '' ? nil : fields[16].hex
    @lowercase_mapping = fields[17] == '' ? nil : fields[17].hex
    @titlecase_mapping = fields[18] == '' ? nil : fields[18].hex
  end

  # Case folding target sequence for this code point taken from the global
  # $case_folding table, or nil when there is none.
  def case_folding
    $case_folding[code]
  end

  # True for characters of category Zl, Zp, Cc or Cf, except U+200C and U+200D.
  # The exception has to be tested against the code point: comparing the
  # integers with the category string can never match, which silently
  # disabled the exception.
  # NOTE(review): presumably this feeds utf8proc's control_boundary property
  # field -- confirm against the struct definition in utf8proc.h.
  def control_boundary?
    CONTROL_BOUNDARY_CATEGORIES.include?(category) &&
      !ZERO_WIDTH_JOINERS.include?(code)
  end

  # Renders this character as one C initializer row, terminated by ",\n".
  #
  # comb1_indicies and comb2_indicies map code points to their index as first
  # and second element of a canonical composition pair. The first index is
  # pre-multiplied by the number of second elements; -1 is emitted when the
  # code point takes no part.
  #
  # Relies on the top-level helpers str2c and ary2c and on the globals
  # $exclusions, $excl_version, $ignorable and $grapheme_extend. Calling it
  # may append sequences to the pool maintained by ary2c.
  def c_entry(comb1_indicies, comb2_indicies)
    first_index = comb1_indicies[code]
    fields = [
      str2c(category, 'CATEGORY'),
      combining_class,
      str2c(bidi_class, 'BIDI_CLASS'),
      str2c(decomp_type, 'DECOMP_TYPE'),
      ary2c(decomp_mapping),
      bidi_mirrored,
      uppercase_mapping || -1,
      lowercase_mapping || -1,
      titlecase_mapping || -1,
      first_index ? first_index * comb2_indicies.keys.length : -1,
      comb2_indicies[code] || -1,
      $exclusions.include?(code) || $excl_version.include?(code),
      $ignorable.include?(code),
      control_boundary?,
      $grapheme_extend.include?(code),
      ary2c(case_folding)
    ]
    " {#{fields.join(', ')}},\n"
  end
end
jbe@7 178
# All parsed characters in input order, and the same objects keyed by code
# point.
chars = []
char_hash = {}

# Reads UnicodeData.txt records through Kernel#gets. A "<Name, First>" record
# and the "<Name, Last>" record that must follow it describe a whole range:
# every code point of the range gets a copy of the Last record, named
# "<Name>".
while (line = gets)
  first_marker = /^([0-9A-F]+);<[^;>,]+, First>;/i.match(line)

  if first_marker.nil?
    single = UnicodeChar.new(line)
    char_hash[single.code] = single
    chars << single
    next
  end

  closing_line = gets
  # Parsed before the Last marker is checked, so an unparsable line is
  # reported by UnicodeChar rather than by the check below.
  template = UnicodeChar.new(closing_line)
  last_marker = /^([0-9A-F]+);<([^;>,]+), Last>;/i.match(closing_line)
  raise "No last character of sequence found." unless last_marker

  range_name = "<#{last_marker[2]}>"
  (first_marker[1].hex..last_marker[1].hex).each do |code_point|
    member = template.clone
    member.code = code_point
    member.name = range_name
    char_hash[member.code] = member
    chars << member
  end
end
jbe@7 204
# comb1st_indicies / comb2nd_indicies number, in order of first appearance,
# the code points seen as first / second element of a two-element
# decomposition. comb_array[first index][second index] holds the code point
# that decomposes into that pair.
comb1st_indicies = {}
comb2nd_indicies = {}
comb_array = []

chars.each do |char|
  next if char.nil?
  # Only decompositions without a <type> tag are considered.
  next unless char.decomp_type.nil?

  pair = char.decomp_mapping
  next unless pair && pair.length == 2

  # The first element must be a known character with combining class 0.
  starter = char_hash[pair[0]]
  next if starter.nil? || starter.combining_class != 0
  next if $exclusions.include?(char.code)

  first_code, second_code = pair
  comb1st_indicies[first_code] ||= comb1st_indicies.keys.length
  comb2nd_indicies[second_code] ||= comb2nd_indicies.keys.length

  row = (comb_array[comb1st_indicies[first_code]] ||= [])
  column = comb2nd_indicies[second_code]
  raise "Duplicate canonical mapping" if row[column]
  row[column] = char.code
end
jbe@7 228
# properties collects every distinct rendered property row in order of first
# appearance; properties_indicies maps a row back to its position in that
# list.
properties_indicies = {}
properties = []
chars.each do |char|
  row = char.c_entry(comb1st_indicies, comb2nd_indicies)
  next if properties_indicies[row]

  properties_indicies[row] = properties.length
  properties << row
end
jbe@7 238
# Two-stage lookup table over the code points 0...0x110000, in blocks of
# 0x100. stage2 keeps each distinct block once; stage1 holds, per block, the
# offset (block position * 0x100) of its data within the flattened stage2.
# A stage2 value is the property row index plus one, 0 meaning "no such
# character".
stage1 = []
stage2 = []
0.step(0x10FFFF, 0x100) do |block_start|
  stage2_entry = (block_start...(block_start + 0x100)).map do |code_point|
    known = char_hash[code_point]
    if known
      properties_indicies[known.c_entry(comb1st_indicies, comb2nd_indicies)] + 1
    else
      0
    end
  end

  block_position = stage2.index(stage2_entry)
  if block_position.nil?
    block_position = stage2.length
    stage2 << stage2_entry
  end
  stage1 << (block_position * 0x100)
end
jbe@7 260
# Writes +header+, then every value followed by ", ", then the closing
# "};" line to $stdout. A line break is inserted before the 8th value and
# before every 8th value after it, so the first line carries seven values
# and later lines eight.
emit_table = lambda do |header, values|
  $stdout << header
  values.each_with_index do |value, position|
    $stdout << "\n " if position % 8 == 7
    $stdout << value << ", "
  end
  $stdout << "};\n\n"
end

emit_table.call("const int32_t utf8proc_sequences[] = {\n ", $int_array)
emit_table.call("const uint16_t utf8proc_stage1table[] = {\n ", stage1)
emit_table.call("const uint16_t utf8proc_stage2table[] = {\n ", stage2.flatten)

# Row 0 is the placeholder used for code points without a character record
# (stage2 value 0); the real rows follow.
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n"
properties.each { |row| $stdout << row }
$stdout << "};\n\n"

# Row-major dump of the composition matrix, -1 where a pair has no
# composition.
combinations = []
comb1st_indicies.keys.each_index do |first_index|
  comb2nd_indicies.keys.each_index do |second_index|
    combinations << (comb_array[first_index][second_index] || -1)
  end
end
emit_table.call("const int32_t utf8proc_combinations[] = {\n ", combinations)
jbe@7 317

Impressum / About Us