utf8proc

view data_generator.rb @ 15:15450ff3d454

Contribution from libmojibake fork
author Jiahao Chen, Steven G. Johnson, Anthony David Kelman
date Fri Nov 21 08:27:44 2014 -0500 (2014-11-21)
parents 00d2bcbdc945
children
line source
1 #!/usr/bin/env ruby
3 # This file was used to generate the 'unicode_data.c' file by parsing the
4 # Unicode data file 'UnicodeData.txt' of the Unicode Character Database.
5 # It is included for informational purposes only and not intended for
6 # production use.
9 # Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
10 #
11 # Permission is hereby granted, free of charge, to any person obtaining a
12 # copy of this software and associated documentation files (the "Software"),
13 # to deal in the Software without restriction, including without limitation
14 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
15 # and/or sell copies of the Software, and to permit persons to whom the
16 # Software is furnished to do so, subject to the following conditions:
17 #
18 # The above copyright notice and this permission notice shall be included in
19 # all copies or substantial portions of the Software.
20 #
21 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 # DEALINGS IN THE SOFTWARE.
30 # This file contains derived data from a modified version of the
31 # Unicode data files. The following license applies to that data:
32 #
33 # COPYRIGHT AND PERMISSION NOTICE
34 #
35 # Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed
36 # under the Terms of Use in http://www.unicode.org/copyright.html.
37 #
38 # Permission is hereby granted, free of charge, to any person obtaining a
39 # copy of the Unicode data files and any associated documentation (the "Data
40 # Files") or Unicode software and any associated documentation (the
41 # "Software") to deal in the Data Files or Software without restriction,
42 # including without limitation the rights to use, copy, modify, merge,
43 # publish, distribute, and/or sell copies of the Data Files or Software, and
44 # to permit persons to whom the Data Files or Software are furnished to do
45 # so, provided that (a) the above copyright notice(s) and this permission
46 # notice appear with all copies of the Data Files or Software, (b) both the
47 # above copyright notice(s) and this permission notice appear in associated
48 # documentation, and (c) there is clear notice in each modified Data File or
49 # in the Software as well as in the documentation associated with the Data
50 # File(s) or Software that the data or software has been modified.
51 #
52 # THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
53 # KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
54 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
55 # THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
56 # INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
57 # CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
58 # USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
59 # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
60 # PERFORMANCE OF THE DATA FILES OR SOFTWARE.
61 #
62 # Except as contained in this notice, the name of a copyright holder shall
63 # not be used in advertising or otherwise to promote the sale, use or other
64 # dealings in these Data Files or Software without prior written
65 # authorization of the copyright holder.
# Extract the Default_Ignorable_Code_Point section of
# DerivedCoreProperties.txt and expand it into a flat list of code points.
# Each data line starts with either a single hex code point or an inclusive
# "XXXX..YYYY" range; any other line (comments, blanks) is skipped.
$ignorable_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m]
$ignorable = []
$ignorable_list.each_line do |entry|
  case entry
  when /^([0-9A-F]+)\.\.([0-9A-F]+)/
    # Inclusive range: add every code point between the two bounds.
    $ignorable.concat(($1.hex..$2.hex).to_a)
  when /^[0-9A-F]+/
    # Single code point.
    $ignorable << $&.hex
  end
end
# Extract the Grapheme_Extend section of DerivedCoreProperties.txt and
# expand it into a flat list of code points. Data lines begin with a hex
# code point or an inclusive "XXXX..YYYY" range; other lines are ignored.
$grapheme_extend_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Grapheme_Extend.*?# Total code points:/m]
$grapheme_extend = []
$grapheme_extend_list.each_line do |entry|
  case entry
  when /^([0-9A-F]+)\.\.([0-9A-F]+)/
    # Inclusive range: add every code point between the two bounds.
    $grapheme_extend.concat(($1.hex..$2.hex).to_a)
  when /^[0-9A-F]+/
    # Single code point.
    $grapheme_extend << $&.hex
  end
end
# Composition exclusions, section (1) "Script Specifics" of
# CompositionExclusions.txt, as a list of integer code points.
#
# Only lines that begin with a hex digit are data. Comment and blank lines
# are dropped explicitly: String#hex returns 0 for them, which would
# otherwise put U+0000 into the list and mark it as a composition exclusion.
$exclusions = File.read("CompositionExclusions.txt")[/# \(1\) Script Specifics.*?# Total code points:/m]
$exclusions = $exclusions.each_line.select { |e| e =~ /\A\s*[0-9A-F]/i }.collect { |e| e.hex }

# Composition exclusions, section (2) "Post Composition Version precomposed
# characters", parsed with the same rule as above.
$excl_version = File.read("CompositionExclusions.txt")[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m]
$excl_version = $excl_version.each_line.select { |e| e =~ /\A\s*[0-9A-F]/i }.collect { |e| e.hex }
# Raw contents of CaseFolding.txt. File.read opens, reads and closes the
# file in one call, so no file handle is left open.
$case_folding_string = File.read("CaseFolding.txt")

# Maps a code point (Integer) to its case-folded sequence (Array of Integer).
# Only entries with status C, F or S are used; when a code point appears on
# several such lines, the last one in the file wins.
$case_folding = {}
$case_folding_string.each_line do |line|
  next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
  $case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex }
end
# Flat pool of integers filled by ary2c; each stored sequence is followed by
# a -1 terminator. It is later written out as the C array utf8proc_sequences.
$int_array = Array.new
# Maps an already-stored sequence (Array) to its start offset in $int_array.
$int_array_indicies = Hash.new
# Turns a property value into the name of the matching C constant.
#
# string - property value (e.g. a category abbreviation), or nil.
# prefix - constant family inserted after "UTF8PROC_".
#
# Returns "0" when string is nil, otherwise "UTF8PROC_<prefix>_<STRING>"
# with the value upper-cased.
def str2c(string, prefix)
  string.nil? ? "0" : "UTF8PROC_#{prefix}_#{string.upcase}"
end
# Returns a C expression pointing at the given integer sequence inside the
# utf8proc_sequences array.
#
# array - Array of Integer, or nil.
#
# Returns "NULL" for nil. Otherwise the sequence is appended to $int_array
# (followed by a -1 terminator) the first time it is seen, and the string
# "utf8proc_sequences + <offset>" is returned; equal sequences share one
# offset.
def ary2c(array)
  return "NULL" if array.nil?

  offset = $int_array_indicies[array]
  if offset.nil?
    offset = $int_array.length
    $int_array_indicies[array] = offset
    $int_array.concat(array)
    $int_array.push(-1)
  end
  "utf8proc_sequences + #{offset}"
end
# One parsed record of UnicodeData.txt, able to render itself as an
# initializer line for the C array utf8proc_properties.
class UnicodeChar
  attr_accessor :code, :name, :category, :combining_class, :bidi_class,
                :decomp_type, :decomp_mapping,
                :bidi_mirrored,
                :uppercase_mapping, :lowercase_mapping, :titlecase_mapping

  # Parses one semicolon-separated record.
  #
  # line - String holding a single record (a trailing newline is allowed).
  #
  # Raises RuntimeError ("Could not parse input.") when the line does not
  # match the expected field layout.
  def initialize(line)
    raise "Could not parse input." unless line =~ /^
      ([0-9A-F]+);        # code
      ([^;]+);            # name
      ([A-Z]+);           # general category
      ([0-9]+);           # canonical combining class
      ([A-Z]+);           # bidi class
      (<([A-Z]*)>)?       # decomposition type
      ((\ ?[0-9A-F]+)*);  # decomposition mapping
      ([0-9]*);           # decimal digit
      ([0-9]*);           # digit
      ([^;]*);            # numeric
      ([YN]*);            # bidi mirrored
      ([^;]*);            # unicode 1.0 name
      ([^;]*);            # iso comment
      ([0-9A-F]*);        # simple uppercase mapping
      ([0-9A-F]*);        # simple lowercase mapping
      ([0-9A-F]*)$/ix     # simple titlecase mapping
    @code = $1.hex
    @name = $2
    @category = $3
    @combining_class = Integer($4)
    @bidi_class = $5
    # $7 is the text inside <...>; nil when the mapping has no type tag.
    @decomp_type = $7
    @decomp_mapping = ($8 == '') ? nil : $8.split.collect { |element| element.hex }
    @bidi_mirrored = ($13 == 'Y')
    @uppercase_mapping = ($16 == '') ? nil : $16.hex
    @lowercase_mapping = ($17 == '') ? nil : $17.hex
    @titlecase_mapping = ($18 == '') ? nil : $18.hex
  end

  # Returns the case-folded sequence recorded for this code point in
  # $case_folding, or nil when there is none.
  def case_folding
    $case_folding[code]
  end

  # Renders this character as one C struct initializer line.
  #
  # comb1_indicies - Hash mapping a first code point of a composition to
  #                  its row index.
  # comb2_indicies - Hash mapping a second code point of a composition to
  #                  its column index.
  #
  # Returns a String of the form " {field, field, ...},\n". Calling it may
  # append to $int_array through ary2c.
  def c_entry(comb1_indicies, comb2_indicies)
    comb1 = comb1_indicies[code]
    # Characters of category Zl, Zp, Cc or Cf are control boundaries, except
    # U+200C and U+200D. The exception must be tested against the code
    # point; comparing against the category string can never match.
    control_boundary = %w[Zl Zp Cc Cf].include?(category) &&
                       ![0x200C, 0x200D].include?(code)
    fields = [
      str2c(category, 'CATEGORY'),
      combining_class,
      str2c(bidi_class, 'BIDI_CLASS'),
      str2c(decomp_type, 'DECOMP_TYPE'),
      ary2c(decomp_mapping),
      bidi_mirrored,
      uppercase_mapping || -1,
      lowercase_mapping || -1,
      titlecase_mapping || -1,
      # Row index is pre-multiplied by the column count so that the C side
      # can add the column index directly.
      comb1 ? comb1 * comb2_indicies.keys.length : -1,
      comb2_indicies[code] || -1,
      $exclusions.include?(code) || $excl_version.include?(code),
      $ignorable.include?(code),
      control_boundary,
      $grapheme_extend.include?(code),
      ary2c(case_folding)
    ]
    " {#{fields.join(', ')}},\n"
  end
end
# Read every record from the input (Kernel#gets: files named on the command
# line, else standard input) into `chars` (in input order) and `char_hash`
# (keyed by code point).
#
# A pair of "<Name, First>" / "<Name, Last>" records describes a whole range:
# the properties are taken from the Last record and copied to every code
# point from First to Last, each named "<Name>".
chars = []
char_hash = {}

while (record = gets)
  if record =~ /^([0-9A-F]+);<[^;>,]+, First>;/i
    first = $1.hex
    closing_record = gets
    char = UnicodeChar.new(closing_record)
    unless closing_record =~ /^([0-9A-F]+);<([^;>,]+), Last>;/i
      raise "No last character of sequence found."
    end
    last = $1.hex
    name = "<#{$2}>"
    (first..last).each do |code_point|
      range_member = char.clone
      range_member.code = code_point
      range_member.name = name
      char_hash[range_member.code] = range_member
      chars << range_member
    end
  else
    char = UnicodeChar.new(record)
    char_hash[char.code] = char
    chars << char
  end
end
# Build the canonical composition tables.
#
# comb1st_indicies - first code point of a pair  => row index
# comb2nd_indicies - second code point of a pair => column index
# comb_array       - comb_array[row][column] = composed code point
#
# A character contributes a pair when its decomposition has no type tag, is
# exactly two code points long, its first code point is a known character
# with combining class 0, and the character is not listed in $exclusions.
comb1st_indicies = {}
comb2nd_indicies = {}
comb_array = []

chars.each do |char|
  next if char.nil?
  next unless char.decomp_type.nil?

  mapping = char.decomp_mapping
  next unless mapping && mapping.length == 2

  starter = char_hash[mapping[0]]
  next if starter.nil? || starter.combining_class != 0
  next if $exclusions.include?(char.code)

  first_cp, second_cp = mapping
  # Indices are handed out in order of first appearance.
  comb1st_indicies[first_cp] ||= comb1st_indicies.keys.length
  comb2nd_indicies[second_cp] ||= comb2nd_indicies.keys.length

  row = (comb_array[comb1st_indicies[first_cp]] ||= [])
  column = comb2nd_indicies[second_cp]
  raise "Duplicate canonical mapping" if row[column]
  row[column] = char.code
end
# De-duplicate the rendered property entries: `properties` holds each
# distinct C initializer line once, in order of first appearance, and
# `properties_indicies` maps a line to its position in that list.
properties_indicies = {}
properties = []
chars.each do |char|
  rendered = char.c_entry(comb1st_indicies, comb2nd_indicies)
  next if properties_indicies[rendered]

  properties_indicies[rendered] = properties.length
  properties << rendered
end
# Build the two-stage lookup table over code points 0...0x110000.
#
# stage2 - list of distinct 256-entry blocks; each entry is the character's
#          index in `properties` plus one, or 0 when the code point has no
#          record.
# stage1 - one entry per 256-code-point block: the offset (block position
#          times 0x100) of the matching block in the flattened stage2.
stage1 = []
stage2 = []
(0...0x110000).step(0x100) do |block_start|
  block = (block_start...(block_start + 0x100)).map do |code_point|
    known = char_hash[code_point]
    if known
      properties_indicies[known.c_entry(comb1st_indicies, comb2nd_indicies)] + 1
    else
      0
    end
  end

  # Identical blocks are stored once and shared.
  position = stage2.index(block)
  if position.nil?
    position = stage2.length
    stage2 << block
  end
  stage1 << (position * 0x100)
end
# Write the generated C tables to standard output.
#
# `put` writes one array element followed by ", ". Every time the running
# count reaches 8 it first starts a new line and resets the count; the count
# is reset to 0 before each table.
column = 0
put = lambda do |value|
  column += 1
  if column == 8
    column = 0
    $stdout << "\n "
  end
  $stdout << value << ", "
end

# Pool of -1 terminated integer sequences referenced by the property entries.
$stdout << "const int32_t utf8proc_sequences[] = {\n "
column = 0
$int_array.each(&put)
$stdout << "};\n\n"

# First-stage table: one offset per block of 256 code points.
$stdout << "const uint16_t utf8proc_stage1table[] = {\n "
column = 0
stage1.each(&put)
$stdout << "};\n\n"

# Second-stage table: all distinct blocks laid out back to back.
$stdout << "const uint16_t utf8proc_stage2table[] = {\n "
column = 0
stage2.flatten.each(&put)
$stdout << "};\n\n"

# Property entries; index 0 is a fixed default entry written before the
# de-duplicated ones.
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n"
properties.each { |rendered| $stdout << rendered }
$stdout << "};\n\n"

# Composition matrix in row-major order; -1 marks a pair with no composition.
$stdout << "const int32_t utf8proc_combinations[] = {\n "
column = 0
comb1st_indicies.keys.each_index do |a|
  comb2nd_indicies.keys.each_index do |b|
    put.call(comb_array[a][b] || -1)
  end
end
$stdout << "};\n\n"

Impressum / About Us