#!/usr/bin/env ruby

# This file was used to generate the 'unicode_data.c' file by parsing the
# Unicode data file 'UnicodeData.txt' of the Unicode Character Database.
# It is included for informational purposes only and not intended for
# production use.


# Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.


# This file contains derived data from a modified version of the
# Unicode data files. The following license applies to that data:
#
# COPYRIGHT AND PERMISSION NOTICE
#
# Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed
# under the Terms of Use in http://www.unicode.org/copyright.html.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of the Unicode data files and any associated documentation (the "Data
# Files") or Unicode software and any associated documentation (the
# "Software") to deal in the Data Files or Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, and/or sell copies of the Data Files or Software, and
# to permit persons to whom the Data Files or Software are furnished to do
# so, provided that (a) the above copyright notice(s) and this permission
# notice appear with all copies of the Data Files or Software, (b) both the
# above copyright notice(s) and this permission notice appear in associated
# documentation, and (c) there is clear notice in each modified Data File or
# in the Software as well as in the documentation associated with the Data
# File(s) or Software that the data or software has been modified.
#
# THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
# THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
# INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
# CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
# USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THE DATA FILES OR SOFTWARE.
#
# Except as contained in this notice, the name of a copyright holder shall
# not be used in advertising or otherwise to promote the sale, use or other
# dealings in these Data Files or Software without prior written
# authorization of the copyright holder.


# Returns the part of +text+ that starts at the "# Derived Property: <name>"
# heading for +property+ and ends at the first following
# "# Total code points:" marker.
#
# @param text [String] contents of DerivedCoreProperties.txt
# @param property [String] property name as written in the heading
# @return [String] the matched section
# @raise [RuntimeError] if no such section exists (previously this surfaced
#   later as a NoMethodError on nil)
def derived_property_section(text, property)
  section = text[/# Derived Property: #{Regexp.escape(property)}.*?# Total code points:/m]
  raise "Section for #{property} not found in DerivedCoreProperties.txt." if section.nil?
  section
end

# Expands the code point entries of a property section into a flat list.
#
# A line starting with "XXXX..YYYY" contributes every code point of that
# inclusive range; a line starting with a single upper-case hexadecimal number
# contributes that one code point.  All other lines are skipped.
#
# @param section [String] section text as returned by derived_property_section
# @return [Array<Integer>] code points in the order they are listed
def expand_code_points(section)
  code_points = []
  section.each_line do |entry|
    if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
      code_points.concat(($1.hex..$2.hex).to_a)
    elsif entry =~ /^[0-9A-F]+/
      code_points << $&.hex
    end
  end
  code_points
end

# Read the data file once and reuse the text for both properties.
derived_core_properties = File.read("DerivedCoreProperties.txt")

$ignorable_list = derived_property_section(derived_core_properties, "Default_Ignorable_Code_Point")
$ignorable = expand_code_points($ignorable_list)

$grapheme_extend_list = derived_property_section(derived_core_properties, "Grapheme_Extend")
$grapheme_extend = expand_code_points($grapheme_extend_list)

# Collects the code points listed in one section of CompositionExclusions.txt.
#
# Only lines that begin with an upper-case hexadecimal number contribute a
# code point.  Comment lines and blank lines are skipped; converting them with
# String#hex (as was done before) yields 0 and therefore wrongly recorded
# U+0000 as an excluded code point.
#
# @param text [String] contents of CompositionExclusions.txt
# @param section_pattern [Regexp] pattern that matches the wanted section
# @return [Array<Integer>] code points in the order they are listed
# @raise [RuntimeError] if the section cannot be found
def exclusion_code_points(text, section_pattern)
  section = text[section_pattern]
  raise "Expected section not found in CompositionExclusions.txt." if section.nil?
  code_points = []
  section.each_line do |line|
    code_points << $&.hex if line =~ /^[0-9A-F]+/
  end
  code_points
end

# Read the data file once and reuse the text for both sections.
composition_exclusions = File.read("CompositionExclusions.txt")

$exclusions = exclusion_code_points(
  composition_exclusions,
  /# \(1\) Script Specifics.*?# Total code points:/m
)

$excl_version = exclusion_code_points(
  composition_exclusions,
  /# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m
)

# Raw contents of CaseFolding.txt.  File.read closes the file again, unlike
# the former File.open(...).read which left the handle open.
$case_folding_string = File.read("CaseFolding.txt")

# Maps a code point to the list of code points it folds to.  Rows with status
# C, F or S are used; when several rows match the same code point, the last
# one in the file wins because it overwrites the earlier hash entry.
$case_folding = {}
$case_folding_string.each_line do |line|
  next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
  $case_folding[$1.hex] = $2.split(" ").map { |e| e.hex }
end

# Flat pool of integer sequences filled by ary2c; every stored sequence is
# followed by a -1 terminator.
$int_array = []
# Maps an already stored sequence (Array of Integers) to the offset of its
# first element inside $int_array.
$int_array_indicies = {}

# Builds the name of a UTF8PROC_* C constant.
#
# @param string [String, nil] symbolic value, e.g. a general category
# @param prefix [String] constant family, e.g. 'CATEGORY'
# @return [String] "0" when +string+ is nil, otherwise
#   "UTF8PROC_<prefix>_<STRING>" with +string+ upcased
def str2c(string, prefix)
  return "0" if string.nil?
  "UTF8PROC_#{prefix}_#{string.upcase}"
end
# Returns a C expression that points at +array+ inside utf8proc_sequences.
#
# The first time a given sequence is seen, its elements followed by a -1
# terminator are appended to $int_array and the start offset is remembered in
# $int_array_indicies; equal sequences seen later reuse that offset.
#
# @param array [Array<Integer>, nil] sequence to store
# @return [String] "NULL" when +array+ is nil, otherwise
#   "utf8proc_sequences + <offset>"
def ary2c(array)
  return "NULL" if array.nil?
  unless $int_array_indicies.key?(array)
    $int_array_indicies[array] = $int_array.length
    $int_array.concat(array)
    $int_array << -1
  end
  "utf8proc_sequences + #{$int_array_indicies[array]}"
end

# One record of UnicodeData.txt, plus the logic that renders it as one row of
# the generated utf8proc_properties table.
class UnicodeChar
  attr_accessor :code, :name, :category, :combining_class, :bidi_class,
                :decomp_type, :decomp_mapping,
                :bidi_mirrored,
                :uppercase_mapping, :lowercase_mapping, :titlecase_mapping

  # Pattern for one semicolon-separated UnicodeData.txt record.
  LINE_PATTERN = /^
    ([0-9A-F]+);        # code
    ([^;]+);            # name
    ([A-Z]+);           # general category
    ([0-9]+);           # canonical combining class
    ([A-Z]+);           # bidi class
    (<([A-Z]*)>)?       # decomposition type
    ((\ ?[0-9A-F]+)*);  # decomposition mapping
    ([0-9]*);           # decimal digit
    ([0-9]*);           # digit
    ([^;]*);            # numeric
    ([YN]*);            # bidi mirrored
    ([^;]*);            # unicode 1.0 name
    ([^;]*);            # iso comment
    ([0-9A-F]*);        # simple uppercase mapping
    ([0-9A-F]*);        # simple lowercase mapping
    ([0-9A-F]*)$/ix     # simple titlecase mapping

  # General categories that are flagged as control boundaries in c_entry.
  CONTROL_BOUNDARY_CATEGORIES = %W[Zl Zp Cc Cf]

  # Code points exempted from the control boundary flag (U+200C and U+200D).
  CONTROL_BOUNDARY_EXCEPTIONS = [0x200C, 0x200D]

  # Parses one UnicodeData.txt record.
  #
  # @param line [String, nil] a single line of UnicodeData.txt
  # @raise [RuntimeError] "Could not parse input." if +line+ is nil or does
  #   not match LINE_PATTERN
  def initialize(line)
    match = line && LINE_PATTERN.match(line)
    raise "Could not parse input." unless match
    @code = match[1].hex
    @name = match[2]
    @category = match[3]
    # The pattern guarantees decimal digits; to_i avoids Integer() reading a
    # zero-padded value as octal.
    @combining_class = match[4].to_i
    @bidi_class = match[5]
    @decomp_type = match[7]
    @decomp_mapping = match[8].empty? ? nil : match[8].split.map { |element| element.hex }
    @bidi_mirrored = (match[13] == 'Y')
    @uppercase_mapping = match[16].empty? ? nil : match[16].hex
    @lowercase_mapping = match[17].empty? ? nil : match[17].hex
    @titlecase_mapping = match[18].empty? ? nil : match[18].hex
  end

  # @return [Array<Integer>, nil] the entry of $case_folding for this code
  #   point, or nil if there is none
  def case_folding
    $case_folding[code]
  end

  # Renders this character as one C initializer row.
  #
  # The row has sixteen comma-separated fields, in this order: category
  # constant, combining class, bidi class constant, decomposition type
  # constant, decomposition sequence pointer, bidi mirrored flag, uppercase,
  # lowercase and titlecase mapping (-1 when absent), first and second
  # composition index (-1 when absent), composition exclusion flag, default
  # ignorable flag, control boundary flag, grapheme extend flag and case
  # folding sequence pointer.
  #
  # Calling this may append sequences to $int_array through ary2c.
  #
  # @param comb1_indicies [Hash{Integer=>Integer}] index of each code point
  #   that occurs as first element of a composition pair
  # @param comb2_indicies [Hash{Integer=>Integer}] index of each code point
  #   that occurs as second element of a composition pair
  # @return [String] the row, terminated by ",\n"
  def c_entry(comb1_indicies, comb2_indicies)
    first_index = comb1_indicies[code]
    fields = [
      str2c(category, 'CATEGORY'),
      combining_class,
      str2c(bidi_class, 'BIDI_CLASS'),
      str2c(decomp_type, 'DECOMP_TYPE'),
      ary2c(decomp_mapping),
      bidi_mirrored,
      uppercase_mapping || -1,
      lowercase_mapping || -1,
      titlecase_mapping || -1,
      # The first index is stored pre-multiplied by the number of second
      # elements.
      first_index ? first_index * comb2_indicies.keys.length : -1,
      comb2_indicies[code] || -1,
      $exclusions.include?(code) || $excl_version.include?(code),
      $ignorable.include?(code),
      # The exception list holds code points, so it must be tested against
      # +code+; testing it against +category+ never matched anything.
      CONTROL_BOUNDARY_CATEGORIES.include?(category) &&
        !CONTROL_BOUNDARY_EXCEPTIONS.include?(code),
      $grapheme_extend.include?(code),
      ary2c(case_folding)
    ]
    " {#{fields.join(', ')}},\n"
  end
end

# All parsed characters in input order, and the same objects keyed by code
# point.
chars = []
char_hash = {}

# Read UnicodeData.txt records through Kernel#gets.  A "<..., First>" record
# together with the following "<..., Last>" record describes a whole range:
# every code point in it gets a copy of the "Last" record's properties and
# the name "<...>".
while (line = gets)
  if line =~ /^([0-9A-F]+);<[^;>,]+, First>;/i
    first = $1.hex
    last_line = gets
    # Check the range marker before parsing so that a missing or wrong
    # follow-up line is reported with this message.
    unless last_line =~ /^([0-9A-F]+);<([^;>,]+), Last>;/i
      raise "No last character of sequence found."
    end
    last = $1.hex
    name = "<#{$2}>"
    template = UnicodeChar.new(last_line)
    (first..last).each do |code|
      char_clone = template.clone
      char_clone.code = code
      char_clone.name = name
      char_hash[char_clone.code] = char_clone
      chars << char_clone
    end
  else
    char = UnicodeChar.new(line)
    char_hash[char.code] = char
    chars << char
  end
end

# comb1st_indicies / comb2nd_indicies number the code points that occur as
# first / second element of a two-element decomposition, in order of first
# appearance.  comb_array[first index][second index] holds the code point
# that decomposes into that pair.
comb1st_indicies = {}
comb2nd_indicies = {}
comb_array = []

chars.each do |char|
  next if char.nil?
  # Only decompositions without a <type> tag are considered.
  next unless char.decomp_type.nil?
  mapping = char.decomp_mapping
  next unless mapping && mapping.length == 2
  first, second = mapping
  # The first element must be a known character with combining class 0.
  starter = char_hash[first]
  next if starter.nil? || starter.combining_class != 0
  next if $exclusions.include?(char.code)

  row = (comb1st_indicies[first] ||= comb1st_indicies.length)
  column = (comb2nd_indicies[second] ||= comb2nd_indicies.length)
  comb_array[row] ||= []
  raise "Duplicate canonical mapping" if comb_array[row][column]
  comb_array[row][column] = char.code
end

# Deduplicate the rendered property rows: `properties` lists each distinct
# row once, in order of first appearance, and `properties_indicies` maps a
# row to its position in that list.
properties_indicies = {}
properties = []
chars.each do |char|
  entry = char.c_entry(comb1st_indicies, comb2nd_indicies)
  next if properties_indicies.key?(entry)
  properties_indicies[entry] = properties.length
  properties << entry
end

# Build the two-stage lookup tables.  The code space 0...0x110000 is cut into
# blocks of 0x100 code points.  stage2 holds each distinct block once; a block
# entry is the property row index plus one, or 0 for a code point that is not
# in char_hash.  stage1 holds, per block, the offset (block number * 0x100) of
# that block within the flattened stage2.
stage1 = []
stage2 = []
# Position of each block already stored in stage2, keyed by block content, so
# duplicates are found without scanning stage2.
stage2_positions = {}
0.step(0x10FFFF, 0x100) do |block_start|
  stage2_entry = (block_start...(block_start + 0x100)).map do |code_point|
    uchar = char_hash[code_point]
    if uchar
      properties_indicies[uchar.c_entry(comb1st_indicies, comb2nd_indicies)] + 1
    else
      0
    end
  end
  position = stage2_positions[stage2_entry]
  if position.nil?
    position = stage2.length
    stage2_positions[stage2_entry] = position
    stage2 << stage2_entry
  end
  stage1 << (position * 0x100)
end

# Writes one C array of integers to +out+.
#
# The opening line is "<declaration>[] = {", each value is followed by ", ",
# and a line break is inserted before every eighth value counted from the
# start (so the first output line carries seven values, later ones eight).
#
# @param out [#<<] output stream
# @param declaration [String] C type and array name, without "[]"
# @param values [Array<Integer>] array contents
def emit_int_table(out, declaration, values)
  out << "#{declaration}[] = {\n "
  column = 0
  values.each do |value|
    column += 1
    if column == 8
      column = 0
      out << "\n "
    end
    out << value << ", "
  end
  out << "};\n\n"
end

emit_int_table($stdout, "const int32_t utf8proc_sequences", $int_array)
emit_int_table($stdout, "const uint16_t utf8proc_stage1table", stage1)
emit_int_table($stdout, "const uint16_t utf8proc_stage2table", stage2.flatten)

# Row 0 is a placeholder; stage2 entries are property indices plus one, with 0
# standing for a code point that has no record.
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n"
properties.each { |line| $stdout << line }
$stdout << "};\n\n"

# Flatten the composition matrix row by row; pairs without a composition are
# written as -1.
combinations = []
comb1st_indicies.keys.each_index do |a|
  comb2nd_indicies.keys.each_index do |b|
    combinations << (comb_array[a][b] || -1)
  end
end
emit_int_table($stdout, "const int32_t utf8proc_combinations", combinations)
