#!/usr/bin/env ruby
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: MPL-2.0

#
# This script has been updated to accept more command-line arguments:
#
#    -u, --url                        URL to process
#    -m, --machine                    Machine name
#    -p, --properties                 Properties to add to the machine
#    -o, --output                     Write output to file
#
# Updated by: Marty Schoch <marty.schoch@gmail.com>
#
# This script uses the unicode spec to generate a Ragel state machine
# that recognizes unicode alphanumeric characters.  It generates 5
# character classes: uupper, ulower, ualpha, udigit, and ualnum.
# Currently supported encodings are UTF-8 [default] and UCS-4.
#
# Usage: unicode2ragel.rb [options]
#    -e, --encoding [ucs4 | utf8]     Data encoding
#    -h, --help                       Show this message
#
# This script was originally written as part of the Ferret search
# engine library.
#
# Author: Rakan El-Khalil <rakan@well.com>

require 'optparse'
require 'open-uri'

ENCODINGS = [ :utf8, :ucs4 ]
ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }
DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt"
DEFAULT_MACHINE_NAME= "WChar"

###
# Display vars & default option

TOTAL_WIDTH = 80
RANGE_WIDTH = 23
@encoding = :utf8
@chart_url = DEFAULT_CHART_URL
machine_name = DEFAULT_MACHINE_NAME
properties = []
@output = $stdout

###
# Option parsing

cli_opts = OptionParser.new do |opts|
  opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
    # The argument is declared optional, so a bare "-e" hands us nil.
    # to_s turns that into an empty symbol, which the ENCODINGS check
    # below rejects with the usage text instead of a NoMethodError.
    @encoding = o.to_s.downcase.to_sym
  end
  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
  opts.on("-u", "--url URL", "URL to process") do |o|
    @chart_url = o
  end
  opts.on("-m", "--machine MACHINE_NAME", "Machine name") do |o|
    machine_name = o
  end
  opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do |o|
    properties = o
  end
  opts.on("-o", "--output FILE", "output file") do |o|
    @output = File.new(o, "w+")
  end
end

cli_opts.parse(ARGV)
unless ENCODINGS.member? @encoding
  puts "Invalid encoding: #{@encoding}"
  puts cli_opts
  # Non-zero status so that calling build steps notice the failure.
  exit 1
end

##
# Downloads the document at url and yields every alpha line's hex
# range and description.
#
# url::      location of the property chart; opened through URI.open
#            because open-uri no longer hooks Kernel#open for URLs on
#            Ruby 3.0+. URI.open hands anything that is not a
#            "scheme://" string on to Kernel#open, so local paths
#            keep working.
# property:: property name; only lines containing "; <property> #"
#            are yielded.
#
# Yields an Integer Range of codepoints and the stripped text that
# follows the last '#' on the line.

def each_alpha( url, property )
  URI.open( url ) do |file|
    file.each_line do |line|
      next if line =~ /^#/
      next if line !~ /; #{property} #/

      range, description = line.split(/;/)
      range.strip!
      # Non-bang calls: the bang variants return nil when nothing
      # changes, which makes chaining them fragile.
      description = description.gsub(/.*#/, '').strip

      if range =~ /\.\./
        start, stop = range.split '..'
      else
        start = stop = range
      end

      yield(start.hex..stop.hex, description)
    end
  end
end

###
# Formats to hex at minimum width
#
# The result is upper-case and always has an even number of digits
# (a leading "0" is added when needed), i.e. whole bytes.

def to_hex( n )
  r = "%0X" % n
  r = "0#{r}" unless (r.length % 2).zero?
  r
end

###
# UCS4 is just a straight hex conversion of the unicode codepoint.
# Returns a one-element array holding the range as a hex literal:
# "0xAA" for a single codepoint, "0x41..0x5A" otherwise.

def to_ucs4( range )
  rangestr = "0x" + to_hex(range.begin)
  rangestr << "..0x" + to_hex(range.end) if range.begin != range.end
  [ rangestr ]
end

##
# 0x00     - 0x7f     -> 0zzzzzzz[7]
# 0x80     - 0x7ff    -> 110yyyyy[5] 10zzzzzz[6]
# 0x800    - 0xffff   -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]

UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]

##
# Encodes codepoint n as UTF-8 and returns the encoded bytes as one
# upper-case hex string (e.g. 0x1000 -> "E18080"). Codepoints above
# 0x10ffff fall through every branch and come back as "00".

def to_utf8_enc( n )
  r = 0
  if n <= 0x7f
    r = n
  elsif n <= 0x7ff
    y = 0xc0 | (n >> 6)
    z = 0x80 | (n & 0x3f)
    r = y << 8 | z
  elsif n <= 0xffff
    x = 0xe0 | (n >> 12)
    y = 0x80 | (n >> 6) & 0x3f
    z = 0x80 | n & 0x3f
    r = x << 16 | y << 8 | z
  elsif n <= 0x10ffff
    w = 0xf0 | (n >> 18)
    x = 0x80 | (n >> 12) & 0x3f
    y = 0x80 | (n >> 6) & 0x3f
    z = 0x80 | n & 0x3f
    r = w << 24 | x << 16 | y << 8 | z
  end

  to_hex(r)
end

##
# Inverse of to_utf8_enc: takes the hex string of an encoded sequence
# and returns the codepoint as an Integer. The branch is chosen by the
# magnitude of the packed value, so a lone lead or continuation byte
# (e.g. "BF") is decoded via the two-byte branch to its low six bits.

def from_utf8_enc( n )
  n = n.hex
  r = 0
  if n <= 0x7f
    r = n
  elsif n <= 0xdfff
    y = (n >> 8) & 0x1f
    z = n & 0x3f
    r = y << 6 | z
  elsif n <= 0xefffff
    x = (n >> 16) & 0x0f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    # x sits above the two 6-bit groups y and z, hence a shift of 12.
    r = x << 12 | y << 6 | z
  elsif n <= 0xf7ffffff
    w = (n >> 24) & 0x07
    x = (n >> 16) & 0x3f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    r = w << 18 | x << 12 | y << 6 | z
  end
  r
end

###
# Given a range, splits it up into ranges that can be continuously
# encoded into utf8.  Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
# This is not strictly needed since the current [5.1] unicode standard
# doesn't have ranges that straddle utf8 boundaries.  This is included
# for completeness as there is no telling if that will ever change.
#
# Any part of the range above 0x10ffff is dropped, so a range that lies
# entirely above it yields an empty array.

def utf8_ranges( range )
  ranges = []
  UTF8_BOUNDARIES.each do |max|
    if range.begin <= max
      if range.end <= max
        ranges << range
        return ranges
      end

      ranges << (range.begin .. max)
      range = (max + 1) .. range.end
    end
  end
  ranges
end

##
# Turns a pair of equal-length hex strings (two digits per byte) into
# an array of pattern lines. Each line is a run of space-terminated
# "0xNN" / "0xNN..0xMM" terms, one term per byte position.
#
# Bytes below the first differing position are bounded by 00 and FF,
# not by the UTF-8 continuation range.

def build_range( start, stop )
  size = start.size/2
  left = size - 1
  return [""] if size < 1

  a = start[0..1]
  b = stop[0..1]

  ###
  # Shared prefix

  if a == b
    return build_range(start[2..-1], stop[2..-1]).map do |elt|
      "0x#{a} " + elt
    end
  end

  ###
  # Unshared prefix, end of run

  return ["0x#{a}..0x#{b} "] if left.zero?

  ###
  # Unshared prefix, not end of run
  # Range can be 0x123456..0x56789A
  # Which is equivalent to:
  #     0x123456 .. 0x12FFFF
  #     0x130000 .. 0x55FFFF
  #     0x560000 .. 0x56789A

  ret = []
  ret << build_range(start, a + "FF" * left)

  ###
  # Only generate middle range if need be.

  if a.hex+1 != b.hex
    max = to_hex(b.hex - 1)
    max = "FF" if b == "FF"
    ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left
  end

  ###
  # Don't generate last range if it is covered by first range

  ret << build_range(b + "00" * left, stop) unless b == "FF"
  # ret always holds at least one nested array here; the non-bang
  # flatten returns the same flat list without the nil-on-no-change
  # behaviour of flatten!.
  ret.flatten
end

##
# Returns the pattern lines covering range in UTF-8: the range is split
# at the encoding-length boundaries and each piece is expanded by
# build_range. Returns [] when utf8_ranges yields nothing.

def to_utf8( range )
  utf8_ranges( range ).map do |r|
    begin_enc = to_utf8_enc(r.begin)
    end_enc = to_utf8_enc(r.end)
    build_range begin_enc, end_enc
  end.flatten
end

##
# Perform a 3-way comparison of the number of codepoints advertised by
# the unicode spec for the given range, the originally parsed range,
# and the resulting utf8 encoded range.
# Counts the codepoints matched by one generated pattern line: the
# sizes of all "0xAA..0xBB" terms in the line are multiplied together;
# single-value terms contribute a factor of one. For utf8 the bounds
# of each term are decoded with from_utf8_enc before subtracting.

def count_codepoints( code )
  code.split(' ').inject(1) do |acc, elt|
    if elt =~ /0x(.+)\.\.0x(.+)/
      if @encoding == :utf8
        acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1)
      else
        acc * ($2.hex - $1.hex + 1)
      end
    else
      acc
    end
  end
end

# True when the total codepoint count of codes equals both the "[N]"
# count found in desc (1 when desc carries none) and the size of range.

def is_valid?( range, desc, codes )
  spec_count  = 1
  spec_count  = $1.to_i if desc =~ /\[(\d+)\]/
  range_count = range.end - range.begin + 1

  sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) }
  sum == spec_count and sum == range_count
end

##
# Generate the state machine to @output
#
# name::     name of the Ragel definition to emit
# property:: property name handed to each_alpha to select chart lines
#
# NOTE(review): runs of whitespace inside the string literals below may
# have been collapsed when this file was extracted; confirm the intended
# indentation of the emitted lines against the upstream script.

def generate_machine( name, property )
  pipe = " "
  @output.puts " #{name} = "
  each_alpha( @chart_url, property ) do |range, desc|

    codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range)

    # A range with no encodable codepoints produces nothing to print;
    # skip it rather than failing on nil further down.
    if codes.nil? || codes.empty?
      warn "Skipping #{range}: no #{@encoding} encoding for this range"
      next
    end

    #raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
    #  is_valid? range, desc, codes

    range_width = codes.map { |a| a.size }.max
    range_width = RANGE_WIDTH if range_width < RANGE_WIDTH

    desc_width  = TOTAL_WIDTH - RANGE_WIDTH - 11
    desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH

    if desc.size > desc_width
      desc = desc[0..desc_width - 4] + "..."
    end

    codes.each_with_index do |r, idx|
      # Only the first line of a multi-line range carries the description.
      desc = "" unless idx.zero?
      code = "%-#{range_width}s" % r
      @output.puts " #{pipe} #{code} ##{desc}"
      # Every alternative after the very first is joined with "|".
      pipe = "|"
    end
  end
  @output.puts " ;"
  @output.puts ""
end

@output.puts <<EOF
# The following Ragel file was autogenerated with #{$0}
# from: #{@chart_url}
#
# It defines #{properties}.
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.

%%{
machine #{machine_name};

EOF

properties.each { |x| generate_machine( x, x ) }

@output.puts <<EOF
}%%
EOF

# Flush and release the file opened by --output; leave $stdout alone.
@output.close unless @output.equal?($stdout)