github.com/hashicorp/hcl/v2@v2.20.0/hclsyntax/unicode2ragel.rb (about)

     1  #!/usr/bin/env ruby
     2  # Copyright (c) HashiCorp, Inc.
     3  # SPDX-License-Identifier: MPL-2.0
     4  
     5  #
     6  # This script has been updated to accept more command-line arguments:
     7  #
     8  #    -u, --url                        URL to process
     9  #    -m, --machine                    Machine name
    10  #    -p, --properties                 Properties to add to the machine
    11  #    -o, --output                     Write output to file
    12  #
    13  # Updated by: Marty Schoch <marty.schoch@gmail.com>
    14  # 
    15  # This script uses the unicode spec to generate a Ragel state machine
    16  # that recognizes unicode alphanumeric characters.  It generates 5
    17  # character classes: uupper, ulower, ualpha, udigit, and ualnum.
    18  # Currently supported encodings are UTF-8 [default] and UCS-4.
    19  #
    20  # Usage: unicode2ragel.rb [options]
    21  #    -e, --encoding [ucs4 | utf8]     Data encoding
    22  #    -h, --help                       Show this message
    23  #
    24  # This script was originally written as part of the Ferret search
    25  # engine library.
    26  #
    27  # Author: Rakan El-Khalil <rakan@well.com>
    28  
    29  require 'optparse'
    30  require 'open-uri'
    31  
    32  ENCODINGS = [ :utf8, :ucs4 ]
    33  ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }
    34  DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt"
    35  DEFAULT_MACHINE_NAME= "WChar"
    36  
    37  ###
    38  # Display vars & default option
    39  
    40  TOTAL_WIDTH = 80
    41  RANGE_WIDTH = 23
    42  @encoding = :utf8
    43  @chart_url = DEFAULT_CHART_URL
    44  machine_name = DEFAULT_MACHINE_NAME
    45  properties = []
    46  @output = $stdout
    47  
    48  ###
    49  # Option parsing
    50  
    51  cli_opts = OptionParser.new do |opts|
    52    opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
    53      @encoding = o.downcase.to_sym
    54    end
    55    opts.on("-h", "--help", "Show this message") do
    56      puts opts
    57      exit
    58    end
    59    opts.on("-u", "--url URL", "URL to process") do |o|
    60      @chart_url = o 
    61    end
    62    opts.on("-m", "--machine MACHINE_NAME", "Machine name") do |o|
    63      machine_name = o
    64    end
    65    opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do |o|
    66      properties = o
    67    end
    68    opts.on("-o", "--output FILE", "output file") do |o|
    69      @output = File.new(o, "w+")
    70    end
    71  end
    72  
    73  cli_opts.parse(ARGV)
    74  unless ENCODINGS.member? @encoding
    75    puts "Invalid encoding: #{@encoding}"
    76    puts cli_opts
    77    exit
    78  end
    79  
    80  ##
    81  # Downloads the document at url and yields every alpha line's hex
    82  # range and description.
    83  
    84  def each_alpha( url, property ) 
    85    open( url ) do |file|
    86      file.each_line do |line|
    87        next if line =~ /^#/;
    88        next if line !~ /; #{property} #/;
    89  
    90        range, description = line.split(/;/)
    91        range.strip!
    92        description.gsub!(/.*#/, '').strip!
    93  
    94        if range =~ /\.\./
    95             start, stop = range.split '..'
    96        else start = stop = range
    97        end
    98  
    99        yield start.hex .. stop.hex, description
   100      end
   101    end
   102  end
   103  
   104  ###
   105  # Formats to hex at minimum width
   106  
   107  def to_hex( n )
   108    r = "%0X" % n
   109    r = "0#{r}" unless (r.length % 2).zero?
   110    r
   111  end
   112  
   113  ###
   114  # UCS4 is just a straight hex conversion of the unicode codepoint.
   115  
   116  def to_ucs4( range )
   117    rangestr  =   "0x" + to_hex(range.begin)
   118    rangestr << "..0x" + to_hex(range.end) if range.begin != range.end
   119    [ rangestr ]
   120  end
   121  
   122  ##
   123  # 0x00     - 0x7f     -> 0zzzzzzz[7]
   124  # 0x80     - 0x7ff    -> 110yyyyy[5] 10zzzzzz[6]
   125  # 0x800    - 0xffff   -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
   126  # 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6] 
   127  
   128  UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]
   129  
   130  def to_utf8_enc( n )
   131    r = 0
   132    if n <= 0x7f
   133      r = n
   134    elsif n <= 0x7ff
   135      y = 0xc0 | (n >> 6)
   136      z = 0x80 | (n & 0x3f)
   137      r = y << 8 | z
   138    elsif n <= 0xffff
   139      x = 0xe0 | (n >> 12)
   140      y = 0x80 | (n >>  6) & 0x3f
   141      z = 0x80 |  n        & 0x3f
   142      r = x << 16 | y << 8 | z
   143    elsif n <= 0x10ffff
   144      w = 0xf0 | (n >> 18)
   145      x = 0x80 | (n >> 12) & 0x3f
   146      y = 0x80 | (n >>  6) & 0x3f
   147      z = 0x80 |  n        & 0x3f
   148      r = w << 24 | x << 16 | y << 8 | z
   149    end
   150  
   151    to_hex(r)
   152  end
   153  
   154  def from_utf8_enc( n )
   155    n = n.hex
   156    r = 0
   157    if n <= 0x7f
   158      r = n
   159    elsif n <= 0xdfff
   160      y = (n >> 8) & 0x1f
   161      z =  n       & 0x3f
   162      r = y << 6 | z
   163    elsif n <= 0xefffff
   164      x = (n >> 16) & 0x0f
   165      y = (n >>  8) & 0x3f
   166      z =  n        & 0x3f
   167      r = x << 10 | y << 6 | z
   168    elsif n <= 0xf7ffffff
   169      w = (n >> 24) & 0x07
   170      x = (n >> 16) & 0x3f
   171      y = (n >>  8) & 0x3f
   172      z =  n        & 0x3f
   173      r = w << 18 | x << 12 | y << 6 | z
   174    end
   175    r
   176  end
   177  
   178  ###
   179  # Given a range, splits it up into ranges that can be continuously
   180  # encoded into utf8.  Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
   181  # This is not strictly needed since the current [5.1] unicode standard
   182  # doesn't have ranges that straddle utf8 boundaries.  This is included
   183  # for completeness as there is no telling if that will ever change.
   184  
   185  def utf8_ranges( range )
   186    ranges = []
   187    UTF8_BOUNDARIES.each do |max|
   188      if range.begin <= max
   189        if range.end <= max
   190          ranges << range
   191          return ranges
   192        end
   193  
   194        ranges << (range.begin .. max)
   195        range = (max + 1) .. range.end
   196      end
   197    end
   198    ranges
   199  end
   200  
   201  def build_range( start, stop )
   202    size = start.size/2
   203    left = size - 1
   204    return [""] if size < 1
   205  
   206    a = start[0..1]
   207    b = stop[0..1]
   208  
   209    ###
   210    # Shared prefix
   211  
   212    if a == b
   213      return build_range(start[2..-1], stop[2..-1]).map do |elt|
   214        "0x#{a} " + elt
   215      end
   216    end
   217  
   218    ###
   219    # Unshared prefix, end of run
   220  
   221    return ["0x#{a}..0x#{b} "] if left.zero?
   222    
   223    ###
   224    # Unshared prefix, not end of run
   225    # Range can be 0x123456..0x56789A
   226    # Which is equivalent to:
   227    #     0x123456 .. 0x12FFFF
   228    #     0x130000 .. 0x55FFFF
   229    #     0x560000 .. 0x56789A
   230  
   231    ret = []
   232    ret << build_range(start, a + "FF" * left)
   233  
   234    ###
   235    # Only generate middle range if need be.
   236  
   237    if a.hex+1 != b.hex
   238      max = to_hex(b.hex - 1)
   239      max = "FF" if b == "FF"
   240      ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left
   241    end
   242  
   243    ###
   244    # Don't generate last range if it is covered by first range
   245    
   246    ret << build_range(b + "00" * left, stop) unless b == "FF"
   247    ret.flatten!
   248  end
   249  
   250  def to_utf8( range )
   251    utf8_ranges( range ).map do |r|   
   252      begin_enc = to_utf8_enc(r.begin)
   253      end_enc = to_utf8_enc(r.end)
   254      build_range begin_enc, end_enc
   255    end.flatten!
   256  end
   257  
   258  ##
   259  # Perform a 3-way comparison of the number of codepoints advertised by
   260  # the unicode spec for the given range, the originally parsed range,
   261  # and the resulting utf8 encoded range.
   262  
   263  def count_codepoints( code )
   264    code.split(' ').inject(1) do |acc, elt|
   265      if elt =~ /0x(.+)\.\.0x(.+)/
   266        if @encoding == :utf8
   267          acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1)
   268        else
   269          acc * ($2.hex - $1.hex + 1)
   270        end
   271      else
   272        acc
   273      end
   274    end
   275  end
   276  
   277  def is_valid?( range, desc, codes )
   278    spec_count  = 1
   279    spec_count  = $1.to_i if desc =~ /\[(\d+)\]/
   280    range_count = range.end - range.begin + 1
   281  
   282    sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) }
   283    sum == spec_count and sum == range_count
   284  end
   285  
   286  ##
   287  # Generate the state machine and write it to the configured output
   288  
   289  def generate_machine( name, property )
   290    pipe = " "
   291    @output.puts "    #{name} = "
   292    each_alpha( @chart_url, property ) do |range, desc|
   293  
   294      codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range)
   295  
   296      #raise "Invalid encoding of range #{range}: #{codes.inspect}" unless 
   297      #  is_valid? range, desc, codes
   298  
   299      range_width = codes.map { |a| a.size }.max
   300      range_width = RANGE_WIDTH if range_width < RANGE_WIDTH
   301  
   302      desc_width  = TOTAL_WIDTH - RANGE_WIDTH - 11
   303      desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH
   304  
   305      if desc.size > desc_width
   306        desc = desc[0..desc_width - 4] + "..."
   307      end
   308  
   309      codes.each_with_index do |r, idx|
   310        desc = "" unless idx.zero?
   311        code = "%-#{range_width}s" % r
   312        @output.puts "      #{pipe} #{code} ##{desc}"
   313        pipe = "|"
   314      end
   315    end
   316    @output.puts "      ;"
   317    @output.puts ""
   318  end
   319  
   320  @output.puts <<EOF
   321  # The following Ragel file was autogenerated with #{$0} 
   322  # from: #{@chart_url}
   323  #
   324  # It defines #{properties}.
   325  #
   326  # To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
   327  # and that your input is in #{@encoding}.
   328  
   329  %%{
   330      machine #{machine_name};
   331      
   332  EOF
   333  
   334  properties.each { |x| generate_machine( x, x ) }
   335  
   336  @output.puts <<EOF
   337  }%%
   338  EOF