github.com/liquid-dev/text@v0.3.3-liquid/internal/ucd/ucd.go (about)

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package ucd provides a parser for Unicode Character Database files, the
     6  // format of which is defined in https://www.unicode.org/reports/tr44/. See
     7  // https://www.unicode.org/Public/UCD/latest/ucd/ for example files.
     8  //
     9  // It currently does not support substitutions of missing fields.
    10  package ucd // import "github.com/liquid-dev/text/internal/ucd"
    11  
    12  import (
    13  	"bufio"
    14  	"errors"
    15  	"fmt"
    16  	"io"
    17  	"log"
    18  	"regexp"
    19  	"strconv"
    20  	"strings"
    21  )
    22  
// UnicodeData.txt fields.
//
// Each constant is the zero-based index of a semicolon-separated field on a
// line of UnicodeData.txt. They are intended to be used as the field index
// argument of the Parser accessor methods (Rune, String, Int, ...).
const (
	CodePoint = iota
	Name
	GeneralCategory
	CanonicalCombiningClass
	BidiClass
	DecompMapping
	DecimalValue
	DigitValue
	NumericValue
	BidiMirrored
	Unicode1Name
	ISOComment
	SimpleUppercaseMapping
	SimpleLowercaseMapping
	SimpleTitlecaseMapping
)
    41  
    42  // Parse calls f for each entry in the given reader of a UCD file. It will close
    43  // the reader upon return. It will call log.Fatal if any error occurred.
    44  //
    45  // This implements the most common usage pattern of using Parser.
    46  func Parse(r io.ReadCloser, f func(p *Parser)) {
    47  	defer r.Close()
    48  
    49  	p := New(r)
    50  	for p.Next() {
    51  		f(p)
    52  	}
    53  	if err := p.Err(); err != nil {
    54  		r.Close() // os.Exit will cause defers not to be called.
    55  		log.Fatal(err)
    56  	}
    57  }
    58  
// An Option is used to configure a Parser. Options are passed to New, which
// applies each of them to the Parser it creates.
type Option func(p *Parser)
    61  
    62  func keepRanges(p *Parser) {
    63  	p.keepRanges = true
    64  }
    65  
    66  var (
    67  	// KeepRanges prevents the expansion of ranges. The raw ranges can be
    68  	// obtained by calling Range(0) on the parser.
    69  	KeepRanges Option = keepRanges
    70  )
    71  
    72  // The Part option register a handler for lines starting with a '@'. The text
    73  // after a '@' is available as the first field. Comments are handled as usual.
    74  func Part(f func(p *Parser)) Option {
    75  	return func(p *Parser) {
    76  		p.partHandler = f
    77  	}
    78  }
    79  
    80  // The CommentHandler option passes comments that are on a line by itself to
    81  // a given handler.
    82  func CommentHandler(f func(s string)) Option {
    83  	return func(p *Parser) {
    84  		p.commentHandler = f
    85  	}
    86  }
    87  
// A Parser parses Unicode Character Database (UCD) files.
//
// A Parser is created with New and advanced with Next; the fields of the
// current entry are then read through the accessor methods (Rune, String,
// Int, ...). The first error encountered is retained and reported by Err.
type Parser struct {
	// scanner reads the input line by line.
	scanner *bufio.Scanner

	keepRanges bool // Don't expand rune ranges in field 0.

	// err is the first error recorded by setError. comment is the trailing
	// comment of the current line and field holds the current line's
	// semicolon-separated fields, each with surrounding whitespace trimmed.
	err     error
	comment string
	field   []string
	// line counts the input lines consumed so far and is used to prefix
	// error messages.
	//
	// parsedRange is needed in case Range(0) is called more than once for one
	// field. In some cases this requires scanning ahead.
	//
	// rangeStart and rangeEnd hold the rune range of field 0 for the current
	// line; while ranges are expanded, rangeStart is the current rune.
	line                 int
	parsedRange          bool
	rangeStart, rangeEnd rune

	// Optional callbacks installed by the Part and CommentHandler options.
	partHandler    func(p *Parser)
	commentHandler func(s string)
}
   106  
   107  func (p *Parser) setError(err error, msg string) {
   108  	if p.err == nil && err != nil {
   109  		if msg == "" {
   110  			p.err = fmt.Errorf("ucd:line:%d: %v", p.line, err)
   111  		} else {
   112  			p.err = fmt.Errorf("ucd:line:%d:%s: %v", p.line, msg, err)
   113  		}
   114  	}
   115  }
   116  
   117  func (p *Parser) getField(i int) string {
   118  	if i >= len(p.field) {
   119  		return ""
   120  	}
   121  	return p.field[i]
   122  }
   123  
// Err returns a non-nil error if any error occurred during parsing.
//
// Only the first error is retained (see setError), so the returned value
// describes the earliest failure, including the line number it was recorded
// at.
func (p *Parser) Err() error {
	return p.err
}
   128  
   129  // New returns a Parser for the given Reader.
   130  func New(r io.Reader, o ...Option) *Parser {
   131  	p := &Parser{
   132  		scanner: bufio.NewScanner(r),
   133  	}
   134  	for _, f := range o {
   135  		f(p)
   136  	}
   137  	return p
   138  }
   139  
// Next parses the next line in the file. It returns true if a line was parsed
// and false if it reached the end of the file.
//
// Unless ranges are kept (see KeepRanges), a line whose first field is a rune
// range is reported once per rune: successive calls to Next advance through
// the range before the next line is read. Next also returns false once an
// error has been recorded; use Err to distinguish this from end of input.
func (p *Parser) Next() bool {
	// Still inside an expanded range: step to its next rune and keep the
	// fields and comment of the current line.
	if !p.keepRanges && p.rangeStart < p.rangeEnd {
		p.rangeStart++
		return true
	}
	// Reset the per-line state; the field slice's backing array is reused.
	p.comment = ""
	p.field = p.field[:0]
	p.parsedRange = false

	for p.scanner.Scan() && p.err == nil {
		p.line++
		s := p.scanner.Text()
		if s == "" {
			continue
		}
		// A line that starts with '#' is a comment on a line by itself.
		if s[0] == '#' {
			if p.commentHandler != nil {
				p.commentHandler(strings.TrimSpace(s[1:]))
			}
			continue
		}

		// Parse line
		// Split off a trailing comment. s stays non-empty here because its
		// first byte is known not to be '#'.
		if i := strings.IndexByte(s, '#'); i != -1 {
			p.comment = strings.TrimSpace(s[i+1:])
			s = s[:i]
		}
		// Part lines are passed to the part handler, if any, with the text
		// after the '@' as the only field; they are never returned as entries.
		if s[0] == '@' {
			if p.partHandler != nil {
				p.field = append(p.field, strings.TrimSpace(s[1:]))
				p.partHandler(p)
				p.field = p.field[:0]
			}
			p.comment = ""
			continue
		}
		// Split the line into semicolon-separated fields, trimming whitespace
		// around each of them.
		for {
			i := strings.IndexByte(s, ';')
			if i == -1 {
				p.field = append(p.field, strings.TrimSpace(s))
				break
			}
			p.field = append(p.field, strings.TrimSpace(s[:i]))
			s = s[i+1:]
		}
		// Determine the range covered by field 0 so that subsequent calls can
		// expand it one rune at a time.
		if !p.keepRanges {
			p.rangeStart, p.rangeEnd = p.getRange(0)
		}
		return true
	}
	// setError ignores a nil error, so this is a no-op at a clean end of
	// input.
	p.setError(p.scanner.Err(), "scanner failed")
	return false
}
   195  
   196  func parseRune(b string) (rune, error) {
   197  	if len(b) > 2 && b[0] == 'U' && b[1] == '+' {
   198  		b = b[2:]
   199  	}
   200  	x, err := strconv.ParseUint(b, 16, 32)
   201  	return rune(x), err
   202  }
   203  
   204  func (p *Parser) parseRune(s string) rune {
   205  	x, err := parseRune(s)
   206  	p.setError(err, "failed to parse rune")
   207  	return x
   208  }
   209  
   210  // Rune parses and returns field i as a rune.
   211  func (p *Parser) Rune(i int) rune {
   212  	if i > 0 || p.keepRanges {
   213  		return p.parseRune(p.getField(i))
   214  	}
   215  	return p.rangeStart
   216  }
   217  
   218  // Runes interprets and returns field i as a sequence of runes.
   219  func (p *Parser) Runes(i int) (runes []rune) {
   220  	add := func(s string) {
   221  		if s = strings.TrimSpace(s); len(s) > 0 {
   222  			runes = append(runes, p.parseRune(s))
   223  		}
   224  	}
   225  	for b := p.getField(i); ; {
   226  		i := strings.IndexByte(b, ' ')
   227  		if i == -1 {
   228  			add(b)
   229  			break
   230  		}
   231  		add(b[:i])
   232  		b = b[i+1:]
   233  	}
   234  	return
   235  }
   236  
   237  var (
   238  	errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>")
   239  
   240  	// reRange matches one line of a legacy rune range.
   241  	reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$")
   242  )
   243  
   244  // Range parses and returns field i as a rune range. A range is inclusive at
   245  // both ends. If the field only has one rune, first and last will be identical.
   246  // It supports the legacy format for ranges used in UnicodeData.txt.
   247  func (p *Parser) Range(i int) (first, last rune) {
   248  	if !p.keepRanges {
   249  		return p.rangeStart, p.rangeStart
   250  	}
   251  	return p.getRange(i)
   252  }
   253  
// getRange parses field i as an inclusive rune range.
//
// Three forms are handled: "A..B" ranges, single code points (returned as a
// one-rune range) and, for field 0 only, the legacy two-line
// "<name, First>" / "<name, Last>" format of UnicodeData.txt. Handling the
// legacy format consumes the following line from the scanner.
func (p *Parser) getRange(i int) (first, last rune) {
	b := p.getField(i)
	// Explicit range: parse both ends, recording any parse error.
	if k := strings.Index(b, ".."); k != -1 {
		return p.parseRune(b[:k]), p.parseRune(b[k+2:])
	}
	// The first field may not be a rune, in which case we may ignore any error
	// and set the range as 0..0.
	x, err := parseRune(b)
	if err != nil {
		// Disable range parsing henceforth. This ensures that an error will be
		// returned if the user subsequently will try to parse this field as
		// a Rune.
		p.keepRanges = true
	}
	// Special case for UnicodeData that was retained for backwards compatibility.
	if i == 0 && len(p.field) > 1 && strings.HasSuffix(p.field[1], "First>") {
		// The Last line has already been consumed for this entry; reuse the
		// result instead of scanning ahead a second time.
		if p.parsedRange {
			return p.rangeStart, p.rangeEnd
		}
		mf := reRange.FindStringSubmatch(p.scanner.Text())
		// Count the line that the Scan below is about to consume. Note that
		// the counter is incremented even when mf is nil and Scan is skipped.
		p.line++
		if mf == nil || !p.scanner.Scan() {
			p.setError(errIncorrectLegacyRange, "")
			return x, x
		}
		// Using Bytes would be more efficient here, but Text is a lot easier
		// and this is not a frequent case.
		ml := reRange.FindStringSubmatch(p.scanner.Text())
		// The second line must carry the same name and the same remaining
		// fields as the first one, and must be marked "Last".
		if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] {
			p.setError(errIncorrectLegacyRange, "")
			return x, x
		}
		// The end of the range is the code point at the start of the Last
		// line, i.e. the text matched by the first submatch.
		p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Text()[:len(ml[1])])
		p.parsedRange = true
		return p.rangeStart, p.rangeEnd
	}
	return x, x
}
   292  
   293  // bools recognizes all valid UCD boolean values.
   294  var bools = map[string]bool{
   295  	"":      false,
   296  	"N":     false,
   297  	"No":    false,
   298  	"F":     false,
   299  	"False": false,
   300  	"Y":     true,
   301  	"Yes":   true,
   302  	"T":     true,
   303  	"True":  true,
   304  }
   305  
   306  // Bool parses and returns field i as a boolean value.
   307  func (p *Parser) Bool(i int) bool {
   308  	f := p.getField(i)
   309  	for s, v := range bools {
   310  		if f == s {
   311  			return v
   312  		}
   313  	}
   314  	p.setError(strconv.ErrSyntax, "error parsing bool")
   315  	return false
   316  }
   317  
   318  // Int parses and returns field i as an integer value.
   319  func (p *Parser) Int(i int) int {
   320  	x, err := strconv.ParseInt(string(p.getField(i)), 10, 64)
   321  	p.setError(err, "error parsing int")
   322  	return int(x)
   323  }
   324  
   325  // Uint parses and returns field i as an unsigned integer value.
   326  func (p *Parser) Uint(i int) uint {
   327  	x, err := strconv.ParseUint(string(p.getField(i)), 10, 64)
   328  	p.setError(err, "error parsing uint")
   329  	return uint(x)
   330  }
   331  
   332  // Float parses and returns field i as a decimal value.
   333  func (p *Parser) Float(i int) float64 {
   334  	x, err := strconv.ParseFloat(string(p.getField(i)), 64)
   335  	p.setError(err, "error parsing float")
   336  	return x
   337  }
   338  
   339  // String parses and returns field i as a string value.
   340  func (p *Parser) String(i int) string {
   341  	return string(p.getField(i))
   342  }
   343  
   344  // Strings parses and returns field i as a space-separated list of strings.
   345  func (p *Parser) Strings(i int) []string {
   346  	ss := strings.Split(string(p.getField(i)), " ")
   347  	for i, s := range ss {
   348  		ss[i] = strings.TrimSpace(s)
   349  	}
   350  	return ss
   351  }
   352  
   353  // Comment returns the comments for the current line.
   354  func (p *Parser) Comment() string {
   355  	return string(p.comment)
   356  }
   357  
// errUndefinedEnum is recorded by Enum when a field holds a value that is not
// among the allowed values.
var errUndefinedEnum = errors.New("ucd: undefined enum value")
   359  
   360  // Enum interprets and returns field i as a value that must be one of the values
   361  // in enum.
   362  func (p *Parser) Enum(i int, enum ...string) string {
   363  	f := p.getField(i)
   364  	for _, s := range enum {
   365  		if f == s {
   366  			return s
   367  		}
   368  	}
   369  	p.setError(errUndefinedEnum, "error parsing enum")
   370  	return ""
   371  }