github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/internal/ucd/ucd.go (about)

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package ucd provides a parser for Unicode Character Database files, the
     6  // format of which is defined in http://www.unicode.org/reports/tr44/. See
     7  // http://www.unicode.org/Public/UCD/latest/ucd/ for example files.
     8  //
     9  // It currently does not support substitutions of missing fields.
    10  package ucd // import "golang.org/x/text/internal/ucd"
    11  
    12  import (
    13  	"bufio"
    14  	"bytes"
    15  	"errors"
    16  	"fmt"
    17  	"io"
    18  	"log"
    19  	"regexp"
    20  	"strconv"
    21  	"strings"
    22  )
    23  
// UnicodeData.txt fields.
//
// Each constant is the zero-based position of a field on a line, counting
// from CodePoint (0) up to SimpleTitlecaseMapping (14).
const (
	CodePoint = iota
	Name
	GeneralCategory
	CanonicalCombiningClass
	BidiClass
	DecompMapping
	DecimalValue
	DigitValue
	NumericValue
	BidiMirrored
	Unicode1Name
	ISOComment
	SimpleUppercaseMapping
	SimpleLowercaseMapping
	SimpleTitlecaseMapping
)
    42  
    43  // Parse calls f for each entry in the given reader of a UCD file. It will close
    44  // the reader upon return. It will call log.Fatal if any error occurred.
    45  //
    46  // This implements the most common usage pattern of using Parser.
    47  func Parse(r io.ReadCloser, f func(p *Parser)) {
    48  	defer r.Close()
    49  
    50  	p := New(r)
    51  	for p.Next() {
    52  		f(p)
    53  	}
    54  	if err := p.Err(); err != nil {
    55  		r.Close() // os.Exit will cause defers not to be called.
    56  		log.Fatal(err)
    57  	}
    58  }
    59  
// An Option is used to configure a Parser. Options are passed to New, which
// applies each one to the Parser it creates.
type Option func(p *Parser)
    62  
    63  func keepRanges(p *Parser) {
    64  	p.keepRanges = true
    65  }
    66  
    67  var (
    68  	// KeepRanges prevents the expansion of ranges. The raw ranges can be
    69  	// obtained by calling Range(0) on the parser.
    70  	KeepRanges Option = keepRanges
    71  )
    72  
    73  // The Part option register a handler for lines starting with a '@'. The text
    74  // after a '@' is available as the first field. Comments are handled as usual.
    75  func Part(f func(p *Parser)) Option {
    76  	return func(p *Parser) {
    77  		p.partHandler = f
    78  	}
    79  }
    80  
// A Parser parses Unicode Character Database (UCD) files.
//
// A Parser is created with New and advanced with Next; the accessor methods
// then interpret the fields of the current entry. Errors are collected rather
// than returned by each call and can be inspected with Err.
type Parser struct {
	scanner *bufio.Scanner // Line source; set up by New.

	keepRanges bool // Don't expand rune ranges in field 0.

	err     error    // First error recorded via setError, if any.
	comment []byte   // Trimmed text following '#' on the current line.
	field   [][]byte // Trimmed ';'-separated fields of the current line.
	// parsedRange is needed in case Range(0) is called more than once for one
	// field. In some cases this requires scanning ahead.
	parsedRange          bool
	rangeStart, rangeEnd rune // Current rune and inclusive end of the range from field 0.

	partHandler func(p *Parser) // Called for lines starting with '@'; may be nil.
}
    97  
    98  func (p *Parser) setError(err error) {
    99  	if p.err == nil {
   100  		p.err = err
   101  	}
   102  }
   103  
   104  func (p *Parser) getField(i int) []byte {
   105  	if i >= len(p.field) {
   106  		p.setError(fmt.Errorf("ucd: index of field %d out of bounds", i))
   107  		return nil
   108  	}
   109  	return p.field[i]
   110  }
   111  
// Err returns a non-nil error if any error occurred during parsing. Only the
// first error that was recorded is retained.
func (p *Parser) Err() error {
	return p.err
}
   116  
   117  // New returns a Parser for the given Reader.
   118  func New(r io.Reader, o ...Option) *Parser {
   119  	p := &Parser{
   120  		scanner: bufio.NewScanner(r),
   121  	}
   122  	for _, f := range o {
   123  		f(p)
   124  	}
   125  	return p
   126  }
   127  
   128  // Next parses the next line in the file. It returns true if a line was parsed
   129  // and false if it reached the end of the file.
   130  func (p *Parser) Next() bool {
   131  	if !p.keepRanges && p.rangeStart < p.rangeEnd {
   132  		p.rangeStart++
   133  		return true
   134  	}
   135  	p.comment = nil
   136  	p.field = p.field[:0]
   137  	p.parsedRange = false
   138  
   139  	for p.scanner.Scan() {
   140  		b := p.scanner.Bytes()
   141  		if len(b) == 0 || b[0] == '#' {
   142  			continue
   143  		}
   144  
   145  		// Parse line
   146  		if i := bytes.IndexByte(b, '#'); i != -1 {
   147  			p.comment = bytes.TrimSpace(b[i+1:])
   148  			b = b[:i]
   149  		}
   150  		if b[0] == '@' {
   151  			if p.partHandler != nil {
   152  				p.field = append(p.field, bytes.TrimSpace(b[1:]))
   153  				p.partHandler(p)
   154  				p.field = p.field[:0]
   155  			}
   156  			p.comment = nil
   157  			continue
   158  		}
   159  		for {
   160  			i := bytes.IndexByte(b, ';')
   161  			if i == -1 {
   162  				p.field = append(p.field, bytes.TrimSpace(b))
   163  				break
   164  			}
   165  			p.field = append(p.field, bytes.TrimSpace(b[:i]))
   166  			b = b[i+1:]
   167  		}
   168  		if !p.keepRanges {
   169  			p.rangeStart, p.rangeEnd = p.getRange(0)
   170  		}
   171  		return true
   172  	}
   173  	p.setError(p.scanner.Err())
   174  	return false
   175  }
   176  
   177  func parseRune(b []byte) (rune, error) {
   178  	if len(b) > 2 && b[0] == 'U' && b[1] == '+' {
   179  		b = b[2:]
   180  	}
   181  	x, err := strconv.ParseUint(string(b), 16, 32)
   182  	return rune(x), err
   183  }
   184  
   185  func (p *Parser) parseRune(b []byte) rune {
   186  	x, err := parseRune(b)
   187  	p.setError(err)
   188  	return x
   189  }
   190  
   191  // Rune parses and returns field i as a rune.
   192  func (p *Parser) Rune(i int) rune {
   193  	if i > 0 || p.keepRanges {
   194  		return p.parseRune(p.getField(i))
   195  	}
   196  	return p.rangeStart
   197  }
   198  
   199  // Runes interprets and returns field i as a sequence of runes.
   200  func (p *Parser) Runes(i int) (runes []rune) {
   201  	add := func(b []byte) {
   202  		if b = bytes.TrimSpace(b); len(b) > 0 {
   203  			runes = append(runes, p.parseRune(b))
   204  		}
   205  	}
   206  	for b := p.getField(i); ; {
   207  		i := bytes.IndexByte(b, ' ')
   208  		if i == -1 {
   209  			add(b)
   210  			break
   211  		}
   212  		add(b[:i])
   213  		b = b[i+1:]
   214  	}
   215  	return
   216  }
   217  
var (
	// errIncorrectLegacyRange is recorded when a legacy "First" line cannot be
	// paired with a matching "Last" line.
	errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>")

	// reRange matches one line of a legacy rune range. Its submatches are:
	// 1, the hexadecimal digits before the first ';'; 2, the text between '<'
	// and the comma; 3, the text between ", " and '>'; and 4, the remainder of
	// the line.
	reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$")
)
   224  
   225  // Range parses and returns field i as a rune range. A range is inclusive at
   226  // both ends. If the field only has one rune, first and last will be identical.
   227  // It supports the legacy format for ranges used in UnicodeData.txt.
   228  func (p *Parser) Range(i int) (first, last rune) {
   229  	if !p.keepRanges {
   230  		return p.rangeStart, p.rangeStart
   231  	}
   232  	return p.getRange(i)
   233  }
   234  
   235  func (p *Parser) getRange(i int) (first, last rune) {
   236  	b := p.getField(i)
   237  	if k := bytes.Index(b, []byte("..")); k != -1 {
   238  		return p.parseRune(b[:k]), p.parseRune(b[k+2:])
   239  	}
   240  	// The first field may not be a rune, in which case we may ignore any error
   241  	// and set the range as 0..0.
   242  	x, err := parseRune(b)
   243  	if err != nil {
   244  		// Disable range parsing henceforth. This ensures that an error will be
   245  		// returned if the user subsequently will try to parse this field as
   246  		// a Rune.
   247  		p.keepRanges = true
   248  	}
   249  	// Special case for UnicodeData that was retained for backwards compatibility.
   250  	if i == 0 && len(p.field) > 1 && bytes.HasSuffix(p.field[1], []byte("First>")) {
   251  		if p.parsedRange {
   252  			return p.rangeStart, p.rangeEnd
   253  		}
   254  		mf := reRange.FindStringSubmatch(p.scanner.Text())
   255  		if mf == nil || !p.scanner.Scan() {
   256  			p.setError(errIncorrectLegacyRange)
   257  			return x, x
   258  		}
   259  		// Using Bytes would be more efficient here, but Text is a lot easier
   260  		// and this is not a frequent case.
   261  		ml := reRange.FindStringSubmatch(p.scanner.Text())
   262  		if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] {
   263  			p.setError(errIncorrectLegacyRange)
   264  			return x, x
   265  		}
   266  		p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Bytes()[:len(ml[1])])
   267  		p.parsedRange = true
   268  		return p.rangeStart, p.rangeEnd
   269  	}
   270  	return x, x
   271  }
   272  
   273  // bools recognizes all valid UCD boolean values.
   274  var bools = map[string]bool{
   275  	"":      false,
   276  	"N":     false,
   277  	"No":    false,
   278  	"F":     false,
   279  	"False": false,
   280  	"Y":     true,
   281  	"Yes":   true,
   282  	"T":     true,
   283  	"True":  true,
   284  }
   285  
   286  // Bool parses and returns field i as a boolean value.
   287  func (p *Parser) Bool(i int) bool {
   288  	b := p.getField(i)
   289  	for s, v := range bools {
   290  		if bstrEq(b, s) {
   291  			return v
   292  		}
   293  	}
   294  	p.setError(strconv.ErrSyntax)
   295  	return false
   296  }
   297  
   298  // Int parses and returns field i as an integer value.
   299  func (p *Parser) Int(i int) int {
   300  	x, err := strconv.ParseInt(string(p.getField(i)), 10, 64)
   301  	p.setError(err)
   302  	return int(x)
   303  }
   304  
   305  // Uint parses and returns field i as an unsigned integer value.
   306  func (p *Parser) Uint(i int) uint {
   307  	x, err := strconv.ParseUint(string(p.getField(i)), 10, 64)
   308  	p.setError(err)
   309  	return uint(x)
   310  }
   311  
   312  // Float parses and returns field i as a decimal value.
   313  func (p *Parser) Float(i int) float64 {
   314  	x, err := strconv.ParseFloat(string(p.getField(i)), 64)
   315  	p.setError(err)
   316  	return x
   317  }
   318  
   319  // String parses and returns field i as a string value.
   320  func (p *Parser) String(i int) string {
   321  	return string(p.getField(i))
   322  }
   323  
   324  // Strings parses and returns field i as a space-separated list of strings.
   325  func (p *Parser) Strings(i int) []string {
   326  	ss := strings.Split(string(p.getField(i)), " ")
   327  	for i, s := range ss {
   328  		ss[i] = strings.TrimSpace(s)
   329  	}
   330  	return ss
   331  }
   332  
   333  // Comment returns the comments for the current line.
   334  func (p *Parser) Comment() string {
   335  	return string(p.comment)
   336  }
   337  
   338  var errUndefinedEnum = errors.New("ucd: undefined enum value")
   339  
   340  // Enum interprets and returns field i as a value that must be one of the values
   341  // in enum.
   342  func (p *Parser) Enum(i int, enum ...string) string {
   343  	b := p.getField(i)
   344  	for _, s := range enum {
   345  		if bstrEq(b, s) {
   346  			return s
   347  		}
   348  	}
   349  	p.setError(errUndefinedEnum)
   350  	return ""
   351  }
   352  
   353  func bstrEq(b []byte, s string) bool {
   354  	if len(b) != len(s) {
   355  		return false
   356  	}
   357  	for i, c := range b {
   358  		if c != s[i] {
   359  			return false
   360  		}
   361  	}
   362  	return true
   363  }