gopkg.in/alecthomas/gometalinter.v3@v3.0.0/_linters/src/golang.org/x/text/internal/ucd/ucd.go (about)

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package ucd provides a parser for Unicode Character Database files, the
     6  // format of which is defined in http://www.unicode.org/reports/tr44/. See
     7  // http://www.unicode.org/Public/UCD/latest/ucd/ for example files.
     8  //
     9  // It currently does not support substitutions of missing fields.
    10  package ucd // import "golang.org/x/text/internal/ucd"
    11  
    12  import (
    13  	"bufio"
    14  	"bytes"
    15  	"errors"
    16  	"io"
    17  	"log"
    18  	"regexp"
    19  	"strconv"
    20  	"strings"
    21  )
    22  
// UnicodeData.txt fields.
//
// Each constant is the zero-based index of a field on a line of
// UnicodeData.txt and is meant to be passed as the field index to the Parser
// accessor methods (Rune, String, Int, ...).
const (
	CodePoint               = iota // field 0
	Name                           // field 1
	GeneralCategory                // field 2
	CanonicalCombiningClass        // field 3
	BidiClass                      // field 4
	DecompMapping                  // field 5
	DecimalValue                   // field 6
	DigitValue                     // field 7
	NumericValue                   // field 8
	BidiMirrored                   // field 9
	Unicode1Name                   // field 10
	ISOComment                     // field 11
	SimpleUppercaseMapping         // field 12
	SimpleLowercaseMapping         // field 13
	SimpleTitlecaseMapping         // field 14
)
    41  
    42  // Parse calls f for each entry in the given reader of a UCD file. It will close
    43  // the reader upon return. It will call log.Fatal if any error occurred.
    44  //
    45  // This implements the most common usage pattern of using Parser.
    46  func Parse(r io.ReadCloser, f func(p *Parser)) {
    47  	defer r.Close()
    48  
    49  	p := New(r)
    50  	for p.Next() {
    51  		f(p)
    52  	}
    53  	if err := p.Err(); err != nil {
    54  		r.Close() // os.Exit will cause defers not to be called.
    55  		log.Fatal(err)
    56  	}
    57  }
    58  
// An Option is used to configure a Parser. Options are passed to New, which
// applies each of them to the freshly created Parser in argument order.
type Option func(p *Parser)
    61  
// keepRanges is the implementation of the KeepRanges option: it switches off
// the expansion of rune ranges in field 0.
func keepRanges(p *Parser) {
	p.keepRanges = true
}
    65  
var (
	// KeepRanges prevents the expansion of ranges. The raw ranges can be
	// obtained by calling Range(0) on the parser. Without this option, Next
	// yields one entry per rune of a range found in field 0.
	KeepRanges Option = keepRanges
)
    71  
    72  // The Part option register a handler for lines starting with a '@'. The text
    73  // after a '@' is available as the first field. Comments are handled as usual.
    74  func Part(f func(p *Parser)) Option {
    75  	return func(p *Parser) {
    76  		p.partHandler = f
    77  	}
    78  }
    79  
    80  // The CommentHandler option passes comments that are on a line by itself to
    81  // a given handler.
    82  func CommentHandler(f func(s string)) Option {
    83  	return func(p *Parser) {
    84  		p.commentHandler = f
    85  	}
    86  }
    87  
// A Parser parses Unicode Character Database (UCD) files.
//
// A Parser is created with New and advanced with Next; the typed accessors
// (Rune, Range, String, Int, ...) then read fields of the current entry.
// Accessors do not return errors; the first error encountered is remembered
// and reported by Err.
type Parser struct {
	scanner *bufio.Scanner // reads the input; created by New

	keepRanges bool // Don't expand rune ranges in field 0.

	err     error    // first error recorded via setError
	comment []byte   // trimmed text after '#' on the current line, if any
	field   [][]byte // trimmed ';'-separated fields of the current line
	// parsedRange is needed in case Range(0) is called more than once for one
	// field. In some cases this requires scanning ahead.
	parsedRange          bool
	rangeStart, rangeEnd rune // range of field 0; while expanding, rangeStart is the current rune

	partHandler    func(p *Parser) // called for lines starting with '@'; may be nil
	commentHandler func(s string)  // called for lines starting with '#'; may be nil
}
   105  
   106  func (p *Parser) setError(err error) {
   107  	if p.err == nil {
   108  		p.err = err
   109  	}
   110  }
   111  
   112  func (p *Parser) getField(i int) []byte {
   113  	if i >= len(p.field) {
   114  		return nil
   115  	}
   116  	return p.field[i]
   117  }
   118  
// Err returns a non-nil error if any error occurred during parsing. Only the
// first error recorded by the parser is kept; it is nil if none occurred.
func (p *Parser) Err() error {
	return p.err
}
   123  
   124  // New returns a Parser for the given Reader.
   125  func New(r io.Reader, o ...Option) *Parser {
   126  	p := &Parser{
   127  		scanner: bufio.NewScanner(r),
   128  	}
   129  	for _, f := range o {
   130  		f(p)
   131  	}
   132  	return p
   133  }
   134  
   135  // Next parses the next line in the file. It returns true if a line was parsed
   136  // and false if it reached the end of the file.
   137  func (p *Parser) Next() bool {
   138  	if !p.keepRanges && p.rangeStart < p.rangeEnd {
   139  		p.rangeStart++
   140  		return true
   141  	}
   142  	p.comment = nil
   143  	p.field = p.field[:0]
   144  	p.parsedRange = false
   145  
   146  	for p.scanner.Scan() {
   147  		b := p.scanner.Bytes()
   148  		if len(b) == 0 {
   149  			continue
   150  		}
   151  		if b[0] == '#' {
   152  			if p.commentHandler != nil {
   153  				p.commentHandler(strings.TrimSpace(string(b[1:])))
   154  			}
   155  			continue
   156  		}
   157  
   158  		// Parse line
   159  		if i := bytes.IndexByte(b, '#'); i != -1 {
   160  			p.comment = bytes.TrimSpace(b[i+1:])
   161  			b = b[:i]
   162  		}
   163  		if b[0] == '@' {
   164  			if p.partHandler != nil {
   165  				p.field = append(p.field, bytes.TrimSpace(b[1:]))
   166  				p.partHandler(p)
   167  				p.field = p.field[:0]
   168  			}
   169  			p.comment = nil
   170  			continue
   171  		}
   172  		for {
   173  			i := bytes.IndexByte(b, ';')
   174  			if i == -1 {
   175  				p.field = append(p.field, bytes.TrimSpace(b))
   176  				break
   177  			}
   178  			p.field = append(p.field, bytes.TrimSpace(b[:i]))
   179  			b = b[i+1:]
   180  		}
   181  		if !p.keepRanges {
   182  			p.rangeStart, p.rangeEnd = p.getRange(0)
   183  		}
   184  		return true
   185  	}
   186  	p.setError(p.scanner.Err())
   187  	return false
   188  }
   189  
   190  func parseRune(b []byte) (rune, error) {
   191  	if len(b) > 2 && b[0] == 'U' && b[1] == '+' {
   192  		b = b[2:]
   193  	}
   194  	x, err := strconv.ParseUint(string(b), 16, 32)
   195  	return rune(x), err
   196  }
   197  
   198  func (p *Parser) parseRune(b []byte) rune {
   199  	x, err := parseRune(b)
   200  	p.setError(err)
   201  	return x
   202  }
   203  
   204  // Rune parses and returns field i as a rune.
   205  func (p *Parser) Rune(i int) rune {
   206  	if i > 0 || p.keepRanges {
   207  		return p.parseRune(p.getField(i))
   208  	}
   209  	return p.rangeStart
   210  }
   211  
   212  // Runes interprets and returns field i as a sequence of runes.
   213  func (p *Parser) Runes(i int) (runes []rune) {
   214  	add := func(b []byte) {
   215  		if b = bytes.TrimSpace(b); len(b) > 0 {
   216  			runes = append(runes, p.parseRune(b))
   217  		}
   218  	}
   219  	for b := p.getField(i); ; {
   220  		i := bytes.IndexByte(b, ' ')
   221  		if i == -1 {
   222  			add(b)
   223  			break
   224  		}
   225  		add(b[:i])
   226  		b = b[i+1:]
   227  	}
   228  	return
   229  }
   230  
var (
	// errIncorrectLegacyRange is recorded when a "<..., First>" line is not
	// followed by a matching "<..., Last>" line.
	errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>")

	// reRange matches one line of a legacy rune range. Its submatches are:
	// 1 the code point, 2 the name before the comma, 3 the text after the
	// comma (e.g. "First" or "Last"), and 4 the remainder of the line.
	reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$")
)
   237  
   238  // Range parses and returns field i as a rune range. A range is inclusive at
   239  // both ends. If the field only has one rune, first and last will be identical.
   240  // It supports the legacy format for ranges used in UnicodeData.txt.
   241  func (p *Parser) Range(i int) (first, last rune) {
   242  	if !p.keepRanges {
   243  		return p.rangeStart, p.rangeStart
   244  	}
   245  	return p.getRange(i)
   246  }
   247  
   248  func (p *Parser) getRange(i int) (first, last rune) {
   249  	b := p.getField(i)
   250  	if k := bytes.Index(b, []byte("..")); k != -1 {
   251  		return p.parseRune(b[:k]), p.parseRune(b[k+2:])
   252  	}
   253  	// The first field may not be a rune, in which case we may ignore any error
   254  	// and set the range as 0..0.
   255  	x, err := parseRune(b)
   256  	if err != nil {
   257  		// Disable range parsing henceforth. This ensures that an error will be
   258  		// returned if the user subsequently will try to parse this field as
   259  		// a Rune.
   260  		p.keepRanges = true
   261  	}
   262  	// Special case for UnicodeData that was retained for backwards compatibility.
   263  	if i == 0 && len(p.field) > 1 && bytes.HasSuffix(p.field[1], []byte("First>")) {
   264  		if p.parsedRange {
   265  			return p.rangeStart, p.rangeEnd
   266  		}
   267  		mf := reRange.FindStringSubmatch(p.scanner.Text())
   268  		if mf == nil || !p.scanner.Scan() {
   269  			p.setError(errIncorrectLegacyRange)
   270  			return x, x
   271  		}
   272  		// Using Bytes would be more efficient here, but Text is a lot easier
   273  		// and this is not a frequent case.
   274  		ml := reRange.FindStringSubmatch(p.scanner.Text())
   275  		if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] {
   276  			p.setError(errIncorrectLegacyRange)
   277  			return x, x
   278  		}
   279  		p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Bytes()[:len(ml[1])])
   280  		p.parsedRange = true
   281  		return p.rangeStart, p.rangeEnd
   282  	}
   283  	return x, x
   284  }
   285  
// bools recognizes all valid UCD boolean values. The keys are the accepted
// spellings (matched exactly, including case); an empty field counts as false.
var bools = map[string]bool{
	"":      false,
	"N":     false,
	"No":    false,
	"F":     false,
	"False": false,
	"Y":     true,
	"Yes":   true,
	"T":     true,
	"True":  true,
}
   298  
   299  // Bool parses and returns field i as a boolean value.
   300  func (p *Parser) Bool(i int) bool {
   301  	b := p.getField(i)
   302  	for s, v := range bools {
   303  		if bstrEq(b, s) {
   304  			return v
   305  		}
   306  	}
   307  	p.setError(strconv.ErrSyntax)
   308  	return false
   309  }
   310  
   311  // Int parses and returns field i as an integer value.
   312  func (p *Parser) Int(i int) int {
   313  	x, err := strconv.ParseInt(string(p.getField(i)), 10, 64)
   314  	p.setError(err)
   315  	return int(x)
   316  }
   317  
   318  // Uint parses and returns field i as an unsigned integer value.
   319  func (p *Parser) Uint(i int) uint {
   320  	x, err := strconv.ParseUint(string(p.getField(i)), 10, 64)
   321  	p.setError(err)
   322  	return uint(x)
   323  }
   324  
   325  // Float parses and returns field i as a decimal value.
   326  func (p *Parser) Float(i int) float64 {
   327  	x, err := strconv.ParseFloat(string(p.getField(i)), 64)
   328  	p.setError(err)
   329  	return x
   330  }
   331  
   332  // String parses and returns field i as a string value.
   333  func (p *Parser) String(i int) string {
   334  	return string(p.getField(i))
   335  }
   336  
   337  // Strings parses and returns field i as a space-separated list of strings.
   338  func (p *Parser) Strings(i int) []string {
   339  	ss := strings.Split(string(p.getField(i)), " ")
   340  	for i, s := range ss {
   341  		ss[i] = strings.TrimSpace(s)
   342  	}
   343  	return ss
   344  }
   345  
   346  // Comment returns the comments for the current line.
   347  func (p *Parser) Comment() string {
   348  	return string(p.comment)
   349  }
   350  
// errUndefinedEnum is recorded by Enum when a field matches none of the
// permitted values.
var errUndefinedEnum = errors.New("ucd: undefined enum value")
   352  
   353  // Enum interprets and returns field i as a value that must be one of the values
   354  // in enum.
   355  func (p *Parser) Enum(i int, enum ...string) string {
   356  	b := p.getField(i)
   357  	for _, s := range enum {
   358  		if bstrEq(b, s) {
   359  			return s
   360  		}
   361  	}
   362  	p.setError(errUndefinedEnum)
   363  	return ""
   364  }
   365  
   366  func bstrEq(b []byte, s string) bool {
   367  	if len(b) != len(s) {
   368  		return false
   369  	}
   370  	for i, c := range b {
   371  		if c != s[i] {
   372  			return false
   373  		}
   374  	}
   375  	return true
   376  }