github.com/searKing/golang/go@v1.2.117/go/scanner/scanner.go (about)

     1  // Copyright 2020 The searKing Author. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package scanner
     6  
     7  import (
     8  	"bufio"
     9  	"bytes"
    10  	"fmt"
    11  	"go/token"
    12  	"path/filepath"
    13  	"regexp"
    14  	"strings"
    15  	"unicode"
    16  	"unicode/utf8"
    17  )
    18  
// Mode is a set of flags (or 0).
// The flags control how the scanner matches expected text.
type Mode uint

const (
	// ModeCaseSensitive, when set, makes PeekString accept a case-folded
	// (strings.EqualFold) match in addition to an exact match.
	// NOTE(review): the name suggests the opposite meaning — confirm intent
	// against callers before relying on it.
	ModeCaseSensitive Mode = 1 << iota
	// ModeRegexpPerl makes PeekRegexpAny interpret its arguments as regular
	// expressions in Go's default (Perl-like) syntax.
	ModeRegexpPerl
	// ModeRegexpPosix makes PeekRegexpAny interpret its arguments as POSIX
	// regular expressions. It takes precedence over ModeRegexpPerl when both
	// flags are set.
	ModeRegexpPosix
)
    28  
// An ErrorHandler may be provided to Scanner.Init. If an error is
// encountered while scanning and a handler was installed, the handler is
// called with a position and an error message. The position is resolved
// from the byte offset at which the scanner reported the problem.
// Scanner.ErrorCount is incremented for every reported error, whether or
// not a handler is installed.
type ErrorHandler func(pos token.Position, msg string)
    34  
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
//
// The "current token" is the byte range src[offset:rdOffset]; the Next*
// methods move that window forward and the Current* methods read it.
type Scanner struct {
	// immutable state (set by Init)
	file *token.File  // source file handle, used for positions and line info
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	offset     int // start offset of the current token
	rdOffset   int // reading offset (position after the current token)
	lineOffset int // current line offset

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
    54  
const bom = 0xFEFF // byte order mark (U+FEFF); reported as an error anywhere but at offset 0
    56  
    57  func (s *Scanner) AtEOF() bool {
    58  	return s.rdOffset >= len(s.src)
    59  }
    60  
    61  func (s *Scanner) CurrentBytes() []byte {
    62  	return s.src[s.offset:s.rdOffset]
    63  }
    64  
    65  func (s *Scanner) CurrentString() string {
    66  	return string(s.CurrentBytes())
    67  }
    68  
    69  func (s *Scanner) CurrentRunes() []rune {
    70  	return []rune(s.CurrentString())
    71  }
    72  
    73  func (s *Scanner) CurrentRune() rune {
    74  	runes := s.CurrentRunes()
    75  	if len(runes) > 0 {
    76  		return runes[0]
    77  	}
    78  	return -1
    79  }
    80  
    81  func (s *Scanner) CurrentLength() int {
    82  	return s.rdOffset - s.offset
    83  }
    84  
    85  // walk until current is consumed
    86  func (s *Scanner) Consume() {
    87  	chars := s.CurrentBytes()
    88  	if len(chars) == 0 {
    89  		return
    90  	}
    91  
    92  	lines := bytes.Split(chars, []byte{'\n'})
    93  	var hasCL bool
    94  	if len(lines) > 1 {
    95  		hasCL = true
    96  	}
    97  
    98  	for _, line := range lines {
    99  		lineLen := len(line)
   100  		if hasCL {
   101  			lineLen++
   102  			s.lineOffset = s.offset
   103  			s.file.AddLine(s.offset)
   104  		}
   105  
   106  		s.offset = s.offset + lineLen
   107  	}
   108  	s.offset = s.rdOffset
   109  }
   110  
   111  func (s *Scanner) NextByte() {
   112  	s.NextBytesN(1)
   113  }
   114  
   115  func (s *Scanner) NextBytesN(n int) {
   116  	s.Consume()
   117  	if s.rdOffset+n <= len(s.src) {
   118  		s.rdOffset += n
   119  	} else {
   120  		s.offset = len(s.src)
   121  	}
   122  }
   123  
   124  // Read the NextRune Unicode char into s.ch.
   125  // s.AtEOF() == true means end-of-file.
   126  func (s *Scanner) NextRune() {
   127  	if s.rdOffset < len(s.src) {
   128  		s.Consume()
   129  		r, w := rune(s.src[s.rdOffset]), 1
   130  		switch {
   131  		case r == 0:
   132  			s.error(s.offset, "illegal character NUL")
   133  		case r >= utf8.RuneSelf:
   134  			// not ASCII
   135  			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
   136  			if r == utf8.RuneError && w == 1 {
   137  				s.error(s.offset, "illegal UTF-8 encoding")
   138  			} else if r == bom && s.offset > 0 {
   139  				s.error(s.offset, "illegal byte order mark")
   140  			}
   141  		}
   142  		s.rdOffset += w
   143  	} else {
   144  		s.Consume()
   145  		s.offset = len(s.src)
   146  	}
   147  }
   148  
   149  func (s *Scanner) PeekRune() rune {
   150  	if s.rdOffset < len(s.src) {
   151  		r, w := rune(s.src[s.rdOffset]), 1
   152  		switch {
   153  		case r == 0:
   154  			s.error(s.offset, "illegal character NUL")
   155  		case r >= utf8.RuneSelf:
   156  			// not ASCII
   157  			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
   158  			if r == utf8.RuneError && w == 1 {
   159  				s.error(s.offset, "illegal UTF-8 encoding")
   160  			} else if r == bom && s.offset > 0 {
   161  				s.error(s.offset, "illegal byte order mark")
   162  			}
   163  		}
   164  		return r
   165  	}
   166  	return -1
   167  }
   168  
   169  // PeekByte returns the byte following the most recently read character without
   170  // advancing the scanner. If the scanner is at EOF, PeekByte returns 0.
   171  func (s *Scanner) PeekByte() byte {
   172  	if s.rdOffset < len(s.src) {
   173  		return s.src[s.rdOffset]
   174  	}
   175  	return 0
   176  }
   177  
   178  // Read the NextRune Unicode chars into s.ch.
   179  // s.ch < 0 means end-of-file.
   180  func (s *Scanner) NextRunesN(n int) {
   181  	offsetBegin := s.rdOffset
   182  
   183  	for i := 0; i < n; i++ {
   184  		s.NextRune()
   185  	}
   186  	s.offset = offsetBegin
   187  }
   188  
   189  // Read the NextRune Unicode chars into s.ch.
   190  // s.ch < 0 means end-of-file.
   191  func (s *Scanner) NextRegexp(expectStrs ...string) {
   192  	match := s.PeekRegexpAny()
   193  	if match == "" {
   194  		return
   195  	}
   196  	offsetBegin := s.rdOffset
   197  
   198  	for range match {
   199  		s.NextRune()
   200  	}
   201  	s.offset = offsetBegin
   202  }
   203  
   204  // PeekRegexpAny returns the string following the most recently read character which matches the regexp case without
   205  // advancing the scanner. If the scanner is at EOF or regexp unmatched, PeekRegexpAny returns nil.
   206  func (s *Scanner) PeekRegexpAny(expectStrs ...string) string {
   207  	if s.AtEOF() {
   208  		return ""
   209  	}
   210  	if s.mode&ModeRegexpPosix != 0 {
   211  		return s.peekRegexpPosix(expectStrs...)
   212  	} else if s.mode&ModeRegexpPerl != 0 {
   213  		return s.peekRegexpPerl(expectStrs...)
   214  	}
   215  
   216  	return s.PeekString(expectStrs...)
   217  }
   218  
   219  func (s *Scanner) PeekString(expectStrs ...string) string {
   220  	if s.AtEOF() {
   221  		return ""
   222  	}
   223  
   224  	// regex mode
   225  	for _, expect := range expectStrs {
   226  		endPos := s.rdOffset + len(expect)
   227  		if endPos > len(s.src) {
   228  			continue
   229  		}
   230  		selected := s.src[s.rdOffset:endPos]
   231  		if string(selected) == expect {
   232  			return string(selected)
   233  		}
   234  
   235  		if ((s.mode&ModeCaseSensitive != 0) && strings.EqualFold(string(selected), expect)) ||
   236  			string(selected) == expect {
   237  			return string(selected)
   238  		}
   239  	}
   240  	return ""
   241  }
   242  
   243  func (s *Scanner) peekRegexpPosix(expectStrs ...string) string {
   244  	if s.AtEOF() {
   245  		return ""
   246  	}
   247  
   248  	// regex mode
   249  	for _, expect := range expectStrs {
   250  		expect = "^" + strings.TrimPrefix(expect, "^")
   251  
   252  		reg := regexp.MustCompilePOSIX(expect)
   253  		matches := reg.FindStringSubmatch(string(s.src[s.rdOffset:]))
   254  		if len(matches) == 0 {
   255  			continue
   256  		}
   257  
   258  		return matches[0]
   259  	}
   260  	return ""
   261  }
   262  
   263  func (s *Scanner) peekRegexpPerl(expectStrs ...string) string {
   264  	if s.AtEOF() {
   265  		return ""
   266  	}
   267  
   268  	// regex mode
   269  	for _, expect := range expectStrs {
   270  		expect = "^" + strings.TrimPrefix(expect, "^")
   271  
   272  		reg := regexp.MustCompile(expect)
   273  		matches := reg.FindStringSubmatch(string(s.src[s.rdOffset:]))
   274  		if len(matches) == 0 {
   275  			continue
   276  		}
   277  
   278  		return matches[0]
   279  	}
   280  	return ""
   281  }
   282  
   283  // Init prepares the scanner s to tokenize the text src by setting the
   284  // scanner at the beginning of src. The scanner uses the file set file
   285  // for position information and it adds line information for each line.
   286  // It is ok to re-use the same file when re-scanning the same file as
   287  // line information which is already present is ignored. Init causes a
   288  // panic if the file size does not match the src size.
   289  //
   290  // Calls to Scan will invoke the error handler err if they encounter a
   291  // syntax error and err is not nil. Also, for each error encountered,
   292  // the Scanner field ErrorCount is incremented by one. The mode parameter
   293  // determines how comments are handled.
   294  //
   295  // Note that Init may call err if there is an error in the first character
   296  // of the file.
   297  func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
   298  	// Explicitly initialize all fields since a scanner may be reused.
   299  	if file.Size() != len(src) {
   300  		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
   301  	}
   302  	s.file = file
   303  	s.dir, _ = filepath.Split(file.Name())
   304  	s.src = src
   305  	s.err = err
   306  	s.mode = mode
   307  
   308  	s.offset = 0
   309  	s.rdOffset = 0
   310  	s.lineOffset = 0
   311  	s.ErrorCount = 0
   312  
   313  	if s.PeekRune() == bom {
   314  		s.NextRune() // ignore BOM at file beginning
   315  	}
   316  }
   317  
   318  func (s *Scanner) error(offs int, msg string) {
   319  	if s.err != nil {
   320  		s.err(s.file.Position(s.file.Pos(offs)), msg)
   321  	}
   322  	s.ErrorCount++
   323  }
   324  
   325  func digitVal(ch rune) int {
   326  	switch {
   327  	case '0' <= ch && ch <= '9':
   328  		return int(ch - '0')
   329  	case 'a' <= ch && ch <= 'f':
   330  		return int(ch - 'a' + 10)
   331  	case 'A' <= ch && ch <= 'F':
   332  		return int(ch - 'A' + 10)
   333  	}
   334  	return 16 // larger than any legal digit val
   335  }
   336  
   337  // ScanEscape parses an escape sequence where rune is the accepted
   338  // escaped quote. In case of a syntax error, it stops at the offending
   339  // character (without consuming it) and returns false. Otherwise
   340  // it returns true.
   341  func (s *Scanner) ScanEscape(quote rune) bool {
   342  	offs := s.offset
   343  
   344  	var ch = s.CurrentRune()
   345  
   346  	var n int
   347  	var base, max uint32
   348  	switch ch {
   349  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   350  		s.NextRune()
   351  		return true
   352  	case '0', '1', '2', '3', '4', '5', '6', '7':
   353  		n, base, max = 3, 8, 255
   354  	case 'x':
   355  		s.NextRune()
   356  		n, base, max = 2, 16, 255
   357  	case 'u':
   358  		s.NextRune()
   359  		n, base, max = 4, 16, unicode.MaxRune
   360  	case 'U':
   361  		s.NextRune()
   362  		n, base, max = 8, 16, unicode.MaxRune
   363  	default:
   364  		msg := "unknown escape sequence"
   365  		if ch < 0 {
   366  			msg = "escape sequence not terminated"
   367  		}
   368  		s.error(offs, msg)
   369  		return false
   370  	}
   371  
   372  	var x uint32
   373  	for n > 0 {
   374  		d := uint32(digitVal(ch))
   375  		if d >= base {
   376  			msg := fmt.Sprintf("illegal character %#U in escape sequence", ch)
   377  			if ch < 0 {
   378  				msg = "escape sequence not terminated"
   379  			}
   380  			s.error(s.offset, msg)
   381  			return false
   382  		}
   383  		x = x*base + d
   384  		s.NextRune()
   385  		n--
   386  	}
   387  
   388  	if x > max || 0xD800 <= x && x < 0xE000 {
   389  		s.error(offs, "escape sequence is invalid Unicode code point")
   390  		return false
   391  	}
   392  
   393  	return true
   394  }
   395  
   396  func (s *Scanner) ScanRune() string {
   397  	// '\'' opening already consumed
   398  	offs := s.offset - 1
   399  
   400  	valid := true
   401  	n := 0
   402  	for {
   403  		var ch = s.CurrentRune()
   404  
   405  		if ch == '\n' || ch < 0 {
   406  			// only report error if we don't have one already
   407  			if valid {
   408  				s.error(offs, "rune literal not terminated")
   409  				valid = false
   410  			}
   411  			break
   412  		}
   413  		s.NextRune()
   414  		if ch == '\'' {
   415  			break
   416  		}
   417  		n++
   418  		if ch == '\\' {
   419  			if !s.ScanEscape('\'') {
   420  				valid = false
   421  			}
   422  			// continue to read to closing quote
   423  		}
   424  	}
   425  
   426  	if valid && n != 1 {
   427  		s.error(offs, "illegal rune literal")
   428  	}
   429  
   430  	return string(s.src[offs:s.offset])
   431  }
   432  
   433  func (s *Scanner) ScanString() string {
   434  	// '"' opening already consumed
   435  	offs := s.offset - 1
   436  
   437  	for {
   438  		var ch = s.CurrentRune()
   439  		if ch == '\n' || ch < 0 {
   440  			s.error(offs, "string literal not terminated")
   441  			break
   442  		}
   443  		s.NextRune()
   444  		if ch == '"' {
   445  			break
   446  		}
   447  		if ch == '\\' {
   448  			s.ScanEscape('"')
   449  		}
   450  	}
   451  
   452  	return string(s.src[offs:s.offset])
   453  }
   454  
   455  func stripCR(b []byte, comment bool) []byte {
   456  	c := make([]byte, len(b))
   457  	i := 0
   458  	for j, ch := range b {
   459  		// In a /*-style comment, don't strip \r from *\r/ (incl.
   460  		// sequences of \r from *\r\r...\r/) since the resulting
   461  		// */ would terminate the comment too early unless the \r
   462  		// is immediately following the opening /* in which case
   463  		// it's ok because /*/ is not closed yet (issue #11151).
   464  		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
   465  			c[i] = ch
   466  			i++
   467  		}
   468  	}
   469  	return c[:i]
   470  }
   471  
   472  func (s *Scanner) ScanRawString() string {
   473  	// '`' opening already consumed
   474  	offs := s.offset - 1
   475  
   476  	hasCR := false
   477  	for {
   478  		var ch = s.CurrentRune()
   479  		if ch < 0 {
   480  			s.error(offs, "raw string literal not terminated")
   481  			break
   482  		}
   483  		s.NextRune()
   484  		if ch == '`' {
   485  			break
   486  		}
   487  		if ch == '\r' {
   488  			hasCR = true
   489  		}
   490  	}
   491  
   492  	lit := s.src[offs:s.offset]
   493  	if hasCR {
   494  		lit = stripCR(lit, false)
   495  	}
   496  
   497  	return string(lit)
   498  }
   499  
   500  func (s *Scanner) ScanLine() string {
   501  	// '"' opening already consumed
   502  	offs := s.offset
   503  
   504  	for {
   505  		var ch = s.CurrentRune()
   506  		if ch < 0 {
   507  			s.error(offs, "string literal not terminated")
   508  			break
   509  		}
   510  		s.NextRune()
   511  		if ch == '\n' {
   512  			break
   513  		}
   514  	}
   515  
   516  	return string(s.src[offs:s.offset])
   517  }
   518  
   519  // ScanSplits advances the Scanner to the next token by splits when first meet, which will then be
   520  // available through the Bytes or Text method. It returns false when the
   521  // scan stops, either by reaching the end of the input or an error.
   522  // After Scan returns false, the Err method will return any error that
   523  // occurred during scanning, except that if it was io.EOF, Err
   524  // will return nil.
   525  func (s *Scanner) ScanSplits(splits ...bufio.SplitFunc) ([]byte, bool) {
   526  	s.Consume()
   527  
   528  	for _, split := range splits {
   529  		if split == nil {
   530  			continue
   531  		}
   532  		// See if we can get a token with what we already have.
   533  		// If we've run out of data but have an error, give the split function
   534  		// a chance to recover any remaining, possibly empty token.
   535  		// atEOF is true always, for we consume by a byte slice
   536  		advance, token, err := split(s.src[s.rdOffset:], true)
   537  		if err != nil && err != bufio.ErrFinalToken {
   538  			s.error(s.offset, err.Error())
   539  			return nil, false
   540  		}
   541  		s.NextBytesN(advance)
   542  		if len(token) != 0 {
   543  			return token, true
   544  		}
   545  	}
   546  	return nil, false
   547  }