github.com/searKing/golang/go@v1.2.74/go/scanner/scanner.go (about)

     1  // Copyright 2020 The searKing Author. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package scanner
     6  
     7  import (
     8  	"bufio"
     9  	"bytes"
    10  	"fmt"
    11  	"go/token"
    12  	"path/filepath"
    13  	"regexp"
    14  	"strings"
    15  	"unicode"
    16  	"unicode/utf8"
    17  )
    18  
// Mode is a set of bit flags (or 0) that controls scanner behavior.
type Mode uint

// Flags that may be combined in a Mode value.
const (
	// ModeCaseSensitive is consulted by PeekString. With the flag set,
	// PeekString additionally accepts candidates that differ from the
	// expected string only by letter case (strings.EqualFold).
	// NOTE(review): that reads as the opposite of the flag's name — confirm
	// the intended meaning with the callers.
	ModeCaseSensitive Mode = 1 << iota
	// ModeRegexpPerl makes PeekRegexpAny treat its arguments as regular
	// expressions in Go's default regexp syntax.
	ModeRegexpPerl
	// ModeRegexpPosix makes PeekRegexpAny treat its arguments as POSIX
	// regular expressions. It is checked before ModeRegexpPerl, so it wins
	// when both flags are set.
	ModeRegexpPosix
)
    29  
// An ErrorHandler may be provided to Scanner.Init. If an error is
// encountered while scanning and a handler was installed, the handler is
// called with a position and an error message. The position is derived
// from the byte offset the scanner associates with the error, usually the
// beginning of the offending token.
type ErrorHandler func(pos token.Position, msg string)
    36  
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
//
// The "current token" is the byte range src[offset:rdOffset]; the Next*
// methods move that window forward and the Peek* methods look at the bytes
// that follow it without moving it.
type Scanner struct {
	// immutable state
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	offset     int // start of the current token (byte offset into src)
	rdOffset   int // reading offset (end of the current token; next byte to read)
	lineOffset int // line offset most recently recorded by Consume

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
    57  
// bom is the Unicode byte order mark (U+FEFF). NextRune and PeekRune report
// it as an error unless the current token starts at offset 0.
const bom = 0xFEFF // byte order mark, only permitted as very first character
    59  
    60  func (s *Scanner) AtEOF() bool {
    61  	return s.rdOffset >= len(s.src)
    62  }
    63  
    64  func (s *Scanner) CurrentBytes() []byte {
    65  	return s.src[s.offset:s.rdOffset]
    66  }
    67  
    68  func (s *Scanner) CurrentString() string {
    69  	return string(s.CurrentBytes())
    70  }
    71  
    72  func (s *Scanner) CurrentRunes() []rune {
    73  	return []rune(s.CurrentString())
    74  }
    75  
    76  func (s *Scanner) CurrentRune() rune {
    77  	runes := s.CurrentRunes()
    78  	if len(runes) > 0 {
    79  		return runes[0]
    80  	}
    81  	return -1
    82  }
    83  
    84  func (s *Scanner) CurrentLength() int {
    85  	return s.rdOffset - s.offset
    86  }
    87  
    88  // walk until current is consumed
    89  func (s *Scanner) Consume() {
    90  	chars := s.CurrentBytes()
    91  	if len(chars) == 0 {
    92  		return
    93  	}
    94  
    95  	lines := bytes.Split(chars, []byte{'\n'})
    96  	var hasCL bool
    97  	if len(lines) > 1 {
    98  		hasCL = true
    99  	}
   100  
   101  	for _, line := range lines {
   102  		lineLen := len(line)
   103  		if hasCL {
   104  			lineLen++
   105  			s.lineOffset = s.offset
   106  			s.file.AddLine(s.offset)
   107  		}
   108  
   109  		s.offset = s.offset + lineLen
   110  	}
   111  	s.offset = s.rdOffset
   112  }
   113  
   114  func (s *Scanner) NextByte() {
   115  	s.NextBytesN(1)
   116  }
   117  
   118  func (s *Scanner) NextBytesN(n int) {
   119  	s.Consume()
   120  	if s.rdOffset+n <= len(s.src) {
   121  		s.rdOffset += n
   122  	} else {
   123  		s.offset = len(s.src)
   124  	}
   125  }
   126  
   127  // Read the NextRune Unicode char into s.ch.
   128  // s.AtEOF() == true means end-of-file.
   129  func (s *Scanner) NextRune() {
   130  	if s.rdOffset < len(s.src) {
   131  		s.Consume()
   132  		r, w := rune(s.src[s.rdOffset]), 1
   133  		switch {
   134  		case r == 0:
   135  			s.error(s.offset, "illegal character NUL")
   136  		case r >= utf8.RuneSelf:
   137  			// not ASCII
   138  			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
   139  			if r == utf8.RuneError && w == 1 {
   140  				s.error(s.offset, "illegal UTF-8 encoding")
   141  			} else if r == bom && s.offset > 0 {
   142  				s.error(s.offset, "illegal byte order mark")
   143  			}
   144  		}
   145  		s.rdOffset += w
   146  	} else {
   147  		s.Consume()
   148  		s.offset = len(s.src)
   149  	}
   150  }
   151  
   152  func (s *Scanner) PeekRune() rune {
   153  	if s.rdOffset < len(s.src) {
   154  		r, w := rune(s.src[s.rdOffset]), 1
   155  		switch {
   156  		case r == 0:
   157  			s.error(s.offset, "illegal character NUL")
   158  		case r >= utf8.RuneSelf:
   159  			// not ASCII
   160  			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
   161  			if r == utf8.RuneError && w == 1 {
   162  				s.error(s.offset, "illegal UTF-8 encoding")
   163  			} else if r == bom && s.offset > 0 {
   164  				s.error(s.offset, "illegal byte order mark")
   165  			}
   166  		}
   167  		return r
   168  	}
   169  	return -1
   170  }
   171  
   172  // PeekByte returns the byte following the most recently read character without
   173  // advancing the scanner. If the scanner is at EOF, PeekByte returns 0.
   174  func (s *Scanner) PeekByte() byte {
   175  	if s.rdOffset < len(s.src) {
   176  		return s.src[s.rdOffset]
   177  	}
   178  	return 0
   179  }
   180  
   181  // Read the NextRune Unicode chars into s.ch.
   182  // s.ch < 0 means end-of-file.
   183  //
   184  func (s *Scanner) NextRunesN(n int) {
   185  	offsetBegin := s.rdOffset
   186  
   187  	for i := 0; i < n; i++ {
   188  		s.NextRune()
   189  	}
   190  	s.offset = offsetBegin
   191  }
   192  
   193  // Read the NextRune Unicode chars into s.ch.
   194  // s.ch < 0 means end-of-file.
   195  //
   196  func (s *Scanner) NextRegexp(expectStrs ...string) {
   197  	match := s.PeekRegexpAny()
   198  	if match == "" {
   199  		return
   200  	}
   201  	offsetBegin := s.rdOffset
   202  
   203  	for range match {
   204  		s.NextRune()
   205  	}
   206  	s.offset = offsetBegin
   207  }
   208  
   209  // PeekRegexpAny returns the string following the most recently read character which matches the regexp case without
   210  // advancing the scanner. If the scanner is at EOF or regexp unmatched, PeekRegexpAny returns nil.
   211  func (s *Scanner) PeekRegexpAny(expectStrs ...string) string {
   212  	if s.AtEOF() {
   213  		return ""
   214  	}
   215  	if s.mode&ModeRegexpPosix != 0 {
   216  		return s.peekRegexpPosix(expectStrs...)
   217  	} else if s.mode&ModeRegexpPerl != 0 {
   218  		return s.peekRegexpPerl(expectStrs...)
   219  	}
   220  
   221  	return s.PeekString(expectStrs...)
   222  }
   223  
   224  func (s *Scanner) PeekString(expectStrs ...string) string {
   225  	if s.AtEOF() {
   226  		return ""
   227  	}
   228  
   229  	// regex mode
   230  	for _, expect := range expectStrs {
   231  		endPos := s.rdOffset + len(expect)
   232  		if endPos > len(s.src) {
   233  			continue
   234  		}
   235  		selected := s.src[s.rdOffset:endPos]
   236  		if string(selected) == expect {
   237  			return string(selected)
   238  		}
   239  
   240  		if ((s.mode&ModeCaseSensitive != 0) && strings.EqualFold(string(selected), expect)) ||
   241  			string(selected) == expect {
   242  			return string(selected)
   243  		}
   244  	}
   245  	return ""
   246  }
   247  
   248  func (s *Scanner) peekRegexpPosix(expectStrs ...string) string {
   249  	if s.AtEOF() {
   250  		return ""
   251  	}
   252  
   253  	// regex mode
   254  	for _, expect := range expectStrs {
   255  		expect = "^" + strings.TrimPrefix(expect, "^")
   256  
   257  		reg := regexp.MustCompilePOSIX(expect)
   258  		matches := reg.FindStringSubmatch(string(s.src[s.rdOffset:]))
   259  		if len(matches) == 0 {
   260  			continue
   261  		}
   262  
   263  		return matches[0]
   264  	}
   265  	return ""
   266  }
   267  
   268  func (s *Scanner) peekRegexpPerl(expectStrs ...string) string {
   269  	if s.AtEOF() {
   270  		return ""
   271  	}
   272  
   273  	// regex mode
   274  	for _, expect := range expectStrs {
   275  		expect = "^" + strings.TrimPrefix(expect, "^")
   276  
   277  		reg := regexp.MustCompile(expect)
   278  		matches := reg.FindStringSubmatch(string(s.src[s.rdOffset:]))
   279  		if len(matches) == 0 {
   280  			continue
   281  		}
   282  
   283  		return matches[0]
   284  	}
   285  	return ""
   286  }
   287  
   288  // Init prepares the scanner s to tokenize the text src by setting the
   289  // scanner at the beginning of src. The scanner uses the file set file
   290  // for position information and it adds line information for each line.
   291  // It is ok to re-use the same file when re-scanning the same file as
   292  // line information which is already present is ignored. Init causes a
   293  // panic if the file size does not match the src size.
   294  //
   295  // Calls to Scan will invoke the error handler err if they encounter a
   296  // syntax error and err is not nil. Also, for each error encountered,
   297  // the Scanner field ErrorCount is incremented by one. The mode parameter
   298  // determines how comments are handled.
   299  //
   300  // Note that Init may call err if there is an error in the first character
   301  // of the file.
   302  //
   303  func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
   304  	// Explicitly initialize all fields since a scanner may be reused.
   305  	if file.Size() != len(src) {
   306  		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
   307  	}
   308  	s.file = file
   309  	s.dir, _ = filepath.Split(file.Name())
   310  	s.src = src
   311  	s.err = err
   312  	s.mode = mode
   313  
   314  	s.offset = 0
   315  	s.rdOffset = 0
   316  	s.lineOffset = 0
   317  	s.ErrorCount = 0
   318  
   319  	if s.PeekRune() == bom {
   320  		s.NextRune() // ignore BOM at file beginning
   321  	}
   322  }
   323  
   324  func (s *Scanner) error(offs int, msg string) {
   325  	if s.err != nil {
   326  		s.err(s.file.Position(s.file.Pos(offs)), msg)
   327  	}
   328  	s.ErrorCount++
   329  }
   330  
   331  func digitVal(ch rune) int {
   332  	switch {
   333  	case '0' <= ch && ch <= '9':
   334  		return int(ch - '0')
   335  	case 'a' <= ch && ch <= 'f':
   336  		return int(ch - 'a' + 10)
   337  	case 'A' <= ch && ch <= 'F':
   338  		return int(ch - 'A' + 10)
   339  	}
   340  	return 16 // larger than any legal digit val
   341  }
   342  
   343  // ScanEscape parses an escape sequence where rune is the accepted
   344  // escaped quote. In case of a syntax error, it stops at the offending
   345  // character (without consuming it) and returns false. Otherwise
   346  // it returns true.
   347  func (s *Scanner) ScanEscape(quote rune) bool {
   348  	offs := s.offset
   349  
   350  	var ch = s.CurrentRune()
   351  
   352  	var n int
   353  	var base, max uint32
   354  	switch ch {
   355  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   356  		s.NextRune()
   357  		return true
   358  	case '0', '1', '2', '3', '4', '5', '6', '7':
   359  		n, base, max = 3, 8, 255
   360  	case 'x':
   361  		s.NextRune()
   362  		n, base, max = 2, 16, 255
   363  	case 'u':
   364  		s.NextRune()
   365  		n, base, max = 4, 16, unicode.MaxRune
   366  	case 'U':
   367  		s.NextRune()
   368  		n, base, max = 8, 16, unicode.MaxRune
   369  	default:
   370  		msg := "unknown escape sequence"
   371  		if ch < 0 {
   372  			msg = "escape sequence not terminated"
   373  		}
   374  		s.error(offs, msg)
   375  		return false
   376  	}
   377  
   378  	var x uint32
   379  	for n > 0 {
   380  		d := uint32(digitVal(ch))
   381  		if d >= base {
   382  			msg := fmt.Sprintf("illegal character %#U in escape sequence", ch)
   383  			if ch < 0 {
   384  				msg = "escape sequence not terminated"
   385  			}
   386  			s.error(s.offset, msg)
   387  			return false
   388  		}
   389  		x = x*base + d
   390  		s.NextRune()
   391  		n--
   392  	}
   393  
   394  	if x > max || 0xD800 <= x && x < 0xE000 {
   395  		s.error(offs, "escape sequence is invalid Unicode code point")
   396  		return false
   397  	}
   398  
   399  	return true
   400  }
   401  
   402  func (s *Scanner) ScanRune() string {
   403  	// '\'' opening already consumed
   404  	offs := s.offset - 1
   405  
   406  	valid := true
   407  	n := 0
   408  	for {
   409  		var ch = s.CurrentRune()
   410  
   411  		if ch == '\n' || ch < 0 {
   412  			// only report error if we don't have one already
   413  			if valid {
   414  				s.error(offs, "rune literal not terminated")
   415  				valid = false
   416  			}
   417  			break
   418  		}
   419  		s.NextRune()
   420  		if ch == '\'' {
   421  			break
   422  		}
   423  		n++
   424  		if ch == '\\' {
   425  			if !s.ScanEscape('\'') {
   426  				valid = false
   427  			}
   428  			// continue to read to closing quote
   429  		}
   430  	}
   431  
   432  	if valid && n != 1 {
   433  		s.error(offs, "illegal rune literal")
   434  	}
   435  
   436  	return string(s.src[offs:s.offset])
   437  }
   438  
   439  func (s *Scanner) ScanString() string {
   440  	// '"' opening already consumed
   441  	offs := s.offset - 1
   442  
   443  	for {
   444  		var ch = s.CurrentRune()
   445  		if ch == '\n' || ch < 0 {
   446  			s.error(offs, "string literal not terminated")
   447  			break
   448  		}
   449  		s.NextRune()
   450  		if ch == '"' {
   451  			break
   452  		}
   453  		if ch == '\\' {
   454  			s.ScanEscape('"')
   455  		}
   456  	}
   457  
   458  	return string(s.src[offs:s.offset])
   459  }
   460  
   461  func stripCR(b []byte, comment bool) []byte {
   462  	c := make([]byte, len(b))
   463  	i := 0
   464  	for j, ch := range b {
   465  		// In a /*-style comment, don't strip \r from *\r/ (incl.
   466  		// sequences of \r from *\r\r...\r/) since the resulting
   467  		// */ would terminate the comment too early unless the \r
   468  		// is immediately following the opening /* in which case
   469  		// it's ok because /*/ is not closed yet (issue #11151).
   470  		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
   471  			c[i] = ch
   472  			i++
   473  		}
   474  	}
   475  	return c[:i]
   476  }
   477  
   478  func (s *Scanner) ScanRawString() string {
   479  	// '`' opening already consumed
   480  	offs := s.offset - 1
   481  
   482  	hasCR := false
   483  	for {
   484  		var ch = s.CurrentRune()
   485  		if ch < 0 {
   486  			s.error(offs, "raw string literal not terminated")
   487  			break
   488  		}
   489  		s.NextRune()
   490  		if ch == '`' {
   491  			break
   492  		}
   493  		if ch == '\r' {
   494  			hasCR = true
   495  		}
   496  	}
   497  
   498  	lit := s.src[offs:s.offset]
   499  	if hasCR {
   500  		lit = stripCR(lit, false)
   501  	}
   502  
   503  	return string(lit)
   504  }
   505  
   506  func (s *Scanner) ScanLine() string {
   507  	// '"' opening already consumed
   508  	offs := s.offset
   509  
   510  	for {
   511  		var ch = s.CurrentRune()
   512  		if ch < 0 {
   513  			s.error(offs, "string literal not terminated")
   514  			break
   515  		}
   516  		s.NextRune()
   517  		if ch == '\n' {
   518  			break
   519  		}
   520  	}
   521  
   522  	return string(s.src[offs:s.offset])
   523  }
   524  
   525  // ScanSplits advances the Scanner to the next token by splits when first meet, which will then be
   526  // available through the Bytes or Text method. It returns false when the
   527  // scan stops, either by reaching the end of the input or an error.
   528  // After Scan returns false, the Err method will return any error that
   529  // occurred during scanning, except that if it was io.EOF, Err
   530  // will return nil.
   531  func (s *Scanner) ScanSplits(splits ...bufio.SplitFunc) ([]byte, bool) {
   532  	s.Consume()
   533  
   534  	for _, split := range splits {
   535  		if split == nil {
   536  			continue
   537  		}
   538  		// See if we can get a token with what we already have.
   539  		// If we've run out of data but have an error, give the split function
   540  		// a chance to recover any remaining, possibly empty token.
   541  		// atEOF is true always, for we consume by a byte slice
   542  		advance, token, err := split(s.src[s.rdOffset:], true)
   543  		if err != nil && err != bufio.ErrFinalToken {
   544  			s.error(s.offset, err.Error())
   545  			return nil, false
   546  		}
   547  		s.NextBytesN(advance)
   548  		if len(token) != 0 {
   549  			return token, true
   550  		}
   551  	}
   552  	return nil, false
   553  }