github.com/searKing/golang/go@v1.2.117/go/scanner/split.go (about)

     1  // Copyright 2020 The searKing Author. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package scanner
     6  
     7  import (
     8  	"bufio"
     9  	"bytes"
    10  	"errors"
    11  	"fmt"
    12  	"regexp"
    13  	"strings"
    14  	"unicode"
    15  	"unicode/utf8"
    16  )
    17  
    18  // Split functions
    19  var (
    20  	// ScanBytes is a split function for a Scanner that returns each byte as a token.
    21  	ScanBytes = bufio.ScanBytes
    22  
    23  	// ScanRunes is a split function for a Scanner that returns each
    24  	// UTF-8-encoded rune as a token. The sequence of runes returned is
    25  	// equivalent to that from a range loop over the input as a string, which
    26  	// means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd".
    27  	// Because of the Scan interface, this makes it impossible for the client to
    28  	// distinguish correctly encoded replacement runes from encoding errors.
    29  	ScanRunes = bufio.ScanRunes
    30  
    31  	// ScanWords is a split function for a Scanner that returns each
    32  	// space-separated word of text, with surrounding spaces deleted. It will
    33  	// never return an empty string. The definition of space is set by
    34  	// unicode.IsSpace.
    35  	ScanWords = bufio.ScanWords
    36  
    37  	// ScanLines is a split function for a Scanner that returns each line of
    38  	// text, stripped of any trailing end-of-line marker. The returned line may
    39  	// be empty. The end-of-line marker is one optional carriage return followed
    40  	// by one mandatory newline. In regular expression notation, it is `\r?\n`.
    41  	// The last non-empty line of input will be returned even if it has no
    42  	// newline.
    43  	ScanLines = bufio.ScanLines
    44  )
    45  
    46  // ScanRawStrings is a split function for a Scanner that returns each string quoted by ` of
    47  // text. The returned line may be empty. Escape is disallowed
    48  // Raw string literals are character sequences between back quotes, as in `foo`.
    49  // Within the quotes, any character may appear except back quote.
    50  // The value of a raw string literal is the string composed of the uninterpreted (implicitly UTF-8-encoded) characters
    51  // between the quotes; in particular, backslashes have no special meaning and the string may contain newlines.
    52  // Carriage return characters ('\r') inside raw string literals are discarded from the raw string value.
    53  // https://golang.org/ref/spec#String_literals
    54  // raw_string_lit         = "`" { unicode_char | newline } "`" .
    55  func ScanRawStrings(data []byte, atEOF bool) (advance int, token []byte, err error) {
    56  	return scanStrings(data, atEOF, '`')
    57  }
    58  
    59  // ScanInterpretedStrings is a split function for a Scanner that returns each string quoted by " of
    60  // text. The returned line may be empty.
    61  // Interpreted string literals are character sequences between double quotes, as in "bar".
    62  // Within the quotes, any character may appear except newline and unescaped double quote.
    63  // The text between the quotes forms the value of the literal,
    64  // with backslash escapes interpreted as they are in rune literals
    65  // (except that \' is illegal and \" is legal), with the same restrictions.
    66  // The three-digit octal (\nnn) and two-digit hexadecimal (\xnn)
    67  // escapes represent individual bytes of the resulting string;
    68  // all other escapes represent the (possibly multi-byte) UTF-8 encoding of individual characters. Thus inside a string
    69  // literal \377 and \xFF represent a single byte of value 0xFF=255, while ΓΏ, \u00FF, \U000000FF and \xc3\xbf represent
    70  // the two bytes 0xc3 0xbf of the UTF-8 encoding of character U+00FF.
    71  // https://golang.org/ref/spec#String_literals
    72  // interpreted_string_lit = `"` { unicode_value | byte_value } `"` .
    73  func ScanInterpretedStrings(data []byte, atEOF bool) (advance int, token []byte, err error) {
    74  	return scanStrings(data, atEOF, '"')
    75  }
    76  
    77  // ScanEscapes is a split function wrapper for a Scanner that returns each string which is an escape format of
    78  // text. The returned line may be empty.
    79  func ScanEscapes(quote rune) func(data []byte, atEOF bool) (advance int, token []byte, err error) {
    80  	return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
    81  		return scanEscapes(data, atEOF, quote)
    82  	}
    83  }
    84  
    85  // ScanMantissas is a split function wrapper for a Scanner that returns each string which is an n-base number format of
    86  // text. The returned line may be empty.
    87  func ScanMantissas(base int) func(data []byte, atEOF bool) (advance int, token []byte, err error) {
    88  	return ScanWhile(func(r rune) bool {
    89  		return digitVal(r) < base
    90  	})
    91  }
    92  
    93  // https://golang.org/ref/spec#Integer_literals
    94  // https://golang.org/ref/spec#Floating-point_literals
    95  // https://golang.org/ref/spec#Imaginary_literals
    96  // ScanNumbers is a split function wrapper for a Scanner that returns each string which is an integer, floating-point
    97  // or imaginary format of text. The returned line may be empty.
    98  func ScanNumbers(data []byte, atEOF bool) (advance int, token []byte, err error) {
    99  	if atEOF && len(data) == 0 {
   100  		return needMoreData()
   101  	}
   102  	var off int
   103  	var seenSign bool
   104  	var seenDecimalPoint bool
   105  	var seenDecimalNumber bool
   106  
   107  	var lookforFraction bool
   108  	var lookforExponent bool
   109  	// First character 1: digitVal(ch) < 10.
   110  	// Handle .989 or 0x888
   111  	for {
   112  		// read a rune
   113  		advance, token, err := handleSplitError(ScanRunes(data[off:], atEOF))
   114  		off = off + advance
   115  		if err != nil || len(token) == 0 {
   116  			return advance, token, err
   117  		}
   118  		ch := bytes.Runes(token)[0]
   119  		if ch == '.' {
   120  			// . can be seen once only
   121  			if seenDecimalPoint {
   122  				off--
   123  				return off, data[:off], nil
   124  			}
   125  			seenDecimalPoint = true
   126  			continue
   127  		}
   128  
   129  		// sign can be seen leading or after E or e
   130  		if ch == '+' || ch == '-' {
   131  			// sign can be seen once only, and can never be after "."
   132  			if seenSign || seenDecimalPoint {
   133  				off--
   134  				return off, data[:off], nil
   135  			}
   136  			seenSign = true
   137  			continue
   138  		}
   139  
   140  		// number must be leading with "." "+" "-" or "0-9"
   141  		if !seenDecimalNumber && digitVal(ch) > 10 {
   142  			msg := fmt.Sprintf("illegal character %#U leading escape sequence, expect \\", token)
   143  			return 0, nil, errors.New(msg)
   144  		}
   145  		seenDecimalNumber = true
   146  
   147  		// .989777
   148  		if seenDecimalPoint {
   149  			advance, token, err := handleSplitError(ScanMantissas(10)(data[off:], atEOF))
   150  			off = off + advance
   151  			if err != nil || len(token) == 0 {
   152  				return advance, token, err
   153  			}
   154  			// look for "E" or "e"
   155  			lookforExponent = true
   156  			break
   157  		}
   158  
   159  		// 0x12
   160  		if ch == '0' {
   161  			// int or float
   162  			advance, token, err := handleSplitError(ScanRunes(data[off:], atEOF))
   163  			off = off + advance
   164  			if err != nil {
   165  				return advance, token, err
   166  			}
   167  			if len(token) == 0 {
   168  				return off, data[:off], nil
   169  			}
   170  			ch = bytes.Runes(token)[0]
   171  
   172  			if ch == 'x' || ch == 'X' {
   173  				// hexadecimal int
   174  				advance, token, err := handleSplitError(ScanMantissas(16)(data[off:], atEOF))
   175  				off = off + advance
   176  				if err != nil || len(token) == 0 {
   177  					return advance, token, err
   178  				}
   179  				if len(token) <= 0 {
   180  					// only scanned "0x" or "0X"
   181  					return 0, nil, errors.New("illegal hexadecimal number")
   182  				}
   183  				return off, data[:off], nil
   184  			} else {
   185  				// octal int or float
   186  				seenDecimalDigit := false
   187  				advance, token, err := handleSplitError(ScanMantissas(8)(data[off:], atEOF))
   188  				off = off + advance
   189  				if err != nil {
   190  					return advance, token, err
   191  				}
   192  
   193  				// read new rune
   194  				advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   195  				off = off + advance
   196  				if err != nil {
   197  					return advance, token, err
   198  				}
   199  				if len(token) == 0 {
   200  					return off, data[:off], nil
   201  				}
   202  				ch = bytes.Runes(token)[0]
   203  
   204  				if ch == '8' || ch == '9' {
   205  					// illegal octal int or float
   206  					seenDecimalDigit = true
   207  					advance, token, err := handleSplitError(ScanMantissas(10)(data[off:], atEOF))
   208  					off = off + advance
   209  					if err != nil || len(token) == 0 {
   210  						return advance, token, err
   211  					}
   212  					advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   213  					off = off + advance
   214  					if err != nil || len(token) == 0 {
   215  						return advance, token, err
   216  					}
   217  					ch = bytes.Runes(token)[0]
   218  				}
   219  				if ch == '.' || ch == 'e' || ch == 'E' || ch == 'i' {
   220  					off-- //backward for fraction "." "e" "E" or "i"
   221  					lookforFraction = true
   222  					break
   223  				}
   224  				// octal int
   225  				if seenDecimalDigit {
   226  					return 0, nil, errors.New("illegal octal number")
   227  				}
   228  
   229  				off-- //backward for exit
   230  
   231  			}
   232  			return off, data[:off], nil
   233  		}
   234  
   235  		// decimal int or float
   236  		advance, token, err = handleSplitError(ScanMantissas(10)(data[off:], atEOF))
   237  		off = off + advance
   238  		if err != nil || len(token) == 0 {
   239  			return advance, token, err
   240  		}
   241  		lookforFraction = true
   242  		break
   243  	}
   244  
   245  	// read a rune
   246  	advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   247  	off = off + advance
   248  	if err != nil {
   249  		return advance, token, err
   250  	}
   251  	if len(token) == 0 {
   252  		return off, data[:off], nil
   253  	}
   254  	ch := bytes.Runes(token)[0]
   255  
   256  	if lookforFraction && ch == '.' {
   257  		advance, token, err := handleSplitError(ScanMantissas(10)(data[off:], atEOF))
   258  		off = off + advance
   259  		if err != nil {
   260  			return advance, token, err
   261  		}
   262  		if len(token) == 0 {
   263  			return off, data[:off], nil
   264  		}
   265  		lookforExponent = true
   266  
   267  		// read new rune
   268  		advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   269  		off = off + advance
   270  		if err != nil {
   271  			return advance, token, err
   272  		}
   273  		if len(token) == 0 {
   274  			return off, data[:off], nil
   275  		}
   276  		ch = bytes.Runes(token)[0]
   277  	}
   278  
   279  	if lookforExponent && (ch == 'e' || ch == 'E') {
   280  		advance, token, err := handleSplitError(ScanRunes(data[off:], atEOF))
   281  		off = off + advance
   282  		if err != nil {
   283  			return advance, token, err
   284  		}
   285  		if len(token) == 0 {
   286  			return off, data[:off], nil
   287  		}
   288  		ch = bytes.Runes(token)[0]
   289  
   290  		if ch == '-' || ch == '+' {
   291  			advance, token, err := handleSplitError(ScanRunes(data[off:], atEOF))
   292  			off = off + advance
   293  			if err != nil {
   294  				return advance, token, err
   295  			}
   296  			if len(token) == 0 {
   297  				return off, data[:off], nil
   298  			}
   299  			ch = bytes.Runes(token)[0]
   300  		}
   301  		if digitVal(ch) < 10 {
   302  			advance, token, err := handleSplitError(ScanMantissas(10)(data[off:], atEOF))
   303  			off = off + advance
   304  			if err != nil {
   305  				return advance, token, err
   306  			}
   307  			if len(token) == 0 {
   308  				return off, data[:off], nil
   309  			}
   310  
   311  			// read new rune
   312  			advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   313  			off = off + advance
   314  			if err != nil {
   315  				return advance, token, err
   316  			}
   317  			if len(token) == 0 {
   318  				return off, data[:off], nil
   319  			}
   320  		} else {
   321  			return 0, nil, errors.New("illegal floating-point exponent")
   322  		}
   323  	}
   324  
   325  	if ch != 'i' {
   326  		// backward
   327  		off = off - utf8.RuneLen(ch)
   328  	}
   329  	return off, data[:off], nil
   330  }
   331  
   332  // https://golang.org/ref/spec#Identifiers
   333  // ScanIdentifier is a split function wrapper for a Scanner that returns each string which is an identifier format of text.
   334  // The returned line may be empty.
   335  // identifier = letter { letter | unicode_digit } .
   336  func ScanIdentifier(data []byte, atEOF bool) (advance int, token []byte, err error) {
   337  	if atEOF && len(data) == 0 {
   338  		return needMoreData()
   339  	}
   340  	var off int
   341  
   342  	// First character 1: \.
   343  	advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   344  	off = off + advance
   345  	if err != nil || len(token) == 0 {
   346  		return advance, token, err
   347  	}
   348  	ch := bytes.Runes(token)[0]
   349  
   350  	if isLetter(ch) {
   351  		for isLetter(ch) || isDigit(ch) {
   352  			advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   353  			off = off + advance
   354  			if err != nil {
   355  				return advance, token, err
   356  			}
   357  			if token == nil {
   358  				return off, data[:off], nil
   359  			}
   360  			ch = bytes.Runes(token)[0]
   361  		}
   362  	}
   363  	off -= utf8.RuneLen(ch) // backward
   364  	return off, data[:off], nil
   365  }
   366  
   367  // ScanUntil is a split function wrapper for a Scanner that returns each string until filter case is meet.
   368  // The returned line may be empty.
   369  func ScanUntil(filter func(r rune) bool) func(data []byte, atEOF bool) (advance int, token []byte, err error) {
   370  	return ScanWhile(func(r rune) bool {
   371  		if filter == nil {
   372  			return false
   373  		}
   374  		return !filter(r)
   375  	})
   376  }
   377  
   378  // ScanUntil is a split function wrapper for a Scanner that returns each string until filter case is not meet.
   379  // The returned line may be empty.
   380  func ScanWhile(filter func(r rune) bool) func(data []byte, atEOF bool) (advance int, token []byte, err error) {
   381  	return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
   382  		if filter == nil || atEOF && len(data) == 0 {
   383  			return needMoreData()
   384  		}
   385  		var off int
   386  
   387  		// First character 1: \.
   388  		advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   389  		off = off + advance
   390  		if err != nil || len(token) == 0 {
   391  			return advance, token, err
   392  		}
   393  		ch := bytes.Runes(token)[0]
   394  
   395  		for filter(ch) {
   396  			advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   397  			off = off + advance
   398  			if err != nil {
   399  				return advance, token, err
   400  			}
   401  			if token == nil {
   402  				return off, data[:off], nil
   403  			}
   404  			ch = bytes.Runes(token)[0]
   405  		}
   406  		off -= utf8.RuneLen(ch) // backward
   407  
   408  		return off, data[:off], nil
   409  	}
   410  }
   411  
   412  // ScanRegexp is a split function wrapper for a Scanner that returns each string until regexp case is not meet.
   413  // The returned line may be empty.
   414  func ScanRegexp(regs ...*regexp.Regexp) func(data []byte, atEOF bool) (advance int, token []byte, err error) {
   415  	return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
   416  		if atEOF && len(data) == 0 {
   417  			return needMoreData()
   418  		}
   419  		var off int
   420  
   421  		// First character 1: \.
   422  		// regex mode
   423  		for _, reg := range regs {
   424  			if reg == nil {
   425  				continue
   426  			}
   427  
   428  			locs := reg.FindStringIndex(string(data[off:]))
   429  			if len(locs) == 0 {
   430  				continue
   431  			}
   432  			off = locs[1]
   433  			return off, data[locs[0]:off], nil
   434  		}
   435  
   436  		return off, data[:off], nil
   437  	}
   438  }
   439  
   440  // ScanRegexpPerl is a split function wrapper for a Scanner that returns each string until regexp case is not meet.
   441  // The returned line may be empty.
   442  // This so-called leftmost-first matching is the same semantics
   443  // that Perl, Python, and other implementations use, although this
   444  // package implements it without the expense of backtracking.
   445  // For POSIX leftmost-longest matching, see ScanRegexpPosix.
   446  func ScanRegexpPerl(expectStrs ...string) func(data []byte, atEOF bool) (advance int, token []byte, err error) {
   447  	var regs []*regexp.Regexp
   448  	for _, expect := range expectStrs {
   449  		expect = "^" + strings.TrimPrefix(expect, "^")
   450  
   451  		regs = append(regs, regexp.MustCompile(expect))
   452  	}
   453  	return ScanRegexp(regs...)
   454  }
   455  
   456  // ScanRegexpPosix is a split function wrapper for a Scanner that returns each string until regexp case is not meet.
   457  // The returned line may be empty.
   458  // ScanRegexpPosix is like ScanRegexpPerl but restricts the regular expression
   459  // to POSIX ERE (egrep) syntax and changes the match semantics to
   460  // leftmost-longest.
   461  func ScanRegexpPosix(expectStrs ...string) func(data []byte, atEOF bool) (advance int, token []byte, err error) {
   462  	var regs []*regexp.Regexp
   463  	for _, expect := range expectStrs {
   464  		expect = "^" + strings.TrimPrefix(expect, "^")
   465  
   466  		regs = append(regs, regexp.MustCompilePOSIX(expect))
   467  	}
   468  	return ScanRegexp(regs...)
   469  }
   470  
   471  // https://golang.org/ref/spec#String_literals
   472  // string_lit             = raw_string_lit | interpreted_string_lit .
   473  // raw_string_lit         = "`" { unicode_char | newline } "`" .
   474  // interpreted_string_lit = `"` { unicode_value | byte_value } `"` .
   475  func scanStrings(data []byte, atEOF bool, quote rune) (advance int, token []byte, err error) {
   476  	if atEOF && len(data) == 0 {
   477  		return needMoreData()
   478  	}
   479  	var off int
   480  
   481  	// First character 1: ".
   482  	advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   483  	off = off + advance
   484  	if err != nil || len(token) == 0 {
   485  		return advance, token, err
   486  	}
   487  	if !bytes.ContainsRune(token, quote) {
   488  		msg := fmt.Sprintf("illegal character %#U leading escape sequence, expect \\", token)
   489  		return 0, nil, errors.New(msg)
   490  	}
   491  
   492  	var allowEscape bool
   493  	if quote == '"' {
   494  		allowEscape = true
   495  	}
   496  	// '"' opening already consumed
   497  	for _, ch := range data[off:] {
   498  		off++
   499  		if ch == '\n' || ch < 0 {
   500  			return 0, nil, errors.New("string literal not terminated")
   501  		}
   502  
   503  		if rune(ch) == quote {
   504  			break
   505  		}
   506  
   507  		if allowEscape && ch == '\\' {
   508  			// backward
   509  			off--
   510  			advance, token, err = handleSplitError(ScanEscapes(quote)(data[off:], atEOF))
   511  			off = off + advance
   512  			if err != nil || len(token) == 0 {
   513  				return advance, token, err
   514  			}
   515  
   516  		}
   517  	}
   518  	return off, data[:off], nil
   519  }
   520  
   521  func scanEscapes(data []byte, atEOF bool, quote rune) (advance int, token []byte, err error) {
   522  	if atEOF && len(data) == 0 {
   523  		return needMoreData()
   524  	}
   525  	var off int
   526  
   527  	// First character 1: \.
   528  	advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   529  	off = off + advance
   530  	if err != nil || len(token) == 0 {
   531  		return advance, token, err
   532  	}
   533  
   534  	if !bytes.ContainsRune(token, '\\') {
   535  		msg := fmt.Sprintf("illegal character %#U leading escape sequence, expect \\", token)
   536  		return 0, nil, errors.New(msg)
   537  	}
   538  
   539  	// Second character 2: char.
   540  	advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   541  	off = off + advance
   542  	if err != nil || len(token) == 0 {
   543  		return advance, token, err
   544  	}
   545  
   546  	ch := bytes.Runes(token)[0]
   547  
   548  	var n int
   549  	var base, max uint32
   550  	switch ch {
   551  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   552  		return off, data[0:off], nil
   553  	case '0', '1', '2', '3', '4', '5', '6', '7':
   554  		n, base, max = 3, 8, 255
   555  	case 'x':
   556  		n, base, max = 2, 16, 255
   557  	case 'u':
   558  		n, base, max = 4, 16, unicode.MaxRune
   559  	case 'U':
   560  		n, base, max = 8, 16, unicode.MaxRune
   561  	default:
   562  		msg := "unknown escape sequence"
   563  		if ch < 0 {
   564  			msg = "escape sequence not terminated"
   565  		}
   566  		return 0, nil, errors.New(msg)
   567  	}
   568  
   569  	switch ch {
   570  	case 'x', 'u', 'U':
   571  		advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   572  		off = off + advance
   573  		if err != nil || len(token) == 0 {
   574  			return advance, token, err
   575  		}
   576  
   577  		ch = bytes.Runes(token)[0]
   578  	}
   579  
   580  	var x uint32
   581  	for n > 0 {
   582  		d := uint32(digitVal(ch))
   583  		if d >= base {
   584  			msg := fmt.Sprintf("illegal character %#U in escape sequence", ch)
   585  			if ch < 0 {
   586  				msg = "escape sequence not terminated"
   587  			}
   588  			return 0, nil, errors.New(msg)
   589  		}
   590  		x = x*base + d
   591  
   592  		advance, token, err = handleSplitError(ScanRunes(data[off:], atEOF))
   593  		off = off + advance
   594  		if err != nil || len(token) == 0 {
   595  			return advance, token, err
   596  		}
   597  		ch = bytes.Runes(token)[0]
   598  
   599  		n--
   600  	}
   601  
   602  	if x > max || 0xD800 <= x && x < 0xE000 {
   603  		return 0, nil, errors.New("escape sequence is invalid Unicode code point")
   604  	}
   605  	return off, data[:off], nil
   606  }
   607  
   608  func isLetter(ch rune) bool {
   609  	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
   610  }
   611  
   612  func isDigit(ch rune) bool {
   613  	return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
   614  }
   615  
   616  func needMoreData() (advance int, token []byte, err error) {
   617  	return 0, nil, nil
   618  }
   619  
   620  func handleSplitError(advance int, token []byte, err error) (int, []byte, error) {
   621  	if err != nil {
   622  		if err == bufio.ErrFinalToken {
   623  			return 0, nil, nil
   624  		}
   625  		return 0, nil, err
   626  	}
   627  
   628  	if len(token) == 0 {
   629  		// needMoreData
   630  		return 0, nil, nil
   631  	}
   632  
   633  	return advance, token, nil
   634  }