github.com/whiteCcinn/protobuf-go@v1.0.9/internal/encoding/text/decode.go (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package text
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"io"
    11  	"strconv"
    12  	"unicode/utf8"
    13  
    14  	"github.com/whiteCcinn/protobuf-go/internal/errors"
    15  )
    16  
    17  // Decoder is a token-based textproto decoder.
    18  type Decoder struct {
    19  	// lastCall is last method called, either readCall or peekCall.
    20  	// Initial value is readCall.
    21  	lastCall call
    22  
    23  	// lastToken contains the last read token.
    24  	lastToken Token
    25  
    26  	// lastErr contains the last read error.
    27  	lastErr error
    28  
    29  	// openStack is a stack containing the byte characters for MessageOpen and
    30  	// ListOpen kinds. The top of stack represents the message or the list that
    31  	// the current token is nested in. An empty stack means the current token is
    32  	// at the top level message. The characters '{' and '<' both represent the
    33  	// MessageOpen kind.
    34  	openStack []byte
    35  
    36  	// orig is used in reporting line and column.
    37  	orig []byte
    38  	// in contains the unconsumed input.
    39  	in []byte
    40  }
    41  
    42  // NewDecoder returns a Decoder to read the given []byte.
    43  func NewDecoder(b []byte) *Decoder {
    44  	return &Decoder{orig: b, in: b}
    45  }
    46  
    47  // ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
    48  var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)
    49  
    50  // call specifies which Decoder method was invoked.
    51  type call uint8
    52  
    53  const (
    54  	readCall call = iota
    55  	peekCall
    56  )
    57  
    58  // Peek looks ahead and returns the next token and error without advancing a read.
    59  func (d *Decoder) Peek() (Token, error) {
    60  	defer func() { d.lastCall = peekCall }()
    61  	if d.lastCall == readCall {
    62  		d.lastToken, d.lastErr = d.Read()
    63  	}
    64  	return d.lastToken, d.lastErr
    65  }
    66  
    67  // Read returns the next token.
    68  // It will return an error if there is no valid token.
    69  func (d *Decoder) Read() (Token, error) {
    70  	defer func() { d.lastCall = readCall }()
    71  	if d.lastCall == peekCall {
    72  		return d.lastToken, d.lastErr
    73  	}
    74  
    75  	tok, err := d.parseNext(d.lastToken.kind)
    76  	if err != nil {
    77  		return Token{}, err
    78  	}
    79  
    80  	switch tok.kind {
    81  	case comma, semicolon:
    82  		tok, err = d.parseNext(tok.kind)
    83  		if err != nil {
    84  			return Token{}, err
    85  		}
    86  	}
    87  	d.lastToken = tok
    88  	return tok, nil
    89  }
    90  
    91  const (
    92  	mismatchedFmt = "mismatched close character %q"
    93  	unexpectedFmt = "unexpected character %q"
    94  )
    95  
    96  // parseNext parses the next Token based on given last kind.
    97  func (d *Decoder) parseNext(lastKind Kind) (Token, error) {
    98  	// Trim leading spaces.
    99  	d.consume(0)
   100  	isEOF := false
   101  	if len(d.in) == 0 {
   102  		isEOF = true
   103  	}
   104  
   105  	switch lastKind {
   106  	case EOF:
   107  		return d.consumeToken(EOF, 0, 0), nil
   108  
   109  	case bof:
   110  		// Start of top level message. Next token can be EOF or Name.
   111  		if isEOF {
   112  			return d.consumeToken(EOF, 0, 0), nil
   113  		}
   114  		return d.parseFieldName()
   115  
   116  	case Name:
   117  		// Next token can be MessageOpen, ListOpen or Scalar.
   118  		if isEOF {
   119  			return Token{}, ErrUnexpectedEOF
   120  		}
   121  		switch ch := d.in[0]; ch {
   122  		case '{', '<':
   123  			d.pushOpenStack(ch)
   124  			return d.consumeToken(MessageOpen, 1, 0), nil
   125  		case '[':
   126  			d.pushOpenStack(ch)
   127  			return d.consumeToken(ListOpen, 1, 0), nil
   128  		default:
   129  			return d.parseScalar()
   130  		}
   131  
   132  	case Scalar:
   133  		openKind, closeCh := d.currentOpenKind()
   134  		switch openKind {
   135  		case bof:
   136  			// Top level message.
   137  			// 	Next token can be EOF, comma, semicolon or Name.
   138  			if isEOF {
   139  				return d.consumeToken(EOF, 0, 0), nil
   140  			}
   141  			switch d.in[0] {
   142  			case ',':
   143  				return d.consumeToken(comma, 1, 0), nil
   144  			case ';':
   145  				return d.consumeToken(semicolon, 1, 0), nil
   146  			default:
   147  				return d.parseFieldName()
   148  			}
   149  
   150  		case MessageOpen:
   151  			// Next token can be MessageClose, comma, semicolon or Name.
   152  			if isEOF {
   153  				return Token{}, ErrUnexpectedEOF
   154  			}
   155  			switch ch := d.in[0]; ch {
   156  			case closeCh:
   157  				d.popOpenStack()
   158  				return d.consumeToken(MessageClose, 1, 0), nil
   159  			case otherCloseChar[closeCh]:
   160  				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
   161  			case ',':
   162  				return d.consumeToken(comma, 1, 0), nil
   163  			case ';':
   164  				return d.consumeToken(semicolon, 1, 0), nil
   165  			default:
   166  				return d.parseFieldName()
   167  			}
   168  
   169  		case ListOpen:
   170  			// Next token can be ListClose or comma.
   171  			if isEOF {
   172  				return Token{}, ErrUnexpectedEOF
   173  			}
   174  			switch ch := d.in[0]; ch {
   175  			case ']':
   176  				d.popOpenStack()
   177  				return d.consumeToken(ListClose, 1, 0), nil
   178  			case ',':
   179  				return d.consumeToken(comma, 1, 0), nil
   180  			default:
   181  				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
   182  			}
   183  		}
   184  
   185  	case MessageOpen:
   186  		// Next token can be MessageClose or Name.
   187  		if isEOF {
   188  			return Token{}, ErrUnexpectedEOF
   189  		}
   190  		_, closeCh := d.currentOpenKind()
   191  		switch ch := d.in[0]; ch {
   192  		case closeCh:
   193  			d.popOpenStack()
   194  			return d.consumeToken(MessageClose, 1, 0), nil
   195  		case otherCloseChar[closeCh]:
   196  			return Token{}, d.newSyntaxError(mismatchedFmt, ch)
   197  		default:
   198  			return d.parseFieldName()
   199  		}
   200  
   201  	case MessageClose:
   202  		openKind, closeCh := d.currentOpenKind()
   203  		switch openKind {
   204  		case bof:
   205  			// Top level message.
   206  			// Next token can be EOF, comma, semicolon or Name.
   207  			if isEOF {
   208  				return d.consumeToken(EOF, 0, 0), nil
   209  			}
   210  			switch ch := d.in[0]; ch {
   211  			case ',':
   212  				return d.consumeToken(comma, 1, 0), nil
   213  			case ';':
   214  				return d.consumeToken(semicolon, 1, 0), nil
   215  			default:
   216  				return d.parseFieldName()
   217  			}
   218  
   219  		case MessageOpen:
   220  			// Next token can be MessageClose, comma, semicolon or Name.
   221  			if isEOF {
   222  				return Token{}, ErrUnexpectedEOF
   223  			}
   224  			switch ch := d.in[0]; ch {
   225  			case closeCh:
   226  				d.popOpenStack()
   227  				return d.consumeToken(MessageClose, 1, 0), nil
   228  			case otherCloseChar[closeCh]:
   229  				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
   230  			case ',':
   231  				return d.consumeToken(comma, 1, 0), nil
   232  			case ';':
   233  				return d.consumeToken(semicolon, 1, 0), nil
   234  			default:
   235  				return d.parseFieldName()
   236  			}
   237  
   238  		case ListOpen:
   239  			// Next token can be ListClose or comma
   240  			if isEOF {
   241  				return Token{}, ErrUnexpectedEOF
   242  			}
   243  			switch ch := d.in[0]; ch {
   244  			case closeCh:
   245  				d.popOpenStack()
   246  				return d.consumeToken(ListClose, 1, 0), nil
   247  			case ',':
   248  				return d.consumeToken(comma, 1, 0), nil
   249  			default:
   250  				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
   251  			}
   252  		}
   253  
   254  	case ListOpen:
   255  		// Next token can be ListClose, MessageStart or Scalar.
   256  		if isEOF {
   257  			return Token{}, ErrUnexpectedEOF
   258  		}
   259  		switch ch := d.in[0]; ch {
   260  		case ']':
   261  			d.popOpenStack()
   262  			return d.consumeToken(ListClose, 1, 0), nil
   263  		case '{', '<':
   264  			d.pushOpenStack(ch)
   265  			return d.consumeToken(MessageOpen, 1, 0), nil
   266  		default:
   267  			return d.parseScalar()
   268  		}
   269  
   270  	case ListClose:
   271  		openKind, closeCh := d.currentOpenKind()
   272  		switch openKind {
   273  		case bof:
   274  			// Top level message.
   275  			// Next token can be EOF, comma, semicolon or Name.
   276  			if isEOF {
   277  				return d.consumeToken(EOF, 0, 0), nil
   278  			}
   279  			switch ch := d.in[0]; ch {
   280  			case ',':
   281  				return d.consumeToken(comma, 1, 0), nil
   282  			case ';':
   283  				return d.consumeToken(semicolon, 1, 0), nil
   284  			default:
   285  				return d.parseFieldName()
   286  			}
   287  
   288  		case MessageOpen:
   289  			// Next token can be MessageClose, comma, semicolon or Name.
   290  			if isEOF {
   291  				return Token{}, ErrUnexpectedEOF
   292  			}
   293  			switch ch := d.in[0]; ch {
   294  			case closeCh:
   295  				d.popOpenStack()
   296  				return d.consumeToken(MessageClose, 1, 0), nil
   297  			case otherCloseChar[closeCh]:
   298  				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
   299  			case ',':
   300  				return d.consumeToken(comma, 1, 0), nil
   301  			case ';':
   302  				return d.consumeToken(semicolon, 1, 0), nil
   303  			default:
   304  				return d.parseFieldName()
   305  			}
   306  
   307  		default:
   308  			// It is not possible to have this case. Let it panic below.
   309  		}
   310  
   311  	case comma, semicolon:
   312  		openKind, closeCh := d.currentOpenKind()
   313  		switch openKind {
   314  		case bof:
   315  			// Top level message. Next token can be EOF or Name.
   316  			if isEOF {
   317  				return d.consumeToken(EOF, 0, 0), nil
   318  			}
   319  			return d.parseFieldName()
   320  
   321  		case MessageOpen:
   322  			// Next token can be MessageClose or Name.
   323  			if isEOF {
   324  				return Token{}, ErrUnexpectedEOF
   325  			}
   326  			switch ch := d.in[0]; ch {
   327  			case closeCh:
   328  				d.popOpenStack()
   329  				return d.consumeToken(MessageClose, 1, 0), nil
   330  			case otherCloseChar[closeCh]:
   331  				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
   332  			default:
   333  				return d.parseFieldName()
   334  			}
   335  
   336  		case ListOpen:
   337  			if lastKind == semicolon {
   338  				// It is not be possible to have this case as logic here
   339  				// should not have produced a semicolon Token when inside a
   340  				// list. Let it panic below.
   341  				break
   342  			}
   343  			// Next token can be MessageOpen or Scalar.
   344  			if isEOF {
   345  				return Token{}, ErrUnexpectedEOF
   346  			}
   347  			switch ch := d.in[0]; ch {
   348  			case '{', '<':
   349  				d.pushOpenStack(ch)
   350  				return d.consumeToken(MessageOpen, 1, 0), nil
   351  			default:
   352  				return d.parseScalar()
   353  			}
   354  		}
   355  	}
   356  
   357  	line, column := d.Position(len(d.orig) - len(d.in))
   358  	panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind))
   359  }
   360  
   361  var otherCloseChar = map[byte]byte{
   362  	'}': '>',
   363  	'>': '}',
   364  }
   365  
   366  // currentOpenKind indicates whether current position is inside a message, list
   367  // or top-level message by returning MessageOpen, ListOpen or bof respectively.
   368  // If the returned kind is either a MessageOpen or ListOpen, it also returns the
   369  // corresponding closing character.
   370  func (d *Decoder) currentOpenKind() (Kind, byte) {
   371  	if len(d.openStack) == 0 {
   372  		return bof, 0
   373  	}
   374  	openCh := d.openStack[len(d.openStack)-1]
   375  	switch openCh {
   376  	case '{':
   377  		return MessageOpen, '}'
   378  	case '<':
   379  		return MessageOpen, '>'
   380  	case '[':
   381  		return ListOpen, ']'
   382  	}
   383  	panic(fmt.Sprintf("Decoder: openStack contains invalid byte %c", openCh))
   384  }
   385  
   386  func (d *Decoder) pushOpenStack(ch byte) {
   387  	d.openStack = append(d.openStack, ch)
   388  }
   389  
   390  func (d *Decoder) popOpenStack() {
   391  	d.openStack = d.openStack[:len(d.openStack)-1]
   392  }
   393  
   394  // parseFieldName parses field name and separator.
   395  func (d *Decoder) parseFieldName() (tok Token, err error) {
   396  	defer func() {
   397  		if err == nil && d.tryConsumeChar(':') {
   398  			tok.attrs |= hasSeparator
   399  		}
   400  	}()
   401  
   402  	// Extension or Any type URL.
   403  	if d.in[0] == '[' {
   404  		return d.parseTypeName()
   405  	}
   406  
   407  	// Identifier.
   408  	if size := parseIdent(d.in, false); size > 0 {
   409  		return d.consumeToken(Name, size, uint8(IdentName)), nil
   410  	}
   411  
   412  	// Field number. Identify if input is a valid number that is not negative
   413  	// and is decimal integer within 32-bit range.
   414  	if num := parseNumber(d.in); num.size > 0 {
   415  		if !num.neg && num.kind == numDec {
   416  			if _, err := strconv.ParseInt(string(d.in[:num.size]), 10, 32); err == nil {
   417  				return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil
   418  			}
   419  		}
   420  		return Token{}, d.newSyntaxError("invalid field number: %s", d.in[:num.size])
   421  	}
   422  
   423  	return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in))
   424  }
   425  
   426  // parseTypeName parses Any type URL or extension field name. The name is
   427  // enclosed in [ and ] characters. The C++ parser does not handle many legal URL
   428  // strings. This implementation is more liberal and allows for the pattern
   429  // ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`). Whitespaces and comments are allowed
   430  // in between [ ], '.', '/' and the sub names.
   431  func (d *Decoder) parseTypeName() (Token, error) {
   432  	startPos := len(d.orig) - len(d.in)
   433  	// Use alias s to advance first in order to use d.in for error handling.
   434  	// Caller already checks for [ as first character.
   435  	s := consume(d.in[1:], 0)
   436  	if len(s) == 0 {
   437  		return Token{}, ErrUnexpectedEOF
   438  	}
   439  
   440  	var name []byte
   441  	for len(s) > 0 && isTypeNameChar(s[0]) {
   442  		name = append(name, s[0])
   443  		s = s[1:]
   444  	}
   445  	s = consume(s, 0)
   446  
   447  	var closed bool
   448  	for len(s) > 0 && !closed {
   449  		switch {
   450  		case s[0] == ']':
   451  			s = s[1:]
   452  			closed = true
   453  
   454  		case s[0] == '/', s[0] == '.':
   455  			if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') {
   456  				return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
   457  					d.orig[startPos:len(d.orig)-len(s)+1])
   458  			}
   459  			name = append(name, s[0])
   460  			s = s[1:]
   461  			s = consume(s, 0)
   462  			for len(s) > 0 && isTypeNameChar(s[0]) {
   463  				name = append(name, s[0])
   464  				s = s[1:]
   465  			}
   466  			s = consume(s, 0)
   467  
   468  		default:
   469  			return Token{}, d.newSyntaxError(
   470  				"invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1])
   471  		}
   472  	}
   473  
   474  	if !closed {
   475  		return Token{}, ErrUnexpectedEOF
   476  	}
   477  
   478  	// First character cannot be '.'. Last character cannot be '.' or '/'.
   479  	size := len(name)
   480  	if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' {
   481  		return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
   482  			d.orig[startPos:len(d.orig)-len(s)])
   483  	}
   484  
   485  	d.in = s
   486  	endPos := len(d.orig) - len(d.in)
   487  	d.consume(0)
   488  
   489  	return Token{
   490  		kind:  Name,
   491  		attrs: uint8(TypeName),
   492  		pos:   startPos,
   493  		raw:   d.orig[startPos:endPos],
   494  		str:   string(name),
   495  	}, nil
   496  }
   497  
   498  func isTypeNameChar(b byte) bool {
   499  	return (b == '-' || b == '_' ||
   500  		('0' <= b && b <= '9') ||
   501  		('a' <= b && b <= 'z') ||
   502  		('A' <= b && b <= 'Z'))
   503  }
   504  
   505  func isWhiteSpace(b byte) bool {
   506  	switch b {
   507  	case ' ', '\n', '\r', '\t':
   508  		return true
   509  	default:
   510  		return false
   511  	}
   512  }
   513  
   514  // parseIdent parses an unquoted proto identifier and returns size.
   515  // If allowNeg is true, it allows '-' to be the first character in the
   516  // identifier. This is used when parsing literal values like -infinity, etc.
   517  // Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
   518  func parseIdent(input []byte, allowNeg bool) int {
   519  	var size int
   520  
   521  	s := input
   522  	if len(s) == 0 {
   523  		return 0
   524  	}
   525  
   526  	if allowNeg && s[0] == '-' {
   527  		s = s[1:]
   528  		size++
   529  		if len(s) == 0 {
   530  			return 0
   531  		}
   532  	}
   533  
   534  	switch {
   535  	case s[0] == '_',
   536  		'a' <= s[0] && s[0] <= 'z',
   537  		'A' <= s[0] && s[0] <= 'Z':
   538  		s = s[1:]
   539  		size++
   540  	default:
   541  		return 0
   542  	}
   543  
   544  	for len(s) > 0 && (s[0] == '_' ||
   545  		'a' <= s[0] && s[0] <= 'z' ||
   546  		'A' <= s[0] && s[0] <= 'Z' ||
   547  		'0' <= s[0] && s[0] <= '9') {
   548  		s = s[1:]
   549  		size++
   550  	}
   551  
   552  	if len(s) > 0 && !isDelim(s[0]) {
   553  		return 0
   554  	}
   555  
   556  	return size
   557  }
   558  
   559  // parseScalar parses for a string, literal or number value.
   560  func (d *Decoder) parseScalar() (Token, error) {
   561  	if d.in[0] == '"' || d.in[0] == '\'' {
   562  		return d.parseStringValue()
   563  	}
   564  
   565  	if tok, ok := d.parseLiteralValue(); ok {
   566  		return tok, nil
   567  	}
   568  
   569  	if tok, ok := d.parseNumberValue(); ok {
   570  		return tok, nil
   571  	}
   572  
   573  	return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in))
   574  }
   575  
   576  // parseLiteralValue parses a literal value. A literal value is used for
   577  // bools, special floats and enums. This function simply identifies that the
   578  // field value is a literal.
   579  func (d *Decoder) parseLiteralValue() (Token, bool) {
   580  	size := parseIdent(d.in, true)
   581  	if size == 0 {
   582  		return Token{}, false
   583  	}
   584  	return d.consumeToken(Scalar, size, literalValue), true
   585  }
   586  
   587  // consumeToken constructs a Token for given Kind from d.in and consumes given
   588  // size-length from it.
   589  func (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token {
   590  	// Important to compute raw and pos before consuming.
   591  	tok := Token{
   592  		kind:  kind,
   593  		attrs: attrs,
   594  		pos:   len(d.orig) - len(d.in),
   595  		raw:   d.in[:size],
   596  	}
   597  	d.consume(size)
   598  	return tok
   599  }
   600  
   601  // newSyntaxError returns a syntax error with line and column information for
   602  // current position.
   603  func (d *Decoder) newSyntaxError(f string, x ...interface{}) error {
   604  	e := errors.New(f, x...)
   605  	line, column := d.Position(len(d.orig) - len(d.in))
   606  	return errors.New("syntax error (line %d:%d): %v", line, column, e)
   607  }
   608  
   609  // Position returns line and column number of given index of the original input.
   610  // It will panic if index is out of range.
   611  func (d *Decoder) Position(idx int) (line int, column int) {
   612  	b := d.orig[:idx]
   613  	line = bytes.Count(b, []byte("\n")) + 1
   614  	if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
   615  		b = b[i+1:]
   616  	}
   617  	column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
   618  	return line, column
   619  }
   620  
   621  func (d *Decoder) tryConsumeChar(c byte) bool {
   622  	if len(d.in) > 0 && d.in[0] == c {
   623  		d.consume(1)
   624  		return true
   625  	}
   626  	return false
   627  }
   628  
   629  // consume consumes n bytes of input and any subsequent whitespace or comments.
   630  func (d *Decoder) consume(n int) {
   631  	d.in = consume(d.in, n)
   632  	return
   633  }
   634  
   635  // consume consumes n bytes of input and any subsequent whitespace or comments.
   636  func consume(b []byte, n int) []byte {
   637  	b = b[n:]
   638  	for len(b) > 0 {
   639  		switch b[0] {
   640  		case ' ', '\n', '\r', '\t':
   641  			b = b[1:]
   642  		case '#':
   643  			if i := bytes.IndexByte(b, '\n'); i >= 0 {
   644  				b = b[i+len("\n"):]
   645  			} else {
   646  				b = nil
   647  			}
   648  		default:
   649  			return b
   650  		}
   651  	}
   652  	return b
   653  }
   654  
   655  // errId extracts a byte sequence that looks like an invalid ID
   656  // (for the purposes of error reporting).
   657  func errId(seq []byte) []byte {
   658  	const maxLen = 32
   659  	for i := 0; i < len(seq); {
   660  		if i > maxLen {
   661  			return append(seq[:i:i], "…"...)
   662  		}
   663  		r, size := utf8.DecodeRune(seq[i:])
   664  		if r > utf8.RuneSelf || (r != '/' && isDelim(byte(r))) {
   665  			if i == 0 {
   666  				// Either the first byte is invalid UTF-8 or a
   667  				// delimiter, or the first rune is non-ASCII.
   668  				// Return it as-is.
   669  				i = size
   670  			}
   671  			return seq[:i:i]
   672  		}
   673  		i += size
   674  	}
   675  	// No delimiter found.
   676  	return seq
   677  }
   678  
   679  // isDelim returns true if given byte is a delimiter character.
   680  func isDelim(c byte) bool {
   681  	return !(c == '-' || c == '+' || c == '.' || c == '_' ||
   682  		('a' <= c && c <= 'z') ||
   683  		('A' <= c && c <= 'Z') ||
   684  		('0' <= c && c <= '9'))
   685  }