github.com/hashicorp/hcl/v2@v2.20.0/json/scanner.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package json

import (
	"fmt"

	"github.com/apparentlymart/go-textseg/v15/textseg"
	"github.com/hashicorp/hcl/v2"
)

//go:generate go run golang.org/x/tools/cmd/stringer -type tokenType scanner.go
type tokenType rune

const (
	tokenBraceO  tokenType = '{'
	tokenBraceC  tokenType = '}'
	tokenBrackO  tokenType = '['
	tokenBrackC  tokenType = ']'
	tokenComma   tokenType = ','
	tokenColon   tokenType = ':'
	tokenKeyword tokenType = 'K'
	tokenString  tokenType = 'S'
	tokenNumber  tokenType = 'N'
	tokenEOF     tokenType = '␄'
	tokenInvalid tokenType = 0
	tokenEquals  tokenType = '=' // used only for reminding the user of JSON syntax
)

type token struct {
	Type  tokenType
	Bytes []byte
	Range hcl.Range
}

// scan returns the primary tokens for the given JSON buffer in sequence.
//
// The responsibility of this pass is just to mark the slices of the buffer
// as being of various types. It is lax in how it interprets the multi-byte
// token types keyword, string and number, preferring to capture erroneous
// extra bytes that we presume the user intended to be part of the token
// so that we can generate more helpful diagnostics in the parser.
func scan(buf []byte, start pos) []token {
	var tokens []token
	p := start
	for {
		if len(buf) == 0 {
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}

		buf, p = skipWhitespace(buf, p)

		if len(buf) == 0 {
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}

		start = p

		first := buf[0]
		switch {
		case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':' || first == '=':
			p.Pos.Column++
			p.Pos.Byte++
			tokens = append(tokens, token{
				Type:  tokenType(first),
				Bytes: buf[0:1],
				Range: posRange(start, p),
			})
			buf = buf[1:]
		case first == '"':
			var tokBuf []byte
			tokBuf, buf, p = scanString(buf, p)
			tokens = append(tokens, token{
				Type:  tokenString,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartNumber(first):
			var tokBuf []byte
			tokBuf, buf, p = scanNumber(buf, p)
			tokens = append(tokens, token{
				Type:  tokenNumber,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartKeyword(first):
			var tokBuf []byte
			tokBuf, buf, p = scanKeyword(buf, p)
			tokens = append(tokens, token{
				Type:  tokenKeyword,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		default:
			tokens = append(tokens, token{
				Type:  tokenInvalid,
				Bytes: buf[:1],
				Range: start.Range(1, 1),
			})
			// If we've encountered an invalid token then we might as well
			// stop scanning since the parser won't proceed beyond this point.
			// We insert a synthetic EOF marker here to match the expectations
			// of consumers of this data structure.
			p.Pos.Column++
			p.Pos.Byte++
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}
	}
}
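
// A minimal sketch of calling scan from within this package: scanning
// `{"a": 1}` yields tokenBraceO, tokenString, tokenColon, tokenNumber,
// tokenBraceC, and then a synthetic tokenEOF. Printing Type with %s assumes
// the stringer-generated String method for tokenType is present.
//
//	toks := scan([]byte(`{"a": 1}`), pos{
//		Filename: "example.json",
//		Pos:      hcl.Pos{Line: 1, Column: 1, Byte: 0},
//	})
//	for _, tok := range toks {
//		fmt.Printf("%s %q\n", tok.Type, tok.Bytes)
//	}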

func byteCanStartNumber(b byte) bool {
	switch b {
	// We are slightly more tolerant than JSON requires here since we
	// expect the parser will make a stricter interpretation of the
	// number bytes, but we specifically don't allow 'e' or 'E' here
	// since we want the scanner to treat that as the start of an
	// invalid keyword instead, to produce more intelligible error messages.
	case '-', '+', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		return true
	default:
		return false
	}
}
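
// For example, under this rule `e10` is scanned as a keyword token rather
// than a number, while a leading '+' or '.' is tolerated here and left for
// the parser to validate:
//
//	byteCanStartNumber('e') // false: treated as the start of a keyword
//	byteCanStartNumber('+') // true: captured now, rejected (if need be) later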

func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
	// The scanner doesn't check that the sequence of digit-ish bytes is
	// in a valid order. The parser must do this when decoding a number
	// token.
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		switch buf[i] {
		case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			p.Pos.Byte++
			p.Pos.Column++
		default:
			break Byte
		}
	}
	return buf[:i], buf[i:], p
}
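
// A sketch of the laxness described above, with p standing in for some
// starting position: out-of-order digit-ish bytes are still captured as a
// single number token so the parser can report on the whole thing.
//
//	tok, rest, _ := scanNumber([]byte("1.2.3,"), p)
//	// tok is []byte("1.2.3") and rest is []byte(","); rejecting "1.2.3" as
//	// an invalid JSON number is the parser's job, not the scanner's.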

func byteCanStartKeyword(b byte) bool {
	switch {
	// We allow any sequence of alphabetical characters here, even though
	// JSON is more constrained, so that we can collect what we presume
	// the user intended to be a single keyword and then check its validity
	// in the parser, where we can generate better diagnostics.
	// So e.g. we want to be able to say:
	//   unrecognized keyword "True". Did you mean "true"?
	case isAlphabetical(b):
		return true
	default:
		return false
	}
}

func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		b := buf[i]
		switch {
		case isAlphabetical(b) || b == '_':
			p.Pos.Byte++
			p.Pos.Column++
		default:
			break Byte
		}
	}
	return buf[:i], buf[i:], p
}
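
// A sketch of how this tolerance feeds the diagnostics mentioned above
// byteCanStartKeyword, with p standing in for some starting position:
//
//	tok, rest, _ := scanKeyword([]byte("True,"), p)
//	// tok is []byte("True") and rest is []byte(","), so the parser can say:
//	//   unrecognized keyword "True". Did you mean "true"?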

func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
	// The scanner doesn't validate correct use of escapes, etc. It pays
	// attention to escapes only for the purpose of identifying the closing
	// quote character. It's the parser's responsibility to do proper
	// validation.
	//
	// The scanner also doesn't specifically detect unterminated string
	// literals, though they can be identified in the parser by checking if
	// the final byte in a string token is the double-quote character.

	// Skip the opening quote symbol
	i := 1
	p := start
	p.Pos.Byte++
	p.Pos.Column++
	escaping := false
Byte:
	for i < len(buf) {
		b := buf[i]

		switch {
		case b == '\\':
			escaping = !escaping
			p.Pos.Byte++
			p.Pos.Column++
			i++
		case b == '"':
			p.Pos.Byte++
			p.Pos.Column++
			i++
			if !escaping {
				break Byte
			}
			escaping = false
		case b < 32:
			break Byte
		default:
			// Advance by one grapheme cluster, so that we consider each
			// grapheme to be a "column".
			// Ignoring error because this scanner cannot produce errors.
			advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)

			p.Pos.Byte += advance
			p.Pos.Column++
			i += advance

			escaping = false
		}
	}
	return buf[:i], buf[i:], p
}
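
// A sketch of the escape handling described above, with p standing in for
// some starting position: the escaping flag only decides whether a '"'
// terminates the token, so an escaped quote stays inside it.
//
//	tok, rest, _ := scanString([]byte(`"a\"b" 1`), p)
//	// tok is []byte(`"a\"b"`) and rest is []byte(" 1"). An unterminated
//	// input such as `"abc` consumes the rest of the buffer, and the parser
//	// notices because the token's final byte is not '"'.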

func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		switch buf[i] {
		case ' ':
			p.Pos.Byte++
			p.Pos.Column++
		case '\n':
			p.Pos.Byte++
			p.Pos.Column = 1
			p.Pos.Line++
		case '\r':
			// For the purpose of line/column counting we consider a
			// carriage return to take up no space, assuming that it will
			// be paired up with a newline (on Windows, for example) that
			// will account for both of them.
			p.Pos.Byte++
		case '\t':
			// We arbitrarily count a tab as if it were two spaces, because
			// we need to choose _some_ number here. This means any system
			// that renders code on-screen with markers must itself treat
			// tabs as a pair of spaces for rendering purposes, or instead
			// use the byte offset and back into its own column position.
			p.Pos.Byte++
			p.Pos.Column += 2
		default:
			break Byte
		}
	}
	return buf[i:], p
}
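
// A worked example of the counting rules above, starting from a position p
// at line 1, column 1, byte 0:
//
//	rest, q := skipWhitespace([]byte("\t \r\n  x"), p)
//	// rest is []byte("x") and q.Pos is {Line: 2, Column: 3, Byte: 6}: the
//	// tab counts as two columns, the carriage return as zero, and the
//	// newline resets the column to 1 before the two spaces advance it to 3.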

type pos struct {
	Filename string
	Pos      hcl.Pos
}

func (p *pos) Range(byteLen, charLen int) hcl.Range {
	start := p.Pos
	end := p.Pos
	end.Byte += byteLen
	end.Column += charLen
	return hcl.Range{
		Filename: p.Filename,
		Start:    start,
		End:      end,
	}
}
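
// A sketch of the two length arguments: byteLen counts bytes while charLen
// counts columns, so they can differ for multi-byte characters. scan's
// default case uses Range(1, 1) to mark a single invalid byte:
//
//	rng := start.Range(1, 1)
//	// rng.Start equals start.Pos; rng.End is one byte and one column later.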

func posRange(start, end pos) hcl.Range {
	return hcl.Range{
		Filename: start.Filename,
		Start:    start.Pos,
		End:      end.Pos,
	}
}

func (t token) GoString() string {
	return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)
}

func isAlphabetical(b byte) bool {
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
}