github.com/hashicorp/hcl/v2@v2.20.0/hclsyntax/token.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package hclsyntax

import (
	"bytes"
	"fmt"

	"github.com/apparentlymart/go-textseg/v15/textseg"
	"github.com/hashicorp/hcl/v2"
)

// Token represents a sequence of bytes from some HCL code that has been
// tagged with a type and its range within the source file.
type Token struct {
	Type  TokenType
	Bytes []byte
	Range hcl.Range
}

// Tokens is a slice of Token.
type Tokens []Token

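// The sketch below is illustrative only and is not part of this file; it
// assumes the package's lexing entry point LexConfig (defined elsewhere in
// this package). It shows how a caller can walk a Tokens slice and read each
// Token's Type, raw Bytes and Range:
//
//	toks, diags := LexConfig(src, "example.hcl", hcl.Pos{Line: 1, Column: 1, Byte: 0})
//	if !diags.HasErrors() {
//		for _, tok := range toks {
//			fmt.Printf("%#v %q at %s\n", tok.Type, tok.Bytes, tok.Range)
//		}
//	}
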
// TokenType is an enumeration used for the Type field on Token.
type TokenType rune

const (
	// Single-character tokens are represented by their own character, for
	// convenience in producing these within the scanner. However, the values
	// are otherwise arbitrary and just intended to be mnemonic for humans
	// who might see them in debug output.

	TokenOBrace   TokenType = '{'
	TokenCBrace   TokenType = '}'
	TokenOBrack   TokenType = '['
	TokenCBrack   TokenType = ']'
	TokenOParen   TokenType = '('
	TokenCParen   TokenType = ')'
	TokenOQuote   TokenType = '«'
	TokenCQuote   TokenType = '»'
	TokenOHeredoc TokenType = 'H'
	TokenCHeredoc TokenType = 'h'

	TokenStar    TokenType = '*'
	TokenSlash   TokenType = '/'
	TokenPlus    TokenType = '+'
	TokenMinus   TokenType = '-'
	TokenPercent TokenType = '%'

	TokenEqual         TokenType = '='
	TokenEqualOp       TokenType = '≔'
	TokenNotEqual      TokenType = '≠'
	TokenLessThan      TokenType = '<'
	TokenLessThanEq    TokenType = '≤'
	TokenGreaterThan   TokenType = '>'
	TokenGreaterThanEq TokenType = '≥'

	TokenAnd  TokenType = '∧'
	TokenOr   TokenType = '∨'
	TokenBang TokenType = '!'

	TokenDot   TokenType = '.'
	TokenComma TokenType = ','

	TokenDoubleColon TokenType = '⸬'
	TokenEllipsis    TokenType = '…'
	TokenFatArrow    TokenType = '⇒'

	TokenQuestion TokenType = '?'
	TokenColon    TokenType = ':'

	TokenTemplateInterp  TokenType = '∫'
	TokenTemplateControl TokenType = 'λ'
	TokenTemplateSeqEnd  TokenType = '∎'

	TokenQuotedLit TokenType = 'Q' // might contain backslash escapes
	TokenStringLit TokenType = 'S' // cannot contain backslash escapes
	TokenNumberLit TokenType = 'N'
	TokenIdent     TokenType = 'I'

	TokenComment TokenType = 'C'

	TokenNewline TokenType = '\n'
	TokenEOF     TokenType = '␄'

	// The rest are not used in the language but recognized by the scanner so
	// we can generate good diagnostics in the parser when users try to write
	// things that might work in other languages they are familiar with, or
	// simply make incorrect assumptions about the HCL language.

	TokenBitwiseAnd    TokenType = '&'
	TokenBitwiseOr     TokenType = '|'
	TokenBitwiseNot    TokenType = '~'
	TokenBitwiseXor    TokenType = '^'
	TokenStarStar      TokenType = '➚'
	TokenApostrophe    TokenType = '\''
	TokenBacktick      TokenType = '`'
	TokenSemicolon     TokenType = ';'
	TokenTabs          TokenType = '␉'
	TokenInvalid       TokenType = '�'
	TokenBadUTF8       TokenType = '💩'
	TokenQuotedNewline TokenType = '␤'

	// TokenNil is a placeholder for when a token is required but none is
	// available, e.g. when reporting errors. The scanner will never produce
	// this as part of a token stream.
	TokenNil TokenType = '\x00'
)

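// GoString returns a Go-syntax-style name for the token type, such as
// "hclsyntax.TokenOBrace", so that values print mnemonically in debug output
// (for example via fmt's %#v verb). The underlying String method is not
// defined in this file; it is generated elsewhere in the package.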
func (t TokenType) GoString() string {
	return fmt.Sprintf("hclsyntax.%s", t.String())
}

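// scanMode selects the scanner's initial state: scanNormal for ordinary HCL
// configuration source, scanTemplate for standalone template source, and
// scanIdentOnly for input expected to contain nothing but a single
// identifier. (This description is inferred from the constant names and
// their use elsewhere in the package.)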
type scanMode int

const (
	scanNormal scanMode = iota
	scanTemplate
	scanIdentOnly
)

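// tokenAccum accumulates the tokens produced by the scanner. Filename and
// Bytes identify the source buffer being scanned, Pos tracks the position
// reached by the most recently emitted token, Tokens collects the results,
// and StartByte is the byte offset of Bytes within the overall source so
// that ranges can be reported relative to the original file.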
type tokenAccum struct {
	Filename  string
	Bytes     []byte
	Pos       hcl.Pos
	Tokens    []Token
	StartByte int
}

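// emitToken appends a single token of the given type to f.Tokens, covering
// the half-open byte range [startOfs, endOfs) of f.Bytes. The token's
// hcl.Range is computed by walking the covered bytes one grapheme cluster at
// a time, so that columns count visible characters rather than bytes and so
// that newlines ("\n" or "\r\n") advance the line counter and reset the
// column. f.Pos is updated to the end position, ready for the next token.
//
// For example (an illustrative sketch, not a test from this repository):
// with Pos at line 1, column 1, byte 0 and Bytes containing "a = 1\n",
// emitToken(TokenIdent, 0, 1) records a range from 1,1 to 1,2.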
func (f *tokenAccum) emitToken(ty TokenType, startOfs, endOfs int) {
	// Walk through our buffer to figure out how much we need to adjust
	// the start pos to get our end pos.

	start := f.Pos
	start.Column += startOfs + f.StartByte - f.Pos.Byte // Safe because only ASCII spaces can be in the offset
	start.Byte = startOfs + f.StartByte

	end := start
	end.Byte = endOfs + f.StartByte
	b := f.Bytes[startOfs:endOfs]
	for len(b) > 0 {
		advance, seq, _ := textseg.ScanGraphemeClusters(b, true)
		if (len(seq) == 1 && seq[0] == '\n') || (len(seq) == 2 && seq[0] == '\r' && seq[1] == '\n') {
			end.Line++
			end.Column = 1
		} else {
			end.Column++
		}
		b = b[advance:]
	}

	f.Pos = end

	f.Tokens = append(f.Tokens, Token{
		Type:  ty,
		Bytes: f.Bytes[startOfs:endOfs],
		Range: hcl.Range{
			Filename: f.Filename,
			Start:    start,
			End:      end,
		},
	})
}

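// heredocInProgress tracks a heredoc whose opening token has been scanned but
// whose closing marker has not yet been reached. Marker is the delimiter to
// watch for, and StartOfLine records whether the scanner is currently at the
// start of a line, where the closing marker can validly appear.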
type heredocInProgress struct {
	Marker      []byte
	StartOfLine bool
}

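// tokenOpensFlushHeredoc reports whether the given token opens a "flush"
// heredoc, i.e. one introduced with the "<<-" prefix rather than plain "<<".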
func tokenOpensFlushHeredoc(tok Token) bool {
	if tok.Type != TokenOHeredoc {
		return false
	}
	return bytes.HasPrefix(tok.Bytes, []byte{'<', '<', '-'})
}

// checkInvalidTokens does a simple pass across the given tokens and generates
// diagnostics for tokens that should _never_ appear in HCL source. This
// is intended to avoid the need for the parser to have special support
// for them all over.
//
// Returns a diagnostics value with no errors if everything seems acceptable.
// Otherwise, it returns one or more error diagnostics, though it tries to
// limit repetition of the same information.
func checkInvalidTokens(tokens Tokens) hcl.Diagnostics {
	var diags hcl.Diagnostics

	toldBitwise := 0
	toldExponent := 0
	toldBacktick := 0
	toldApostrophe := 0
	toldSemicolon := 0
	toldTabs := 0
	toldBadUTF8 := 0

	for _, tok := range tokens {
		tokRange := func() *hcl.Range {
			r := tok.Range
			return &r
		}

		switch tok.Type {
		case TokenBitwiseAnd, TokenBitwiseOr, TokenBitwiseXor, TokenBitwiseNot:
			if toldBitwise < 4 {
				var suggestion string
				switch tok.Type {
				case TokenBitwiseAnd:
					suggestion = " Did you mean boolean AND (\"&&\")?"
				case TokenBitwiseOr:
					suggestion = " Did you mean boolean OR (\"||\")?"
				case TokenBitwiseNot:
					suggestion = " Did you mean boolean NOT (\"!\")?"
				}

				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Unsupported operator",
					Detail:   fmt.Sprintf("Bitwise operators are not supported.%s", suggestion),
					Subject:  tokRange(),
				})
				toldBitwise++
			}
		case TokenStarStar:
			if toldExponent < 1 {
				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Unsupported operator",
					Detail:   "\"**\" is not a supported operator. Exponentiation is not supported as an operator.",
					Subject:  tokRange(),
				})

				toldExponent++
			}
		case TokenBacktick:
			// Only report for alternating (even) backticks, so we won't report
			// both the start and the end of the same backtick-quoted string.
			if (toldBacktick % 2) == 0 {
				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Invalid character",
					Detail:   "The \"`\" character is not valid. To create a multi-line string, use the \"heredoc\" syntax, like \"<<EOT\".",
					Subject:  tokRange(),
				})
			}
			if toldBacktick <= 2 {
				toldBacktick++
			}
		case TokenApostrophe:
			if (toldApostrophe % 2) == 0 {
				newDiag := &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Invalid character",
					Detail:   "Single quotes are not valid. Use double quotes (\") to enclose strings.",
					Subject:  tokRange(),
				}
				diags = append(diags, newDiag)
			}
			if toldApostrophe <= 2 {
				toldApostrophe++
			}
		case TokenSemicolon:
			if toldSemicolon < 1 {
				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Invalid character",
					Detail:   "The \";\" character is not valid. Use newlines to separate arguments and blocks, and commas to separate items in collection values.",
					Subject:  tokRange(),
				})

				toldSemicolon++
			}
		case TokenTabs:
			if toldTabs < 1 {
				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Invalid character",
					Detail:   "Tab characters may not be used. The recommended indentation style is two spaces per indent.",
					Subject:  tokRange(),
				})

				toldTabs++
			}
		case TokenBadUTF8:
			if toldBadUTF8 < 1 {
				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Invalid character encoding",
					Detail:   "All input files must be UTF-8 encoded. Ensure that UTF-8 encoding is selected in your editor.",
					Subject:  tokRange(),
				})

				toldBadUTF8++
			}
		case TokenQuotedNewline:
			diags = append(diags, &hcl.Diagnostic{
				Severity: hcl.DiagError,
				Summary:  "Invalid multi-line string",
				Detail:   "Quoted strings may not be split over multiple lines. To produce a multi-line string, either use the \\n escape to represent a newline character or use the \"heredoc\" multi-line template syntax.",
				Subject:  tokRange(),
			})
		case TokenInvalid:
			chars := string(tok.Bytes)
			switch chars {
			case "“", "”":
				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Invalid character",
					Detail:   "\"Curly quotes\" are not valid here. These can sometimes be inadvertently introduced when sharing code via documents or discussion forums. It might help to replace the character with a \"straight quote\".",
					Subject:  tokRange(),
				})
			default:
				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Invalid character",
					Detail:   "This character is not used within the language.",
					Subject:  tokRange(),
				})
			}
		}
	}
	return diags
}

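// Illustrative sketch of caller-side use (not necessarily how the parser
// wires this check up internally): given a Tokens slice, the result can be
// inspected like any other hcl.Diagnostics value:
//
//	if diags := checkInvalidTokens(toks); diags.HasErrors() {
//		// e.g. a ";" token yields the "Invalid character" diagnostic above
//	}
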
var utf8BOM = []byte{0xef, 0xbb, 0xbf}

// stripUTF8BOM checks whether the given buffer begins with a UTF-8 byte order
// mark (0xEF 0xBB 0xBF) and, if so, returns a truncated slice with the same
// backing array but with the BOM skipped.
//
// If there is no BOM present, the given slice is returned verbatim.
func stripUTF8BOM(src []byte) []byte {
	if bytes.HasPrefix(src, utf8BOM) {
		return src[3:]
	}
	return src
}
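
// Illustrative sketch (assumed usage, not necessarily how the rest of this
// package calls it): raw source bytes would typically be passed through
// stripUTF8BOM before scanning, e.g. src = stripUTF8BOM(src).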