// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package hclsyntax

import (
	"bytes"
	"fmt"

	"github.com/apparentlymart/go-textseg/v15/textseg"
	"github.com/hashicorp/hcl/v2"
)

// Token represents a sequence of bytes from some HCL code that has been
// tagged with a type and its range within the source file.
type Token struct {
	Type  TokenType
	Bytes []byte
	Range hcl.Range
}

// Tokens is a slice of Token.
type Tokens []Token

// TokenType is an enumeration used for the Type field on Token.
//
// The underlying type is rune so that single-character tokens can use
// their own character as their value, which is convenient both in the
// scanner and in debug output.
type TokenType rune

const (
	// Single-character tokens are represented by their own character, for
	// convenience in producing these within the scanner. However, the values
	// are otherwise arbitrary and just intended to be mnemonic for humans
	// who might see them in debug output.

	TokenOBrace   TokenType = '{'
	TokenCBrace   TokenType = '}'
	TokenOBrack   TokenType = '['
	TokenCBrack   TokenType = ']'
	TokenOParen   TokenType = '('
	TokenCParen   TokenType = ')'
	TokenOQuote   TokenType = '«'
	TokenCQuote   TokenType = '»'
	TokenOHeredoc TokenType = 'H'
	TokenCHeredoc TokenType = 'h'

	TokenStar    TokenType = '*'
	TokenSlash   TokenType = '/'
	TokenPlus    TokenType = '+'
	TokenMinus   TokenType = '-'
	TokenPercent TokenType = '%'

	TokenEqual         TokenType = '='
	TokenEqualOp       TokenType = '≔'
	TokenNotEqual      TokenType = '≠'
	TokenLessThan      TokenType = '<'
	TokenLessThanEq    TokenType = '≤'
	TokenGreaterThan   TokenType = '>'
	TokenGreaterThanEq TokenType = '≥'

	TokenAnd  TokenType = '∧'
	TokenOr   TokenType = '∨'
	TokenBang TokenType = '!'

	TokenDot   TokenType = '.'
	TokenComma TokenType = ','

	TokenDoubleColon TokenType = '⸬'
	TokenEllipsis    TokenType = '…'
	TokenFatArrow    TokenType = '⇒'

	TokenQuestion TokenType = '?'
	TokenColon    TokenType = ':'

	TokenTemplateInterp  TokenType = '∫'
	TokenTemplateControl TokenType = 'λ'
	TokenTemplateSeqEnd  TokenType = '∎'

	TokenQuotedLit TokenType = 'Q' // might contain backslash escapes
	TokenStringLit TokenType = 'S' // cannot contain backslash escapes
	TokenNumberLit TokenType = 'N'
	TokenIdent     TokenType = 'I'

	TokenComment TokenType = 'C'

	TokenNewline TokenType = '\n'
	TokenEOF     TokenType = '␄'

	// The rest are not used in the language but recognized by the scanner so
	// we can generate good diagnostics in the parser when users try to write
	// things that might work in other languages they are familiar with, or
	// simply make incorrect assumptions about the HCL language.

	TokenBitwiseAnd    TokenType = '&'
	TokenBitwiseOr     TokenType = '|'
	TokenBitwiseNot    TokenType = '~'
	TokenBitwiseXor    TokenType = '^'
	TokenStarStar      TokenType = '➚'
	TokenApostrophe    TokenType = '\''
	TokenBacktick      TokenType = '`'
	TokenSemicolon     TokenType = ';'
	TokenTabs          TokenType = '␉'
	TokenInvalid       TokenType = '�'
	TokenBadUTF8       TokenType = '💩'
	TokenQuotedNewline TokenType = '␤'

	// TokenNil is a placeholder for when a token is required but none is
	// available, e.g. when reporting errors. The scanner will never produce
	// this as part of a token stream.
	TokenNil TokenType = '\x00'
)

// GoString implements fmt.GoStringer, rendering the token type with its
// mnemonic name (via the String method, generated elsewhere) rather than
// its raw rune value.
func (t TokenType) GoString() string {
	return fmt.Sprintf("hclsyntax.%s", t.String())
}

// scanMode selects which sub-grammar the scanner is currently applying:
// normal HCL syntax, string-template syntax, or identifiers only.
type scanMode int

const (
	scanNormal scanMode = iota
	scanTemplate
	scanIdentOnly
)

// tokenAccum accumulates tokens produced by the scanner, tracking the
// source position as it goes so that each emitted token can carry an
// accurate hcl.Range.
type tokenAccum struct {
	Filename  string
	Bytes     []byte
	Pos       hcl.Pos // position of the byte after the last emitted token
	Tokens    []Token
	StartByte int // byte offset of Bytes[0] within the whole source file
}

// emitToken appends a token of the given type covering the half-open byte
// range [startOfs, endOfs) of f.Bytes, computing the token's source range
// and advancing f.Pos past it.
func (f *tokenAccum) emitToken(ty TokenType, startOfs, endOfs int) {
	// Walk through our buffer to figure out how much we need to adjust
	// the start pos to get our end pos.

	start := f.Pos
	start.Column += startOfs + f.StartByte - f.Pos.Byte // Safe because only ASCII spaces can be in the offset
	start.Byte = startOfs + f.StartByte

	end := start
	end.Byte = endOfs + f.StartByte
	b := f.Bytes[startOfs:endOfs]
	// Advance one column per grapheme cluster, resetting to column 1 on
	// each newline (either a bare LF or a CRLF pair).
	for len(b) > 0 {
		advance, seq, _ := textseg.ScanGraphemeClusters(b, true)
		if (len(seq) == 1 && seq[0] == '\n') || (len(seq) == 2 && seq[0] == '\r' && seq[1] == '\n') {
			end.Line++
			end.Column = 1
		} else {
			end.Column++
		}
		b = b[advance:]
	}

	f.Pos = end

	f.Tokens = append(f.Tokens, Token{
		Type:  ty,
		Bytes: f.Bytes[startOfs:endOfs],
		Range: hcl.Range{
			Filename: f.Filename,
			Start:    start,
			End:      end,
		},
	})
}

// heredocInProgress tracks the state of a heredoc that the scanner has
// opened but not yet closed.
type heredocInProgress struct {
	Marker      []byte // the delimiter that will terminate the heredoc
	StartOfLine bool
}

// tokenOpensFlushHeredoc reports whether the given token opens a
// "flush" heredoc, i.e. one introduced with "<<-" rather than "<<".
func tokenOpensFlushHeredoc(tok Token) bool {
	if tok.Type != TokenOHeredoc {
		return false
	}
	return bytes.HasPrefix(tok.Bytes, []byte{'<', '<', '-'})
}

// checkInvalidTokens does a simple pass across the given tokens and generates
// diagnostics for tokens that should _never_ appear in HCL source. This
// is intended to avoid the need for the parser to have special support
// for them all over.
//
// Returns a diagnostics with no errors if everything seems acceptable.
// Otherwise, returns zero or more error diagnostics, though tries to limit
// repetition of the same information.
186 func checkInvalidTokens(tokens Tokens) hcl.Diagnostics { 187 var diags hcl.Diagnostics 188 189 toldBitwise := 0 190 toldExponent := 0 191 toldBacktick := 0 192 toldApostrophe := 0 193 toldSemicolon := 0 194 toldTabs := 0 195 toldBadUTF8 := 0 196 197 for _, tok := range tokens { 198 tokRange := func() *hcl.Range { 199 r := tok.Range 200 return &r 201 } 202 203 switch tok.Type { 204 case TokenBitwiseAnd, TokenBitwiseOr, TokenBitwiseXor, TokenBitwiseNot: 205 if toldBitwise < 4 { 206 var suggestion string 207 switch tok.Type { 208 case TokenBitwiseAnd: 209 suggestion = " Did you mean boolean AND (\"&&\")?" 210 case TokenBitwiseOr: 211 suggestion = " Did you mean boolean OR (\"||\")?" 212 case TokenBitwiseNot: 213 suggestion = " Did you mean boolean NOT (\"!\")?" 214 } 215 216 diags = append(diags, &hcl.Diagnostic{ 217 Severity: hcl.DiagError, 218 Summary: "Unsupported operator", 219 Detail: fmt.Sprintf("Bitwise operators are not supported.%s", suggestion), 220 Subject: tokRange(), 221 }) 222 toldBitwise++ 223 } 224 case TokenStarStar: 225 if toldExponent < 1 { 226 diags = append(diags, &hcl.Diagnostic{ 227 Severity: hcl.DiagError, 228 Summary: "Unsupported operator", 229 Detail: "\"**\" is not a supported operator. Exponentiation is not supported as an operator.", 230 Subject: tokRange(), 231 }) 232 233 toldExponent++ 234 } 235 case TokenBacktick: 236 // Only report for alternating (even) backticks, so we won't report both start and ends of the same 237 // backtick-quoted string. 238 if (toldBacktick % 2) == 0 { 239 diags = append(diags, &hcl.Diagnostic{ 240 Severity: hcl.DiagError, 241 Summary: "Invalid character", 242 Detail: "The \"`\" character is not valid. 
To create a multi-line string, use the \"heredoc\" syntax, like \"<<EOT\".", 243 Subject: tokRange(), 244 }) 245 } 246 if toldBacktick <= 2 { 247 toldBacktick++ 248 } 249 case TokenApostrophe: 250 if (toldApostrophe % 2) == 0 { 251 newDiag := &hcl.Diagnostic{ 252 Severity: hcl.DiagError, 253 Summary: "Invalid character", 254 Detail: "Single quotes are not valid. Use double quotes (\") to enclose strings.", 255 Subject: tokRange(), 256 } 257 diags = append(diags, newDiag) 258 } 259 if toldApostrophe <= 2 { 260 toldApostrophe++ 261 } 262 case TokenSemicolon: 263 if toldSemicolon < 1 { 264 diags = append(diags, &hcl.Diagnostic{ 265 Severity: hcl.DiagError, 266 Summary: "Invalid character", 267 Detail: "The \";\" character is not valid. Use newlines to separate arguments and blocks, and commas to separate items in collection values.", 268 Subject: tokRange(), 269 }) 270 271 toldSemicolon++ 272 } 273 case TokenTabs: 274 if toldTabs < 1 { 275 diags = append(diags, &hcl.Diagnostic{ 276 Severity: hcl.DiagError, 277 Summary: "Invalid character", 278 Detail: "Tab characters may not be used. The recommended indentation style is two spaces per indent.", 279 Subject: tokRange(), 280 }) 281 282 toldTabs++ 283 } 284 case TokenBadUTF8: 285 if toldBadUTF8 < 1 { 286 diags = append(diags, &hcl.Diagnostic{ 287 Severity: hcl.DiagError, 288 Summary: "Invalid character encoding", 289 Detail: "All input files must be UTF-8 encoded. Ensure that UTF-8 encoding is selected in your editor.", 290 Subject: tokRange(), 291 }) 292 293 toldBadUTF8++ 294 } 295 case TokenQuotedNewline: 296 diags = append(diags, &hcl.Diagnostic{ 297 Severity: hcl.DiagError, 298 Summary: "Invalid multi-line string", 299 Detail: "Quoted strings may not be split over multiple lines. 
To produce a multi-line string, either use the \\n escape to represent a newline character or use the \"heredoc\" multi-line template syntax.", 300 Subject: tokRange(), 301 }) 302 case TokenInvalid: 303 chars := string(tok.Bytes) 304 switch chars { 305 case "“", "”": 306 diags = append(diags, &hcl.Diagnostic{ 307 Severity: hcl.DiagError, 308 Summary: "Invalid character", 309 Detail: "\"Curly quotes\" are not valid here. These can sometimes be inadvertently introduced when sharing code via documents or discussion forums. It might help to replace the character with a \"straight quote\".", 310 Subject: tokRange(), 311 }) 312 default: 313 diags = append(diags, &hcl.Diagnostic{ 314 Severity: hcl.DiagError, 315 Summary: "Invalid character", 316 Detail: "This character is not used within the language.", 317 Subject: tokRange(), 318 }) 319 } 320 } 321 } 322 return diags 323 } 324 325 var utf8BOM = []byte{0xef, 0xbb, 0xbf} 326 327 // stripUTF8BOM checks whether the given buffer begins with a UTF-8 byte order 328 // mark (0xEF 0xBB 0xBF) and, if so, returns a truncated slice with the same 329 // backing array but with the BOM skipped. 330 // 331 // If there is no BOM present, the given slice is returned verbatim. 332 func stripUTF8BOM(src []byte) []byte { 333 if bytes.HasPrefix(src, utf8BOM) { 334 return src[3:] 335 } 336 return src 337 }