// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package hclsyntax

import (
    "bytes"

    "github.com/hashicorp/hcl/v2"
)

// This file is generated from scan_tokens.rl. DO NOT EDIT.
%%{
  # (except when you are actually in scan_tokens.rl here, so edit away!)

  machine hcltok;
  write data;
}%%

// scanTokens runs the hcltok Ragel scanner over data and returns the
// resulting tokens.
//
// The input is first passed through stripUTF8BOM, and start.Byte is advanced
// by the number of bytes that call removed (presumably so that reported byte
// offsets still line up with the caller's original buffer). filename and
// start seed the tokenAccum that records each emitted token.
//
// mode selects the scanner entry point: scanNormal starts in main,
// scanTemplate in bareTemplate and scanIdentOnly in identOnly. Any other
// value panics.
//
// The returned slice always ends with a synthetic TokenEOF. If the machine
// stops outside a final state, the unmatched remainder is emitted just
// before the EOF token, as TokenStringLit when scanning a bare template at
// the top level and as TokenInvalid otherwise.
func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []Token {
    stripData := stripUTF8BOM(data)
    start.Byte += len(data) - len(stripData)
    data = stripData

    f := &tokenAccum{
        Filename:  filename,
        Bytes:     data,
        Pos:       start,
        StartByte: start.Byte,
    }

    %%{
        include UnicodeDerived "unicode_derived.rl";

        UTF8Cont = 0x80 .. 0xBF;
        AnyUTF8 = (
            0x00..0x7F |
            0xC0..0xDF . UTF8Cont |
            0xE0..0xEF . UTF8Cont . UTF8Cont |
            0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
        );
        BrokenUTF8 = any - AnyUTF8;

        # A number literal starts with a digit and, if it continues, always
        # ends with a continuation other than a period.
        NumberLitContinue = (digit|'.'|('e'|'E') ('+'|'-')? digit);
        NumberLit = digit ("" | (NumberLitContinue - '.') | (NumberLitContinue* (NumberLitContinue - '.')));
        Ident = (ID_Start | '_') (ID_Continue | '-')*;

        # Symbols that just represent themselves are handled as a single rule.
        SelfToken = "[" | "]" | "(" | ")" | "." | "," | "*" | "/" | "%" | "+" | "-" | "=" | "<" | ">" | "!" | "?" | ":" | "\n" | "&" | "|" | "~" | "^" | ";" | "`" | "'";

        EqualOp = "==";
        NotEqual = "!=";
        GreaterThanEqual = ">=";
        LessThanEqual = "<=";
        LogicalAnd = "&&";
        LogicalOr = "||";

        DoubleColon = "::";
        Ellipsis = "...";
        FatArrow = "=>";

        Newline = '\r' ? '\n';
        EndOfLine = Newline;

        BeginStringTmpl = '"';
        BeginHeredocTmpl = '<<' ('-')? Ident Newline;

        Comment = (
            # The :>> operator in these is a "finish-guarded concatenation",
            # which terminates the sequence on its left when it completes
            # the sequence on its right.
            # In the single-line comment cases this is allowing us to make
            # the trailing EndOfLine optional while still having the overall
            # pattern terminate. In the multi-line case it ensures that
            # the first comment in the file ends at the first */, rather than
            # gobbling up all of the "any*" until the _final_ */ in the file.
            ("#" (any - EndOfLine)* :>> EndOfLine?) |
            ("//" (any - EndOfLine)* :>> EndOfLine?) |
            ("/*" any* :>> "*/")
        );

        # Note: hclwrite assumes that only ASCII spaces appear between tokens,
        # and uses this assumption to recreate the spaces between tokens by
        # looking at byte offset differences. This means it will produce
        # incorrect results in the presence of tabs, but that's acceptable
        # because the canonical style (which hclwrite itself can impose
        # automatically) is to never use tabs.
        Spaces = (' ' | 0x09)+;

        action beginStringTemplate {
            token(TokenOQuote);
            fcall stringTemplate;
        }

        action endStringTemplate {
            token(TokenCQuote);
            fret;
        }

        action beginHeredocTemplate {
            token(TokenOHeredoc);
            // the token is currently the whole heredoc introducer, like
            // <<EOT or <<-EOT, followed by a newline. We want to extract
            // just the "EOT" portion that we'll use as the closing marker.

            // Drop the leading "<<" and the trailing "\n"; an optional
            // leading "-" and trailing "\r" are trimmed below.
            marker := data[ts+2:te-1]
            if marker[0] == '-' {
                marker = marker[1:]
            }
            if marker[len(marker)-1] == '\r' {
                marker = marker[:len(marker)-1]
            }

            heredocs = append(heredocs, heredocInProgress{
                Marker:      marker,
                StartOfLine: true,
            })

            fcall heredocTemplate;
        }

        action heredocLiteralEOL {
            // This action is called specifically when a heredoc literal
            // ends with a newline character.

            // This might actually be our end marker.
            topdoc := &heredocs[len(heredocs)-1]
            if topdoc.StartOfLine {
                maybeMarker := bytes.TrimSpace(data[ts:te])
                if bytes.Equal(maybeMarker, topdoc.Marker) {
                    // We actually emit two tokens here: the end-of-heredoc
                    // marker first, and then separately the newline that
                    // follows it. This then avoids issues with the closing
                    // marker consuming a newline that would normally be used
                    // to mark the end of an attribute definition.
                    // We might have either a \n sequence or an \r\n sequence
                    // here, so we must handle both.
                    nls := te-1
                    nle := te
                    te--
                    if data[te-1] == '\r' {
                        // back up one more byte
                        nls--
                        te--
                    }
                    token(TokenCHeredoc);
                    ts = nls
                    te = nle
                    token(TokenNewline);
                    heredocs = heredocs[:len(heredocs)-1]
                    fret;
                }
            }

            // Not the end marker: the line is ordinary heredoc content, and
            // whatever follows it begins a new line.
            topdoc.StartOfLine = true;
            token(TokenStringLit);
        }

        action heredocLiteralMidline {
            // This action is called when a heredoc literal _doesn't_ end
            // with a newline character, e.g. because we're about to enter
            // an interpolation sequence.
            heredocs[len(heredocs)-1].StartOfLine = false;
            token(TokenStringLit);
        }

        action bareTemplateLiteral {
            token(TokenStringLit);
        }

        action beginTemplateInterp {
            token(TokenTemplateInterp);
            braces++;
            // Remember the brace level at which this sequence was opened so
            // that the matching closing brace can be recognized later.
            retBraces = append(retBraces, braces);
            if len(heredocs) > 0 {
                heredocs[len(heredocs)-1].StartOfLine = false;
            }
            fcall main;
        }

        action beginTemplateControl {
            token(TokenTemplateControl);
            braces++;
            // Remember the brace level at which this sequence was opened so
            // that the matching closing brace can be recognized later.
            retBraces = append(retBraces, braces);
            if len(heredocs) > 0 {
                heredocs[len(heredocs)-1].StartOfLine = false;
            }
            fcall main;
        }

        action openBrace {
            token(TokenOBrace);
            braces++;
        }

        action closeBrace {
            // A closing brace at the level recorded on top of retBraces ends
            // the innermost template sequence and returns to the scanner
            // that opened it; any other closing brace is an ordinary one.
            if len(retBraces) > 0 && retBraces[len(retBraces)-1] == braces {
                token(TokenTemplateSeqEnd);
                braces--;
                retBraces = retBraces[0:len(retBraces)-1]
                fret;
            } else {
                token(TokenCBrace);
                braces--;
            }
        }

        action closeTemplateSeqEatWhitespace {
            // Only consume from the retBraces stack and return if we are at
            // a suitable brace nesting level, otherwise things will get
            // confused. (Not entering this branch indicates a syntax error,
            // which we will catch in the parser.)
            if len(retBraces) > 0 && retBraces[len(retBraces)-1] == braces {
                token(TokenTemplateSeqEnd);
                braces--;
                retBraces = retBraces[0:len(retBraces)-1]
                fret;
            } else {
                // We intentionally generate a TokenTemplateSeqEnd here,
                // even though the user apparently wanted a brace, because
                // we want to allow the parser to catch the incorrect use
                // of a ~} to balance a generic opening brace, rather than
                // a template sequence.
                token(TokenTemplateSeqEnd);
                braces--;
            }
        }

        TemplateInterp = "${" ("~")?;
        TemplateControl = "%{" ("~")?;
        EndStringTmpl = '"';
        NewlineChars = ("\r"|"\n");
        NewlineCharsSeq = NewlineChars+;
        StringLiteralChars = (AnyUTF8 - NewlineChars);
        TemplateIgnoredNonBrace = (^'{' %{ fhold; });
        TemplateNotInterp = '$' (TemplateIgnoredNonBrace | TemplateInterp);
        TemplateNotControl = '%' (TemplateIgnoredNonBrace | TemplateControl);
        QuotedStringLiteralWithEsc = ('\\' StringLiteralChars) | (StringLiteralChars - ("$" | '%' | '"' | "\\"));
        TemplateStringLiteral = (
            (TemplateNotInterp) |
            (TemplateNotControl) |
            (QuotedStringLiteralWithEsc)+
        );
        HeredocStringLiteral = (
            (TemplateNotInterp) |
            (TemplateNotControl) |
            (StringLiteralChars - ("$" | '%'))*
        );
        BareStringLiteral = (
            (TemplateNotInterp) |
            (TemplateNotControl) |
            (StringLiteralChars - ("$" | '%'))*
        ) Newline?;

        # Scanner used between the opening and closing quote of a quoted
        # template; entered from beginStringTemplate.
        stringTemplate := |*
            TemplateInterp        => beginTemplateInterp;
            TemplateControl       => beginTemplateControl;
            EndStringTmpl         => endStringTemplate;
            TemplateStringLiteral => { token(TokenQuotedLit); };
            NewlineCharsSeq       => { token(TokenQuotedNewline); };
            AnyUTF8               => { token(TokenInvalid); };
            BrokenUTF8            => { token(TokenBadUTF8); };
        *|;

        # Scanner used for the body of a heredoc; entered from
        # beginHeredocTemplate.
        heredocTemplate := |*
            TemplateInterp        => beginTemplateInterp;
            TemplateControl       => beginTemplateControl;
            HeredocStringLiteral EndOfLine => heredocLiteralEOL;
            HeredocStringLiteral  => heredocLiteralMidline;
            BrokenUTF8            => { token(TokenBadUTF8); };
        *|;

        # Entry point for the scanTemplate mode.
        bareTemplate := |*
            TemplateInterp        => beginTemplateInterp;
            TemplateControl       => beginTemplateControl;
            BareStringLiteral     => bareTemplateLiteral;
            BrokenUTF8            => { token(TokenBadUTF8); };
        *|;

        # Entry point for the scanIdentOnly mode.
        identOnly := |*
            Ident            => { token(TokenIdent) };
            BrokenUTF8       => { token(TokenBadUTF8) };
            AnyUTF8          => { token(TokenInvalid) };
        *|;

        # Entry point for the scanNormal mode, and the scanner re-entered for
        # the contents of template interpolation and control sequences.
        main := |*
            Spaces           => {};
            NumberLit        => { token(TokenNumberLit) };
            Ident            => { token(TokenIdent) };

            Comment          => { token(TokenComment) };
            Newline          => { token(TokenNewline) };

            EqualOp          => { token(TokenEqualOp); };
            NotEqual         => { token(TokenNotEqual); };
            GreaterThanEqual => { token(TokenGreaterThanEq); };
            LessThanEqual    => { token(TokenLessThanEq); };
            LogicalAnd       => { token(TokenAnd); };
            LogicalOr        => { token(TokenOr); };
            DoubleColon      => { token(TokenDoubleColon); };
            Ellipsis         => { token(TokenEllipsis); };
            FatArrow         => { token(TokenFatArrow); };
            SelfToken        => { selfToken() };

            "{"              => openBrace;
            "}"              => closeBrace;

            "~}"             => closeTemplateSeqEatWhitespace;

            BeginStringTmpl  => beginStringTemplate;
            BeginHeredocTmpl => beginHeredocTemplate;

            BrokenUTF8       => { token(TokenBadUTF8) };
            AnyUTF8          => { token(TokenInvalid) };
        *|;

    }%%

    // Ragel state
    p := 0          // "Pointer" into data
    pe := len(data) // End-of-data "pointer"
    ts := 0         // start offset of the token currently being matched
    te := 0         // end offset (exclusive) of the token currently being matched
    act := 0
    eof := pe
    var stack []int // return-state stack for scanner calls; grown by prepush below
    var top int

    var cs int // current state
    switch mode {
    case scanNormal:
        cs = hcltok_en_main
    case scanTemplate:
        cs = hcltok_en_bareTemplate
    case scanIdentOnly:
        cs = hcltok_en_identOnly
    default:
        panic("invalid scanMode")
    }

    braces := 0
    var retBraces []int // stack of brace levels that cause us to use fret
    var heredocs []heredocInProgress // stack of heredocs we're currently processing

    %%{
        # Grow the stack slice by one slot before each scanner call and
        # shrink it again after each return, so that len(stack) follows the
        # current call depth.
        prepush {
            stack = append(stack, 0);
        }
        postpop {
            stack = stack[:len(stack)-1];
        }
    }%%

    // Make Go compiler happy
    _ = ts
    _ = te
    _ = act
    _ = eof

    // token emits a token of the given type spanning the current match,
    // i.e. the bytes from ts up to but not including te.
    token := func (ty TokenType) {
        f.emitToken(ty, ts, te)
    }
    // selfToken emits the current single-byte match as a token whose
    // TokenType is the numeric value of that byte.
    selfToken := func () {
        b := data[ts:te]
        if len(b) != 1 {
            // should never happen
            panic("selfToken only works for single-character tokens")
        }
        f.emitToken(TokenType(b[0]), ts, te)
    }

    %%{
        write init nocs;
        write exec;
    }%%

    // If we fall out here without being in a final state then we've
    // encountered something that the scanner can't match, which we'll
    // deal with as an invalid token.
    if cs < hcltok_first_final {
        if mode == scanTemplate && len(stack) == 0 {
            // If we're scanning a bare template then any straggling
            // top-level stuff is actually literal string, rather than
            // invalid. This handles the case where the template ends
            // with a single "$" or "%", which trips us up because we
            // want to see another character to decide if it's a sequence
            // or an escape.
            f.emitToken(TokenStringLit, ts, len(data))
        } else {
            f.emitToken(TokenInvalid, ts, len(data))
        }
    }

    // We always emit a synthetic EOF token at the end, since it gives the
    // parser position information for an "unexpected EOF" diagnostic.
    f.emitToken(TokenEOF, len(data), len(data))

    return f.Tokens
}