github.com/hashicorp/hcl/v2@v2.20.0/hclsyntax/scan_tokens.rl (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package hclsyntax
     5  
     6  import (
     7      "bytes"
     8  
     9      "github.com/hashicorp/hcl/v2"
    10  )
    11  
    12  // This file is generated from scan_tokens.rl. DO NOT EDIT.
    13  %%{
    14    # (except when you are actually in scan_tokens.rl here, so edit away!)
    15  
          # Name the state machine "hcltok". The generated identifiers used
          # later in this file (hcltok_en_main, hcltok_first_final, ...) carry
          # this name as their prefix.
    16    machine hcltok;
          # Ask Ragel to emit the machine's static data at this point.
    17    write data;
    18  }%%
    19  
    20  func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []Token {
            // scanTokens runs the hcltok scanner over data and returns the
            // tokens collected in the accumulator f (f.Tokens). A synthetic
            // TokenEOF is always the last token emitted. mode selects which
            // scanner entry point is used; start seeds the accumulator's
            // position.
            //
            // stripUTF8BOM is defined elsewhere; presumably it returns data
            // without a leading UTF-8 byte order mark. start.Byte is advanced
            // by however many bytes it removed.
    21      stripData := stripUTF8BOM(data)
    22      start.Byte += len(data) - len(stripData)
    23      data = stripData
    24  
            // f receives every token emitted below via f.emitToken.
    25      f := &tokenAccum{
    26          Filename:  filename,
    27          Bytes:     data,
    28          Pos:       start,
    29          StartByte: start.Byte,
    30      }
    31  
    32      %%{
    33          include UnicodeDerived "unicode_derived.rl";
    34  
                # Byte-level UTF-8 classes. AnyUTF8 accepts one encoded character
                # judged purely by the shape of its bytes: a single byte in
                # 0x00..0x7F, or a lead byte followed by one, two or three
                # continuation bytes (0x80..0xBF).
    35          UTF8Cont = 0x80 .. 0xBF;
    36          AnyUTF8 = (
    37              0x00..0x7F |
    38              0xC0..0xDF . UTF8Cont |
    39              0xE0..0xEF . UTF8Cont . UTF8Cont |
    40              0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
    41          );
                # "any" matches exactly one byte, so removing AnyUTF8 from it
                # leaves the single bytes that are not a complete one-byte
                # character, i.e. a byte in 0x80..0xFF taken on its own.
    42          BrokenUTF8 = any - AnyUTF8;
    43  
                # NumberLitContinue is one step after the leading digit: another
                # digit, a '.', or an exponent marker ('e' or 'E', an optional
                # sign, then one digit). NumberLit requires that its final step
                # is not a bare '.', so a number literal never ends with a dot.
    44          NumberLitContinue = (digit|'.'|('e'|'E') ('+'|'-')? digit);
    45          NumberLit = digit ("" | (NumberLitContinue - '.') | (NumberLitContinue* (NumberLitContinue - '.')));
                # An identifier starts with ID_Start or '_' and continues with
                # ID_Continue or '-'. ID_Start and ID_Continue are not defined in
                # this file; presumably they come from the UnicodeDerived include.
    46          Ident = (ID_Start | '_') (ID_Continue | '-')*;
    47  
    48          # Symbols that just represent themselves are handled as a single rule.
                # The main scanner hands these to selfToken(), which uses the
                # matched byte itself as the token type. "{" and "}" are not in
                # this list because the main scanner gives them their own actions.
    49          SelfToken = "[" | "]" | "(" | ")" | "." | "," | "*" | "/" | "%" | "+" | "-" | "=" | "<" | ">" | "!" | "?" | ":" | "\n" | "&" | "|" | "~" | "^" | ";" | "`" | "'";
    50  
                # Multi-character operators. Each one is mapped to its own token
                # type by a dedicated rule in the main scanner.
    51          EqualOp = "==";
    52          NotEqual = "!=";
    53          GreaterThanEqual = ">=";
    54          LessThanEqual = "<=";
    55          LogicalAnd = "&&";
    56          LogicalOr = "||";
    57  
    58          DoubleColon = "::";
    59          Ellipsis = "...";
    60          FatArrow = "=>";
    61  
                # A line ending is "\n", optionally preceded by "\r".
    62          Newline = '\r' ? '\n';
    63          EndOfLine = Newline;
    64  
                # Openers for the two delimited template forms: a double quote
                # starts a quoted template, and "<<" or "<<-" followed by an
                # identifier and a line ending starts a heredoc.
    65          BeginStringTmpl = '"';
    66          BeginHeredocTmpl = '<<' ('-')? Ident Newline;
    67  
    68          Comment = (
                    # Three comment forms are recognised: "#" and "//", which run
                    # to the end of the line, and "/*", which runs to the first
                    # "*/" that follows it.
                    #
    69              # The :>> operator in these is a "finish-guarded concatenation",
    70              # which terminates the sequence on its left when it completes
    71              # the sequence on its right.
    72              # In the single-line comment cases this is allowing us to make
    73              # the trailing EndOfLine optional while still having the overall
    74              # pattern terminate. In the multi-line case it ensures that
    75              # the first comment in the file ends at the first */, rather than
    76              # gobbling up all of the "any*" until the _final_ */ in the file.
    77              ("#" (any - EndOfLine)* :>> EndOfLine?) |
    78              ("//" (any - EndOfLine)* :>> EndOfLine?) |
    79              ("/*" any* :>> "*/")
    80          );
    81  
    82          # Note: hclwrite assumes that only ASCII spaces appear between tokens,
    83          # and uses this assumption to recreate the spaces between tokens by
    84          # looking at byte offset differences. This means it will produce
    85          # incorrect results in the presence of tabs, but that's acceptable
    86          # because the canonical style (which hclwrite itself can impose
    87          # automatically) is to never use tabs.
                #
                # Spaces itself matches a run of one or more space (' ') or
                # horizontal tab (0x09) bytes.
    88          Spaces = (' ' | 0x09)+;
    89  
    90          action beginStringTemplate {
                    // Emit the opening-quote token, then switch to the
                    // stringTemplate scanner. fcall saves the current state on
                    // the Ragel stack so that a later fret can come back here.
    91              token(TokenOQuote);
    92              fcall stringTemplate;
    93          }
    94  
    95          action endStringTemplate {
                    // Emit the closing-quote token and return (fret) to the
                    // state that was saved by the fcall which entered the
                    // quoted template.
    96              token(TokenCQuote);
    97              fret;
    98          }
    99  
   100          action beginHeredocTemplate {
   101              token(TokenOHeredoc);
   102              // the token is currently the whole heredoc introducer, like
   103              // <<EOT or <<-EOT, followed by a newline. We want to extract
   104              // just the "EOT" portion that we'll use as the closing marker.
   105  
                    // Skip the two "<<" bytes at the front and the final "\n".
   106              marker := data[ts+2:te-1]
                    // Drop the "-" of the "<<-" form.
   107              if marker[0] == '-' {
   108                  marker = marker[1:]
   109              }
                    // Drop the "\r" left behind when the line ended in "\r\n".
   110              if marker[len(marker)-1] == '\r' {
   111                  marker = marker[:len(marker)-1]
   112              }
   113  
                    // Push the new heredoc onto the heredocs stack. The introducer
                    // ended with a newline, so the body starts at the beginning
                    // of a line.
   114              heredocs = append(heredocs, heredocInProgress{
   115                  Marker:      marker,
   116                  StartOfLine: true,
   117              })
   118  
                    // Scan the body with the heredocTemplate scanner; the fret in
                    // heredocLiteralEOL returns here once the marker is found.
   119              fcall heredocTemplate;
   120          }
   121  
   122          action heredocLiteralEOL {
   123              // This action is called specifically when a heredoc literal
   124              // ends with a newline character.
   125  
   126              // This might actually be our end marker.
                    // topdoc is the innermost heredoc currently being scanned.
   127              topdoc := &heredocs[len(heredocs)-1]
                    // Only a literal that began at the start of a line is compared
                    // against the marker.
   128              if topdoc.StartOfLine {
                        // Whitespace around the candidate (including the trailing
                        // newline) is ignored for the comparison.
   129                  maybeMarker := bytes.TrimSpace(data[ts:te])
   130                  if bytes.Equal(maybeMarker, topdoc.Marker) {
   131                      // We actually emit two tokens here: the end-of-heredoc
   132                      // marker first, and then separately the newline that
   133                      // follows it. This then avoids issues with the closing
   134                      // marker consuming a newline that would normally be used
   135                      // to mark the end of an attribute definition.
   136                      // We might have either a \n sequence or an \r\n sequence
   137                      // here, so we must handle both.
                            // nls and nle are the start and end offsets of the
                            // trailing newline sequence; te is pulled back so the
                            // marker token stops just before it.
   138                      nls := te-1
   139                      nle := te
   140                      te--
   141                      if data[te-1] == '\r' {
   142                          // back up one more byte
   143                          nls--
   144                          te--
   145                      }
   146                      token(TokenCHeredoc);
                            // Re-point ts/te at the newline sequence so that it is
                            // emitted as its own token.
   147                      ts = nls
   148                      te = nle
   149                      token(TokenNewline);
                            // Pop the finished heredoc and return to the state
                            // saved by the fcall that entered it.
   150                      heredocs = heredocs[:len(heredocs)-1]
   151                      fret;
   152                  }
   153              }
   154  
                    // Not the end marker: this is ordinary literal text, and
                    // whatever follows it begins a new line.
   155              topdoc.StartOfLine = true;
   156              token(TokenStringLit);
   157          }
   158  
   159          action heredocLiteralMidline {
   160              // This action is called when a heredoc literal _doesn't_ end
   161              // with a newline character, e.g. because we're about to enter
   162              // an interpolation sequence.
                    // Clearing StartOfLine means the remainder of this line will
                    // not be compared against the closing marker by
                    // heredocLiteralEOL, which only checks when StartOfLine is set.
   163              heredocs[len(heredocs)-1].StartOfLine = false;
   164              token(TokenStringLit);
   165          }
   166  
   167          action bareTemplateLiteral {
                    // Literal text matched by the bareTemplate scanner is emitted
                    // as a TokenStringLit.
   168              token(TokenStringLit);
   169          }
   170  
   171          action beginTemplateInterp {
                    // Start of a "${" interpolation sequence.
   172              token(TokenTemplateInterp);
                    // The introducer counts as an open brace. Remember the depth
                    // it was opened at so closeBrace can recognise the "}" that
                    // ends this sequence.
   173              braces++;
   174              retBraces = append(retBraces, braces);
                    // Inside a heredoc, whatever follows the sequence on this
                    // line is no longer at the start of the line.
   175              if len(heredocs) > 0 {
   176                  heredocs[len(heredocs)-1].StartOfLine = false;
   177              }
                    // Scan the sequence contents with the main scanner, saving
                    // the current template scanner state for the matching fret.
   178              fcall main;
   179          }
   180  
   181          action beginTemplateControl {
                    // Start of a "%{" control sequence. The bookkeeping is the
                    // same as in beginTemplateInterp; only the token type differs.
   182              token(TokenTemplateControl);
                    // Record the brace depth at which this sequence was opened so
                    // that closeBrace can find its terminating "}".
   183              braces++;
   184              retBraces = append(retBraces, braces);
                    // Inside a heredoc, the text after this sequence is mid-line.
   185              if len(heredocs) > 0 {
   186                  heredocs[len(heredocs)-1].StartOfLine = false;
   187              }
                    // Scan the sequence contents with the main scanner.
   188              fcall main;
   189          }
   190  
   191          action openBrace {
                    // Ordinary "{": emit the token and bump the nesting depth that
                    // closeBrace compares against the top of retBraces.
   192              token(TokenOBrace);
   193              braces++;
   194          }
   195  
   196          action closeBrace {
                    // If the current depth equals the depth recorded when the
                    // innermost template sequence was opened, this "}" ends that
                    // sequence: emit TokenTemplateSeqEnd, pop the recorded depth
                    // and return to the saved template scanner state.
   197              if len(retBraces) > 0 && retBraces[len(retBraces)-1] == braces {
   198                  token(TokenTemplateSeqEnd);
   199                  braces--;
   200                  retBraces = retBraces[0:len(retBraces)-1]
   201                  fret;
   202              } else {
                        // Otherwise it is an ordinary closing brace.
   203                  token(TokenCBrace);
   204                  braces--;
   205              }
   206          }
   207  
   208          action closeTemplateSeqEatWhitespace {
                    // Runs for the "~}" rule of the main scanner.
                    //
   209              // Only consume from the retBraces stack and return if we are at
   210              // a suitable brace nesting level, otherwise things will get
   211              // confused. (Not entering this branch indicates a syntax error,
   212              // which we will catch in the parser.)
   213              if len(retBraces) > 0 && retBraces[len(retBraces)-1] == braces {
   214                  token(TokenTemplateSeqEnd);
   215                  braces--;
   216                  retBraces = retBraces[0:len(retBraces)-1]
   217                  fret;
   218              } else {
   219                  // We intentionally generate a TokenTemplateSeqEnd here,
   220                  // even though the user apparently wanted a brace, because
   221                  // we want to allow the parser to catch the incorrect use
   222                  // of a ~} to balance a generic opening brace, rather than
   223                  // a template sequence.
                        // There is no fret in this branch, so scanning carries on
                        // in the current scanner.
   224                  token(TokenTemplateSeqEnd);
   225                  braces--;
   226              }
   227          }
   228  
   229          TemplateInterp = "${" ("~")?;
   230          TemplateControl = "%{" ("~")?;
                # The two sequence introducers above are "${" and "%{", each
                # optionally followed by "~".
   231          EndStringTmpl = '"';
   232          NewlineChars = ("\r"|"\n");
   233          NewlineCharsSeq = NewlineChars+;
                # Any single character (by AnyUTF8) other than "\r" or "\n".
   234          StringLiteralChars = (AnyUTF8 - NewlineChars);
                # One character that is not "{". The fhold in the leaving action
                # steps the input position back by one as the pattern is left.
   235          TemplateIgnoredNonBrace = (^'{' %{ fhold; });
                # "$" or "%" that does not start a sequence: either followed by a
                # character other than "{", or followed by a complete introducer
                # (that is, "$${" / "%%{", optionally with "~").
   236          TemplateNotInterp = '$' (TemplateIgnoredNonBrace | TemplateInterp);
   237          TemplateNotControl = '%' (TemplateIgnoredNonBrace | TemplateControl);
                # Inside quotes: a backslash followed by any string-literal
                # character, or any string-literal character other than "$", "%",
                # a double quote, or a backslash.
   238          QuotedStringLiteralWithEsc = ('\\' StringLiteralChars) | (StringLiteralChars - ("$" | '%' | '"' | "\\"));
   239          TemplateStringLiteral = (
   240              (TemplateNotInterp) |
   241              (TemplateNotControl) |
   242              (QuotedStringLiteralWithEsc)+
   243          );
                # Heredoc and bare literals give no special treatment to
                # backslashes or double quotes; only "$" and "%" are set apart.
   244          HeredocStringLiteral = (
   245              (TemplateNotInterp) |
   246              (TemplateNotControl) |
   247              (StringLiteralChars - ("$" | '%'))*
   248          );
                # Same shape as HeredocStringLiteral, plus an optional trailing
                # line ending that becomes part of the literal.
   249          BareStringLiteral = (
   250              (TemplateNotInterp) |
   251              (TemplateNotControl) |
   252              (StringLiteralChars - ("$" | '%'))*
   253          ) Newline?;
   254  
   255          stringTemplate := |*
                    # Scanner for the inside of a quoted template. It is entered
                    # by the fcall in beginStringTemplate and left by the fret in
                    # endStringTemplate.
   256              TemplateInterp        => beginTemplateInterp;
   257              TemplateControl       => beginTemplateControl;
   258              EndStringTmpl         => endStringTemplate;
   259              TemplateStringLiteral => { token(TokenQuotedLit); };
   260              NewlineCharsSeq       => { token(TokenQuotedNewline); };
                    # Fallbacks for input that none of the rules above accept.
   261              AnyUTF8               => { token(TokenInvalid); };
   262              BrokenUTF8            => { token(TokenBadUTF8); };
   263          *|;
   264  
   265          heredocTemplate := |*
                    # Scanner for the body of a heredoc, entered by the fcall in
                    # beginHeredocTemplate. A literal that ends with a line ending
                    # goes to heredocLiteralEOL (which also detects the closing
                    # marker); a literal that stops mid-line goes to
                    # heredocLiteralMidline.
   266              TemplateInterp        => beginTemplateInterp;
   267              TemplateControl       => beginTemplateControl;
   268              HeredocStringLiteral EndOfLine => heredocLiteralEOL;
   269              HeredocStringLiteral  => heredocLiteralMidline;
   270              BrokenUTF8            => { token(TokenBadUTF8); };
   271          *|;
   272  
   273          bareTemplate := |*
                    # Scanner for a template that has no surrounding delimiters.
                    # scanTokens starts in this scanner when mode is scanTemplate.
   274              TemplateInterp        => beginTemplateInterp;
   275              TemplateControl       => beginTemplateControl;
   276              BareStringLiteral     => bareTemplateLiteral;
   277              BrokenUTF8            => { token(TokenBadUTF8); };
   278          *|;
   279  
   280          identOnly := |*
                    # Scanner that recognises identifiers only; every other
                    # character becomes TokenInvalid (or TokenBadUTF8 for a stray
                    # non-ASCII byte). scanTokens starts here when mode is
                    # scanIdentOnly.
   281              Ident            => { token(TokenIdent) };
   282              BrokenUTF8       => { token(TokenBadUTF8) };
   283              AnyUTF8          => { token(TokenInvalid) };
   284          *|;
   285  
   286          main := |*
                    # Scanner for ordinary (non-template) source. scanTokens
                    # starts here when mode is scanNormal, and the template
                    # scanners re-enter it through the fcall in
                    # beginTemplateInterp / beginTemplateControl.
                    #
                    # Runs of spaces and tabs are matched but produce no token.
   287              Spaces           => {};
   288              NumberLit        => { token(TokenNumberLit) };
   289              Ident            => { token(TokenIdent) };
   290  
   291              Comment          => { token(TokenComment) };
                    # NOTE(review): SelfToken also contains "\n"; this rule is
                    # listed first, which per the Ragel guide's scanner
                    # tie-breaking should make it win for a bare "\n" - confirm.
   292              Newline          => { token(TokenNewline) };
   293  
   294              EqualOp          => { token(TokenEqualOp); };
   295              NotEqual         => { token(TokenNotEqual); };
   296              GreaterThanEqual => { token(TokenGreaterThanEq); };
   297              LessThanEqual    => { token(TokenLessThanEq); };
   298              LogicalAnd       => { token(TokenAnd); };
   299              LogicalOr        => { token(TokenOr); };
   300              DoubleColon      => { token(TokenDoubleColon); };
   301              Ellipsis         => { token(TokenEllipsis); };
   302              FatArrow         => { token(TokenFatArrow); };
   303              SelfToken        => { selfToken() };
   304  
                    # Braces are tracked so that the "}" ending a template
                    # sequence can be told apart from an ordinary closing brace.
   305              "{"              => openBrace;
   306              "}"              => closeBrace;
   307  
   308              "~}"             => closeTemplateSeqEatWhitespace;
   309  
                    # Both of these switch to a template scanner via fcall.
   310              BeginStringTmpl  => beginStringTemplate;
   311              BeginHeredocTmpl => beginHeredocTemplate;
   312  
                    # Fallbacks for input that none of the rules above accept.
   313              BrokenUTF8       => { token(TokenBadUTF8) };
   314              AnyUTF8          => { token(TokenInvalid) };
   315          *|;
   316  
   317      }%%
   318  
   319      // Ragel state
            // These are the variables the generated scanner code operates on.
   320  	p := 0  // "Pointer" into data
   321  	pe := len(data) // End-of-data "pointer"
            // ts and te delimit the current token; the actions above read them
            // as data[ts:te] and pass them to f.emitToken.
   322      ts := 0
   323      te := 0
   324      act := 0
            // eof equals pe, i.e. the whole input is present in data.
   325      eof := pe
            // stack and top back fcall/fret; see prepush/postpop below.
   326      var stack []int
   327      var top int
   328  
            // The starting scanner is chosen by the caller's mode rather than
            // being fixed by the machine.
   329      var cs int // current state
   330      switch mode {
   331      case scanNormal:
   332          cs = hcltok_en_main
   333      case scanTemplate:
   334          cs = hcltok_en_bareTemplate
   335      case scanIdentOnly:
   336          cs = hcltok_en_identOnly
   337      default:
   338          panic("invalid scanMode")
   339      }
   340  
            // braces is the current brace nesting depth maintained by the
            // openBrace/closeBrace and template-sequence actions.
   341      braces := 0
   342      var retBraces []int // stack of brace levels that cause us to use fret
   343      var heredocs []heredocInProgress // stack of heredocs we're currently processing
   344  
   345      %%{
                # Hooks that keep the Go slice used as the Ragel call stack
                # correctly sized around fcall and fret.
   346          prepush {
                    // Grow the stack by one slot before a state is pushed.
   347              stack = append(stack, 0);
   348          }
   349          postpop {
                    // Drop the top slot after a state has been popped.
   350              stack = stack[:len(stack)-1];
   351          }
   352      }%%
   353  
   354      // Make Go compiler happy
            // (blank assignments count as uses; presumably this guards against
            // "declared and not used" errors should the generated code not
            // reference every one of these variables)
   355      _ = ts
   356      _ = te
   357      _ = act
   358      _ = eof
   359  
            // token emits a token of type ty covering the current match,
            // data[ts:te]. It reads ts and te at call time, so actions may
            // adjust them before calling it.
   360      token := func (ty TokenType) {
   361          f.emitToken(ty, ts, te)
   362      }
            // selfToken emits a single-byte token whose TokenType is the value
            // of the matched byte itself. It panics if the current match is not
            // exactly one byte long.
   363      selfToken := func () {
   364          b := data[ts:te]
   365          if len(b) != 1 {
   366              // should never happen
   367              panic("selfToken only works for single-character tokens")
   368          }
   369          f.emitToken(TokenType(b[0]), ts, te)
   370      }
   371  
   372      %%{
                # Emit Ragel's variable initialisation here. "nocs" leaves cs
                # alone, because cs has already been chosen from mode.
   373          write init nocs;
                # Emit the generated scanning code itself at this point.
   374          write exec;
   375      }%%
   376  
   377      // If we fall out here without being in a final state then we've
   378      // encountered something that the scanner can't match, which we'll
   379      // deal with as an invalid.
            // In both branches the leftover token runs from ts to the end of
            // the input.
   380      if cs < hcltok_first_final {
                // len(stack) == 0 means no fcall is outstanding, i.e. we are at
                // the top level of the bare template rather than inside a
                // nested sequence.
   381          if mode == scanTemplate && len(stack) == 0 {
   382              // If we're scanning a bare template then any straggling
   383              // top-level stuff is actually literal string, rather than
   384              // invalid. This handles the case where the template ends
   385              // with a single "$" or "%", which trips us up because we
   386              // want to see another character to decide if it's a sequence
   387              // or an escape.
   388              f.emitToken(TokenStringLit, ts, len(data))
   389          } else {
   390              f.emitToken(TokenInvalid, ts, len(data))
   391          }
   392      }
   393  
   394      // We always emit a synthetic EOF token at the end, since it gives the
   395      // parser position information for an "unexpected EOF" diagnostic.
            // The EOF token is zero-length and sits at the end of the input.
   396      f.emitToken(TokenEOF, len(data), len(data))
   397  
   398      return f.Tokens
   399  }