github.com/hashicorp/hcl/v2@v2.20.0/hclwrite/parser.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package hclwrite 5 6 import ( 7 "fmt" 8 "sort" 9 10 "github.com/hashicorp/hcl/v2" 11 "github.com/hashicorp/hcl/v2/hclsyntax" 12 "github.com/zclconf/go-cty/cty" 13 ) 14 15 // Our "parser" here is actually not doing any parsing of its own. Instead, 16 // it leans on the native parser in hclsyntax, and then uses the source ranges 17 // from the AST to partition the raw token sequence to match the raw tokens 18 // up to AST nodes. 19 // 20 // This strategy feels somewhat counter-intuitive, since most of the work the 21 // parser does is thrown away here, but this strategy is chosen because the 22 // normal parsing work done by hclsyntax is considered to be the "main case", 23 // while modifying and re-printing source is more of an edge case, used only 24 // in ancillary tools, and so it's good to keep all the main parsing logic 25 // with the main case but keep all of the extra complexity of token wrangling 26 // out of the main parser, which is already rather complex just serving the 27 // use-cases it already serves. 28 // 29 // If the parsing step produces any errors, the returned File is nil because 30 // we can't reliably extract tokens from the partial AST produced by an 31 // erroneous parse. 32 func parse(src []byte, filename string, start hcl.Pos) (*File, hcl.Diagnostics) { 33 file, diags := hclsyntax.ParseConfig(src, filename, start) 34 if diags.HasErrors() { 35 return nil, diags 36 } 37 38 // To do our work here, we use the "native" tokens (those from hclsyntax) 39 // to match against source ranges in the AST, but ultimately produce 40 // slices from our sequence of "writer" tokens, which contain only 41 // *relative* position information that is more appropriate for 42 // transformation/writing use-cases. 43 nativeTokens, diags := hclsyntax.LexConfig(src, filename, start) 44 if diags.HasErrors() { 45 // should never happen, since we would've caught these diags in 46 // the first call above. 47 return nil, diags 48 } 49 writerTokens := writerTokens(nativeTokens) 50 51 from := inputTokens{ 52 nativeTokens: nativeTokens, 53 writerTokens: writerTokens, 54 } 55 56 before, root, after := parseBody(file.Body.(*hclsyntax.Body), from) 57 ret := &File{ 58 inTree: newInTree(), 59 60 srcBytes: src, 61 body: root, 62 } 63 64 nodes := ret.inTree.children 65 nodes.Append(before.Tokens()) 66 nodes.AppendNode(root) 67 nodes.Append(after.Tokens()) 68 69 return ret, diags 70 } 71 72 type inputTokens struct { 73 nativeTokens hclsyntax.Tokens 74 writerTokens Tokens 75 } 76 77 func (it inputTokens) Partition(rng hcl.Range) (before, within, after inputTokens) { 78 start, end := partitionTokens(it.nativeTokens, rng) 79 before = it.Slice(0, start) 80 within = it.Slice(start, end) 81 after = it.Slice(end, len(it.nativeTokens)) 82 return 83 } 84 85 func (it inputTokens) PartitionType(ty hclsyntax.TokenType) (before, within, after inputTokens) { 86 for i, t := range it.writerTokens { 87 if t.Type == ty { 88 return it.Slice(0, i), it.Slice(i, i+1), it.Slice(i+1, len(it.nativeTokens)) 89 } 90 } 91 panic(fmt.Sprintf("didn't find any token of type %s", ty)) 92 } 93 94 func (it inputTokens) PartitionTypeOk(ty hclsyntax.TokenType) (before, within, after inputTokens, ok bool) { 95 for i, t := range it.writerTokens { 96 if t.Type == ty { 97 return it.Slice(0, i), it.Slice(i, i+1), it.Slice(i+1, len(it.nativeTokens)), true 98 } 99 } 100 101 return inputTokens{}, inputTokens{}, inputTokens{}, false 102 } 103 104 func (it inputTokens) PartitionTypeSingle(ty hclsyntax.TokenType) (before inputTokens, found *Token, after inputTokens) { 105 before, within, after := it.PartitionType(ty) 106 if within.Len() != 1 { 107 panic("PartitionType found more than one token") 108 } 109 return before, within.Tokens()[0], after 110 } 111 112 // PartitionIncludeComments is like Partition except the returned "within" 113 // range includes any lead and line comments associated with the range. 114 func (it inputTokens) PartitionIncludingComments(rng hcl.Range) (before, within, after inputTokens) { 115 start, end := partitionTokens(it.nativeTokens, rng) 116 start = partitionLeadCommentTokens(it.nativeTokens[:start]) 117 _, afterNewline := partitionLineEndTokens(it.nativeTokens[end:]) 118 end += afterNewline 119 120 before = it.Slice(0, start) 121 within = it.Slice(start, end) 122 after = it.Slice(end, len(it.nativeTokens)) 123 return 124 125 } 126 127 // PartitionBlockItem is similar to PartitionIncludeComments but it returns 128 // the comments as separate token sequences so that they can be captured into 129 // AST attributes. It makes assumptions that apply only to block items, so 130 // should not be used for other constructs. 131 func (it inputTokens) PartitionBlockItem(rng hcl.Range) (before, leadComments, within, lineComments, newline, after inputTokens) { 132 before, within, after = it.Partition(rng) 133 before, leadComments = before.PartitionLeadComments() 134 lineComments, newline, after = after.PartitionLineEndTokens() 135 return 136 } 137 138 func (it inputTokens) PartitionLeadComments() (before, within inputTokens) { 139 start := partitionLeadCommentTokens(it.nativeTokens) 140 before = it.Slice(0, start) 141 within = it.Slice(start, len(it.nativeTokens)) 142 return 143 } 144 145 func (it inputTokens) PartitionLineEndTokens() (comments, newline, after inputTokens) { 146 afterComments, afterNewline := partitionLineEndTokens(it.nativeTokens) 147 comments = it.Slice(0, afterComments) 148 newline = it.Slice(afterComments, afterNewline) 149 after = it.Slice(afterNewline, len(it.nativeTokens)) 150 return 151 } 152 153 func (it inputTokens) Slice(start, end int) inputTokens { 154 // When we slice, we create a new slice with no additional capacity because 155 // we expect that these slices will be mutated in order to insert 156 // new code into the AST, and we want to ensure that a new underlying 157 // array gets allocated in that case, rather than writing into some 158 // following slice and corrupting it. 159 return inputTokens{ 160 nativeTokens: it.nativeTokens[start:end:end], 161 writerTokens: it.writerTokens[start:end:end], 162 } 163 } 164 165 func (it inputTokens) Len() int { 166 return len(it.nativeTokens) 167 } 168 169 func (it inputTokens) Tokens() Tokens { 170 return it.writerTokens 171 } 172 173 func (it inputTokens) Types() []hclsyntax.TokenType { 174 ret := make([]hclsyntax.TokenType, len(it.nativeTokens)) 175 for i, tok := range it.nativeTokens { 176 ret[i] = tok.Type 177 } 178 return ret 179 } 180 181 // parseBody locates the given body within the given input tokens and returns 182 // the resulting *Body object as well as the tokens that appeared before and 183 // after it. 184 func parseBody(nativeBody *hclsyntax.Body, from inputTokens) (inputTokens, *node, inputTokens) { 185 before, within, after := from.PartitionIncludingComments(nativeBody.SrcRange) 186 187 // The main AST doesn't retain the original source ordering of the 188 // body items, so we need to reconstruct that ordering by inspecting 189 // their source ranges. 190 nativeItems := make([]hclsyntax.Node, 0, len(nativeBody.Attributes)+len(nativeBody.Blocks)) 191 for _, nativeAttr := range nativeBody.Attributes { 192 nativeItems = append(nativeItems, nativeAttr) 193 } 194 for _, nativeBlock := range nativeBody.Blocks { 195 nativeItems = append(nativeItems, nativeBlock) 196 } 197 sort.Sort(nativeNodeSorter{nativeItems}) 198 199 body := &Body{ 200 inTree: newInTree(), 201 items: newNodeSet(), 202 } 203 204 remain := within 205 for _, nativeItem := range nativeItems { 206 beforeItem, item, afterItem := parseBodyItem(nativeItem, remain) 207 208 if beforeItem.Len() > 0 { 209 body.AppendUnstructuredTokens(beforeItem.Tokens()) 210 } 211 body.appendItemNode(item) 212 213 remain = afterItem 214 } 215 216 if remain.Len() > 0 { 217 body.AppendUnstructuredTokens(remain.Tokens()) 218 } 219 220 return before, newNode(body), after 221 } 222 223 func parseBodyItem(nativeItem hclsyntax.Node, from inputTokens) (inputTokens, *node, inputTokens) { 224 before, leadComments, within, lineComments, newline, after := from.PartitionBlockItem(nativeItem.Range()) 225 226 var item *node 227 228 switch tItem := nativeItem.(type) { 229 case *hclsyntax.Attribute: 230 item = parseAttribute(tItem, within, leadComments, lineComments, newline) 231 case *hclsyntax.Block: 232 item = parseBlock(tItem, within, leadComments, lineComments, newline) 233 default: 234 // should never happen if caller is behaving 235 panic("unsupported native item type") 236 } 237 238 return before, item, after 239 } 240 241 func parseAttribute(nativeAttr *hclsyntax.Attribute, from, leadComments, lineComments, newline inputTokens) *node { 242 attr := &Attribute{ 243 inTree: newInTree(), 244 } 245 children := attr.inTree.children 246 247 { 248 cn := newNode(newComments(leadComments.Tokens())) 249 attr.leadComments = cn 250 children.AppendNode(cn) 251 } 252 253 before, nameTokens, from := from.Partition(nativeAttr.NameRange) 254 { 255 children.AppendUnstructuredTokens(before.Tokens()) 256 if nameTokens.Len() != 1 { 257 // Should never happen with valid input 258 panic("attribute name is not exactly one token") 259 } 260 token := nameTokens.Tokens()[0] 261 in := newNode(newIdentifier(token)) 262 attr.name = in 263 children.AppendNode(in) 264 } 265 266 before, equalsTokens, from := from.Partition(nativeAttr.EqualsRange) 267 children.AppendUnstructuredTokens(before.Tokens()) 268 children.AppendUnstructuredTokens(equalsTokens.Tokens()) 269 270 before, exprTokens, from := from.Partition(nativeAttr.Expr.Range()) 271 { 272 children.AppendUnstructuredTokens(before.Tokens()) 273 exprNode := parseExpression(nativeAttr.Expr, exprTokens) 274 attr.expr = exprNode 275 children.AppendNode(exprNode) 276 } 277 278 { 279 cn := newNode(newComments(lineComments.Tokens())) 280 attr.lineComments = cn 281 children.AppendNode(cn) 282 } 283 284 children.AppendUnstructuredTokens(newline.Tokens()) 285 286 // Collect any stragglers, though there shouldn't be any 287 children.AppendUnstructuredTokens(from.Tokens()) 288 289 return newNode(attr) 290 } 291 292 func parseBlock(nativeBlock *hclsyntax.Block, from, leadComments, lineComments, newline inputTokens) *node { 293 block := &Block{ 294 inTree: newInTree(), 295 } 296 children := block.inTree.children 297 298 { 299 cn := newNode(newComments(leadComments.Tokens())) 300 block.leadComments = cn 301 children.AppendNode(cn) 302 } 303 304 before, typeTokens, from := from.Partition(nativeBlock.TypeRange) 305 { 306 children.AppendUnstructuredTokens(before.Tokens()) 307 if typeTokens.Len() != 1 { 308 // Should never happen with valid input 309 panic("block type name is not exactly one token") 310 } 311 token := typeTokens.Tokens()[0] 312 in := newNode(newIdentifier(token)) 313 block.typeName = in 314 children.AppendNode(in) 315 } 316 317 before, labelsNode, from := parseBlockLabels(nativeBlock, from) 318 block.labels = labelsNode 319 children.AppendNode(labelsNode) 320 321 before, oBrace, from := from.Partition(nativeBlock.OpenBraceRange) 322 children.AppendUnstructuredTokens(before.Tokens()) 323 block.open = children.AppendUnstructuredTokens(oBrace.Tokens()) 324 325 // We go a bit out of order here: we go hunting for the closing brace 326 // so that we have a delimited body, but then we'll deal with the body 327 // before we actually append the closing brace and any straggling tokens 328 // that appear after it. 329 bodyTokens, cBrace, from := from.Partition(nativeBlock.CloseBraceRange) 330 before, body, after := parseBody(nativeBlock.Body, bodyTokens) 331 children.AppendUnstructuredTokens(before.Tokens()) 332 block.body = body 333 children.AppendNode(body) 334 children.AppendUnstructuredTokens(after.Tokens()) 335 336 block.close = children.AppendUnstructuredTokens(cBrace.Tokens()) 337 338 // stragglers 339 children.AppendUnstructuredTokens(from.Tokens()) 340 if lineComments.Len() > 0 { 341 // blocks don't actually have line comments, so we'll just treat 342 // them as extra stragglers 343 children.AppendUnstructuredTokens(lineComments.Tokens()) 344 } 345 children.AppendUnstructuredTokens(newline.Tokens()) 346 347 return newNode(block) 348 } 349 350 func parseBlockLabels(nativeBlock *hclsyntax.Block, from inputTokens) (inputTokens, *node, inputTokens) { 351 labelsObj := newBlockLabels(nil) 352 children := labelsObj.children 353 354 var beforeAll inputTokens 355 for i, rng := range nativeBlock.LabelRanges { 356 var before, labelTokens inputTokens 357 before, labelTokens, from = from.Partition(rng) 358 if i == 0 { 359 beforeAll = before 360 } else { 361 children.AppendUnstructuredTokens(before.Tokens()) 362 } 363 tokens := labelTokens.Tokens() 364 var ln *node 365 if len(tokens) == 1 && tokens[0].Type == hclsyntax.TokenIdent { 366 ln = newNode(newIdentifier(tokens[0])) 367 } else { 368 ln = newNode(newQuoted(tokens)) 369 } 370 labelsObj.items.Add(ln) 371 children.AppendNode(ln) 372 } 373 374 after := from 375 return beforeAll, newNode(labelsObj), after 376 } 377 378 func parseExpression(nativeExpr hclsyntax.Expression, from inputTokens) *node { 379 expr := newExpression() 380 children := expr.inTree.children 381 382 nativeVars := nativeExpr.Variables() 383 384 for _, nativeTraversal := range nativeVars { 385 before, traversal, after := parseTraversal(nativeTraversal, from) 386 children.AppendUnstructuredTokens(before.Tokens()) 387 children.AppendNode(traversal) 388 expr.absTraversals.Add(traversal) 389 from = after 390 } 391 // Attach any stragglers that don't belong to a traversal to the expression 392 // itself. In an expression with no traversals at all, this is just the 393 // entirety of "from". 394 children.AppendUnstructuredTokens(from.Tokens()) 395 396 return newNode(expr) 397 } 398 399 func parseTraversal(nativeTraversal hcl.Traversal, from inputTokens) (before inputTokens, n *node, after inputTokens) { 400 traversal := newTraversal() 401 children := traversal.inTree.children 402 before, from, after = from.Partition(nativeTraversal.SourceRange()) 403 404 stepAfter := from 405 for _, nativeStep := range nativeTraversal { 406 before, step, after := parseTraversalStep(nativeStep, stepAfter) 407 children.AppendUnstructuredTokens(before.Tokens()) 408 children.AppendNode(step) 409 traversal.steps.Add(step) 410 stepAfter = after 411 } 412 413 return before, newNode(traversal), after 414 } 415 416 func parseTraversalStep(nativeStep hcl.Traverser, from inputTokens) (before inputTokens, n *node, after inputTokens) { 417 var children *nodes 418 switch tNativeStep := nativeStep.(type) { 419 420 case hcl.TraverseRoot, hcl.TraverseAttr: 421 step := newTraverseName() 422 children = step.inTree.children 423 before, from, after = from.Partition(nativeStep.SourceRange()) 424 inBefore, token, inAfter := from.PartitionTypeSingle(hclsyntax.TokenIdent) 425 name := newIdentifier(token) 426 children.AppendUnstructuredTokens(inBefore.Tokens()) 427 step.name = children.Append(name) 428 children.AppendUnstructuredTokens(inAfter.Tokens()) 429 return before, newNode(step), after 430 431 case hcl.TraverseIndex: 432 step := newTraverseIndex() 433 children = step.inTree.children 434 before, from, after = from.Partition(nativeStep.SourceRange()) 435 436 if inBefore, dot, from, ok := from.PartitionTypeOk(hclsyntax.TokenDot); ok { 437 children.AppendUnstructuredTokens(inBefore.Tokens()) 438 children.AppendUnstructuredTokens(dot.Tokens()) 439 440 valBefore, valToken, valAfter := from.PartitionTypeSingle(hclsyntax.TokenNumberLit) 441 children.AppendUnstructuredTokens(valBefore.Tokens()) 442 key := newNumber(valToken) 443 step.key = children.Append(key) 444 children.AppendUnstructuredTokens(valAfter.Tokens()) 445 446 return before, newNode(step), after 447 } 448 449 var inBefore, oBrack, keyTokens, cBrack inputTokens 450 inBefore, oBrack, from = from.PartitionType(hclsyntax.TokenOBrack) 451 children.AppendUnstructuredTokens(inBefore.Tokens()) 452 children.AppendUnstructuredTokens(oBrack.Tokens()) 453 keyTokens, cBrack, from = from.PartitionType(hclsyntax.TokenCBrack) 454 455 keyVal := tNativeStep.Key 456 switch keyVal.Type() { 457 case cty.String: 458 key := newQuoted(keyTokens.Tokens()) 459 step.key = children.Append(key) 460 case cty.Number: 461 valBefore, valToken, valAfter := keyTokens.PartitionTypeSingle(hclsyntax.TokenNumberLit) 462 children.AppendUnstructuredTokens(valBefore.Tokens()) 463 key := newNumber(valToken) 464 step.key = children.Append(key) 465 children.AppendUnstructuredTokens(valAfter.Tokens()) 466 } 467 468 children.AppendUnstructuredTokens(cBrack.Tokens()) 469 children.AppendUnstructuredTokens(from.Tokens()) 470 471 return before, newNode(step), after 472 default: 473 panic(fmt.Sprintf("unsupported traversal step type %T", nativeStep)) 474 } 475 476 } 477 478 // writerTokens takes a sequence of tokens as produced by the main hclsyntax 479 // package and transforms it into an equivalent sequence of tokens using 480 // this package's own token model. 481 // 482 // The resulting list contains the same number of tokens and uses the same 483 // indices as the input, allowing the two sets of tokens to be correlated 484 // by index. 485 func writerTokens(nativeTokens hclsyntax.Tokens) Tokens { 486 // Ultimately we want a slice of token _pointers_, but since we can 487 // predict how much memory we're going to devote to tokens we'll allocate 488 // it all as a single flat buffer and thus give the GC less work to do. 489 tokBuf := make([]Token, len(nativeTokens)) 490 var lastByteOffset int 491 for i, mainToken := range nativeTokens { 492 // Create a copy of the bytes so that we can mutate without 493 // corrupting the original token stream. 494 bytes := make([]byte, len(mainToken.Bytes)) 495 copy(bytes, mainToken.Bytes) 496 497 tokBuf[i] = Token{ 498 Type: mainToken.Type, 499 Bytes: bytes, 500 501 // We assume here that spaces are always ASCII spaces, since 502 // that's what the scanner also assumes, and thus the number 503 // of bytes skipped is also the number of space characters. 504 SpacesBefore: mainToken.Range.Start.Byte - lastByteOffset, 505 } 506 507 lastByteOffset = mainToken.Range.End.Byte 508 } 509 510 // Now make a slice of pointers into the previous slice. 511 ret := make(Tokens, len(tokBuf)) 512 for i := range ret { 513 ret[i] = &tokBuf[i] 514 } 515 516 return ret 517 } 518 519 // partitionTokens takes a sequence of tokens and a hcl.Range and returns 520 // two indices within the token sequence that correspond with the range 521 // boundaries, such that the slice operator could be used to produce 522 // three token sequences for before, within, and after respectively: 523 // 524 // start, end := partitionTokens(toks, rng) 525 // before := toks[:start] 526 // within := toks[start:end] 527 // after := toks[end:] 528 // 529 // This works best when the range is aligned with token boundaries (e.g. 530 // because it was produced in terms of the scanner's result) but if that isn't 531 // true then it will make a best effort that may produce strange results at 532 // the boundaries. 533 // 534 // Native hclsyntax tokens are used here, because they contain the necessary 535 // absolute position information. However, since writerTokens produces a 536 // correlatable sequence of writer tokens, the resulting indices can be 537 // used also to index into its result, allowing the partitioning of writer 538 // tokens to be driven by the partitioning of native tokens. 539 // 540 // The tokens are assumed to be in source order and non-overlapping, which 541 // will be true if the token sequence from the scanner is used directly. 542 func partitionTokens(toks hclsyntax.Tokens, rng hcl.Range) (start, end int) { 543 // We use a linear search here because we assume that in most cases our 544 // target range is close to the beginning of the sequence, and the sequences 545 // are generally small for most reasonable files anyway. 546 for i := 0; ; i++ { 547 if i >= len(toks) { 548 // No tokens for the given range at all! 549 return len(toks), len(toks) 550 } 551 552 if toks[i].Range.Start.Byte >= rng.Start.Byte { 553 start = i 554 break 555 } 556 } 557 558 for i := start; ; i++ { 559 if i >= len(toks) { 560 // The range "hangs off" the end of the token sequence 561 return start, len(toks) 562 } 563 564 if toks[i].Range.Start.Byte >= rng.End.Byte { 565 end = i // end marker is exclusive 566 break 567 } 568 } 569 570 return start, end 571 } 572 573 // partitionLeadCommentTokens takes a sequence of tokens that is assumed 574 // to immediately precede a construct that can have lead comment tokens, 575 // and returns the index into that sequence where the lead comments begin. 576 // 577 // Lead comments are defined as whole lines containing only comment tokens 578 // with no blank lines between. If no such lines are found, the returned 579 // index will be len(toks). 580 func partitionLeadCommentTokens(toks hclsyntax.Tokens) int { 581 // single-line comments (which is what we're interested in here) 582 // consume their trailing newline, so we can just walk backwards 583 // until we stop seeing comment tokens. 584 for i := len(toks) - 1; i >= 0; i-- { 585 if toks[i].Type != hclsyntax.TokenComment { 586 return i + 1 587 } 588 } 589 return 0 590 } 591 592 // partitionLineEndTokens takes a sequence of tokens that is assumed 593 // to immediately follow a construct that can have a line comment, and 594 // returns first the index where any line comments end and then second 595 // the index immediately after the trailing newline. 596 // 597 // Line comments are defined as comments that appear immediately after 598 // a construct on the same line where its significant tokens ended. 599 // 600 // Since single-line comment tokens (# and //) include the newline that 601 // terminates them, in the presence of these the two returned indices 602 // will be the same since the comment itself serves as the line end. 603 func partitionLineEndTokens(toks hclsyntax.Tokens) (afterComment, afterNewline int) { 604 for i := 0; i < len(toks); i++ { 605 tok := toks[i] 606 if tok.Type != hclsyntax.TokenComment { 607 switch tok.Type { 608 case hclsyntax.TokenNewline: 609 return i, i + 1 610 case hclsyntax.TokenEOF: 611 // Although this is valid, we mustn't include the EOF 612 // itself as our "newline" or else strange things will 613 // happen when we try to append new items. 614 return i, i 615 default: 616 // If we have well-formed input here then nothing else should be 617 // possible. This path should never happen, because we only try 618 // to extract tokens from the sequence if the parser succeeded, 619 // and it should catch this problem itself. 620 panic("malformed line trailers: expected only comments and newlines") 621 } 622 } 623 624 if len(tok.Bytes) > 0 && tok.Bytes[len(tok.Bytes)-1] == '\n' { 625 // Newline at the end of a single-line comment serves both as 626 // the end of comments *and* the end of the line. 627 return i + 1, i + 1 628 } 629 } 630 return len(toks), len(toks) 631 } 632 633 // lexConfig uses the hclsyntax scanner to get a token stream and then 634 // rewrites it into this package's token model. 635 // 636 // Any errors produced during scanning are ignored, so the results of this 637 // function should be used with care. 638 func lexConfig(src []byte) Tokens { 639 mainTokens, _ := hclsyntax.LexConfig(src, "", hcl.Pos{Byte: 0, Line: 1, Column: 1}) 640 return writerTokens(mainTokens) 641 }