github.com/crossplane/upjet@v1.3.0/pkg/registry/meta.go (about) 1 // SPDX-FileCopyrightText: 2023 The Crossplane Authors <https://crossplane.io> 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 5 package registry 6 7 import ( 8 "bytes" 9 "encoding/json" 10 "fmt" 11 "io/fs" 12 "os" 13 "path/filepath" 14 "regexp" 15 "sort" 16 "strings" 17 18 "github.com/antchfx/htmlquery" 19 "github.com/hashicorp/hcl/v2" 20 "github.com/hashicorp/hcl/v2/hclparse" 21 "github.com/hashicorp/hcl/v2/hclsyntax" 22 "github.com/pkg/errors" 23 "github.com/tmccombs/hcl2json/convert" 24 "github.com/yuin/goldmark" 25 "golang.org/x/net/html" 26 "gopkg.in/yaml.v3" 27 ) 28 29 const ( 30 blockResource = "resource" 31 keySubCategory = "subcategory" 32 keyDescription = "description" 33 keyPageTitle = "page_title" 34 ) 35 36 var ( 37 regexConfigurationBlock = regexp.MustCompile(`block.*(support)?`) 38 regexHeaderNode = regexp.MustCompile(`h\d`) 39 ) 40 41 // NewProviderMetadata initializes a new ProviderMetadata for 42 // extracting metadata from the Terraform registry. 43 func NewProviderMetadata(name string) *ProviderMetadata { 44 return &ProviderMetadata{ 45 Name: name, 46 Resources: make(map[string]*Resource), 47 } 48 } 49 50 func (r *Resource) addExampleManifest(file *hcl.File, body *hclsyntax.Block) error { 51 refs, err := r.findReferences("", file, body) 52 if err != nil { 53 return err 54 } 55 r.Examples = append(r.Examples, ResourceExample{ 56 Name: body.Labels[1], 57 References: refs, 58 }) 59 return nil 60 } 61 62 func getResourceNameFromPath(path, resourcePrefix string) string { 63 tokens := strings.Split(filepath.Base(path), ".") 64 if len(tokens) < 2 { 65 return "" 66 } 67 prefix := "" 68 if len(resourcePrefix) != 0 { 69 prefix = resourcePrefix + "_" 70 } 71 return fmt.Sprintf("%s%s", prefix, tokens[0]) 72 } 73 74 func (r *Resource) scrapeExamples(doc *html.Node, codeElXPath string, path string, resourcePrefix string, debug bool) error { //nolint: gocyclo 75 resourceName := r.Title 76 nodes := htmlquery.Find(doc, codeElXPath) 77 for _, n := range nodes { 78 parser := hclparse.NewParser() 79 f, diag := parser.ParseHCL([]byte(n.Data), "example.hcl") 80 if debug && diag != nil && diag.HasErrors() { 81 fmt.Println(errors.Wrapf(diag, "failed to parse example Terraform configuration for %q: Configuration:\n%s", resourceName, n.Data)) 82 } 83 if f == nil { 84 continue 85 } 86 body, ok := f.Body.(*hclsyntax.Body) 87 if !ok { 88 return errors.Errorf("not an HCL Body: %s", n.Data) 89 } 90 trimmed := make(hclsyntax.Blocks, 0, len(body.Blocks)) 91 for _, b := range body.Blocks { 92 if b.Type == blockResource { 93 trimmed = append(trimmed, b) 94 } 95 } 96 body.Blocks = trimmed 97 // first try an exact match to find the example 98 if len(resourceName) == 0 { 99 resourceName = getResourceNameFromPath(path, resourcePrefix) 100 } 101 if err := r.findExampleBlock(f, body.Blocks, &resourceName, true); err != nil { 102 return err 103 } 104 r.Name = resourceName 105 } 106 107 if r.Name == "" { 108 r.Name = resourceName 109 } 110 return nil 111 } 112 113 func (r *Resource) findReferences(parentPath string, file *hcl.File, b *hclsyntax.Block) (map[string]string, error) { //nolint: gocyclo 114 refs := make(map[string]string) 115 if parentPath == "" && b.Labels[0] != r.Name { 116 return refs, nil 117 } 118 for name, attr := range b.Body.Attributes { 119 e, ok := attr.Expr.(*hclsyntax.ScopeTraversalExpr) 120 if !ok { 121 continue 122 } 123 refName := name 124 if parentPath != "" { 125 refName = fmt.Sprintf("%s.%s", parentPath, refName) 126 } 127 ref := string(file.Bytes[e.Range().Start.Byte:e.Range().End.Byte]) 128 if v, ok := refs[refName]; ok && v != ref { 129 return nil, errors.Errorf("attribute %s.%s refers to %s. New reference: %s", r.Name, refName, v, ref) 130 } 131 refs[refName] = ref 132 } 133 for _, nestedBlock := range b.Body.Blocks { 134 path := nestedBlock.Type 135 if parentPath != "" { 136 path = fmt.Sprintf("%s.%s", parentPath, path) 137 } 138 nestedRefs, err := r.findReferences(path, file, nestedBlock) 139 if err != nil { 140 return nil, errors.Wrapf(err, "cannot find references in nested block: %s", path) 141 } 142 for k, v := range nestedRefs { 143 refs[k] = v 144 } 145 } 146 return refs, nil 147 } 148 149 func suffixMatch(label, resourceName string, limit int) bool { 150 suffixParts := strings.Split(resourceName, "_") 151 for i := 0; i < len(suffixParts) && (limit == -1 || i <= limit); i++ { 152 s := strings.Join(suffixParts[i:], "_") 153 if strings.Contains(label, s) { 154 return true 155 } 156 } 157 return false 158 } 159 160 func convertManifest2JSON(file *hcl.File, b *hclsyntax.Block) (string, error) { 161 buff, err := convert.File(&hcl.File{ 162 Body: b.Body, 163 Bytes: file.Bytes, 164 }, convert.Options{}) 165 if err != nil { 166 return "", errors.Wrap(err, "failed to format as JSON") 167 } 168 out := bytes.Buffer{} 169 err = json.Indent(&out, buff, "", " ") 170 if err != nil { 171 return "", errors.Wrap(err, "unable to format JSON example manifest") 172 } 173 return out.String(), nil 174 } 175 176 func (r *Resource) findExampleBlock(file *hcl.File, blocks hclsyntax.Blocks, resourceName *string, exactMatch bool) error { //nolint:gocyclo 177 dependencies := make(map[string]string) 178 for _, b := range blocks { 179 depKey := fmt.Sprintf("%s.%s", b.Labels[0], b.Labels[1]) 180 m, err := convertManifest2JSON(file, b) 181 if err != nil { 182 return errors.Wrap(err, "failed to convert example manifest to JSON") 183 } 184 if b.Labels[0] != *resourceName { 185 if exactMatch { 186 dependencies[depKey] = m 187 continue 188 } 189 190 if suffixMatch(b.Labels[0], *resourceName, 1) || (strings.Contains(*resourceName, b.Labels[0]) && strings.Count(*resourceName, "_") == strings.Count(b.Labels[0], "_")) { 191 *resourceName = b.Labels[0] 192 exactMatch = true 193 } else { 194 dependencies[depKey] = m 195 continue 196 } 197 } 198 r.Name = *resourceName 199 err = r.addExampleManifest(file, b) 200 r.Examples[len(r.Examples)-1].Manifest = m 201 r.Examples[len(r.Examples)-1].Dependencies = dependencies 202 if err != nil { 203 return errors.Wrap(err, "failed to add example manifest to resource") 204 } 205 } 206 207 if len(r.Examples) == 0 && exactMatch { 208 return r.findExampleBlock(file, blocks, resourceName, false) 209 } 210 return nil 211 } 212 213 func (r *Resource) scrapePrelude(doc *html.Node, preludeXPath string) error { 214 // parse prelude 215 nodes := htmlquery.Find(doc, preludeXPath) 216 if len(nodes) == 0 { 217 return errors.Errorf("failed to find the prelude of the document using the xpath expressions: %s", preludeXPath) 218 } 219 220 n := nodes[0] 221 lines := strings.Split(n.Data, "\n") 222 descIndex := -1 223 for i, l := range lines { 224 kv := strings.Split(l, ":") 225 if len(kv) < 2 { 226 continue 227 } 228 switch kv[0] { 229 case keyPageTitle: 230 r.Title = strings.TrimSpace(strings.ReplaceAll(kv[len(kv)-1], `"`, "")) 231 232 case keyDescription: 233 r.Description = kv[1] 234 descIndex = i 235 236 case keySubCategory: 237 r.SubCategory = strings.TrimSpace(strings.ReplaceAll(kv[1], `"`, "")) 238 } 239 } 240 241 if descIndex > -1 { 242 r.Description += strings.Join(lines[descIndex+1:], " ") 243 } 244 r.Description = strings.TrimSpace(strings.Replace(r.Description, "|-", "", 1)) 245 246 return nil 247 } 248 249 func (r *Resource) scrapeFieldDocs(doc *html.Node, fieldXPath string) { 250 processed := make(map[*html.Node]struct{}) 251 codeNodes := htmlquery.Find(doc, fieldXPath) 252 for _, n := range codeNodes { 253 attrName := "" 254 docStr := r.scrapeDocString(n, &attrName, processed) 255 if docStr == "" { 256 continue 257 } 258 if r.ArgumentDocs == nil { 259 r.ArgumentDocs = make(map[string]string) 260 } 261 if r.ArgumentDocs[attrName] != "" && r.ArgumentDocs[attrName] != strings.TrimSpace(docStr) { 262 continue 263 } 264 r.ArgumentDocs[attrName] = strings.TrimSpace(docStr) 265 } 266 } 267 268 // getRootPath extracts the root attribute name for the specified HTML node n, 269 // from the preceding paragraph or header HTML nodes. 270 func (r *Resource) getRootPath(n *html.Node) string { 271 var ulNode, pNode *html.Node 272 for ulNode = n.Parent; ulNode != nil && ulNode.Data != "ul"; ulNode = ulNode.Parent { 273 } 274 if ulNode == nil { 275 return "" 276 } 277 for pNode = ulNode.PrevSibling; pNode != nil && (pNode.Data != "p" || !regexConfigurationBlock.MatchString(strings.ToLower(extractText(pNode)))); pNode = pNode.PrevSibling { 278 // if it's an HTML header node 279 if regexHeaderNode.MatchString(pNode.Data) { 280 return r.extractRootFromHeader(pNode) 281 } 282 } 283 if pNode == nil { 284 return "" 285 } 286 return r.extractRootFromParagraph(pNode) 287 } 288 289 // extractRootFromHeader extracts the root Terraform attribute name 290 // from the children of the specified header HTML node. 291 func (r *Resource) extractRootFromHeader(pNode *html.Node) string { 292 headerText := extractText(pNode) 293 if _, ok := r.ArgumentDocs[headerText]; ok { 294 return headerText 295 } 296 sortedKeys := make([]string, 0, len(r.ArgumentDocs)) 297 for k := range r.ArgumentDocs { 298 sortedKeys = append(sortedKeys, k) 299 } 300 sort.Strings(sortedKeys) 301 for _, k := range sortedKeys { 302 parts := strings.Split(k, ".") 303 if headerText == parts[len(parts)-1] { 304 return k 305 } 306 } 307 // try to convert header text to a hierarchical attribute name. 308 // For certain headers, the header text is attribute's relative (partial) 309 // hierarchical name separated with spaces. 310 if _, ok := r.ArgumentDocs[strings.ReplaceAll(headerText, " ", ".")]; ok { 311 return strings.ReplaceAll(headerText, " ", ".") 312 } 313 if regexConfigurationBlock.MatchString(strings.ToLower(extractText(pNode))) { 314 for _, s := range strings.Split(headerText, " ") { 315 if _, ok := r.ArgumentDocs[s]; ok { 316 return s 317 } 318 } 319 } 320 return "" 321 } 322 323 // extractRootFromParagraph extracts the root Terraform attribute name 324 // from the children of the specified paragraph HTML node. 325 func (r *Resource) extractRootFromParagraph(pNode *html.Node) string { 326 var codeNode *html.Node 327 for codeNode = pNode.FirstChild; codeNode != nil && codeNode.Data != "code"; codeNode = codeNode.NextSibling { 328 // intentionally left empty 329 } 330 if codeNode == nil || codeNode.FirstChild == nil { 331 return "" 332 } 333 prevLiNode := getPrevLiWithCodeText(codeNode.FirstChild.Data, pNode) 334 if prevLiNode == nil { 335 return codeNode.FirstChild.Data 336 } 337 root := r.getRootPath(prevLiNode) 338 if len(root) == 0 { 339 return codeNode.FirstChild.Data 340 } 341 return fmt.Sprintf("%s.%s", root, codeNode.FirstChild.Data) 342 } 343 344 // getPrevLiWithCodeText returns the list item node (in an UL) with 345 // a code child with text `codeText`. 346 func getPrevLiWithCodeText(codeText string, pNode *html.Node) *html.Node { 347 var ulNode, liNode *html.Node 348 for ulNode = pNode.PrevSibling; ulNode != nil && ulNode.Data != "ul"; ulNode = ulNode.PrevSibling { 349 } 350 if ulNode == nil { 351 return nil 352 } 353 for liNode = ulNode.FirstChild; liNode != nil; liNode = liNode.NextSibling { 354 if liNode.Data != "li" || liNode.FirstChild == nil || liNode.FirstChild.Data != "code" || liNode.FirstChild.FirstChild.Data != codeText { 355 continue 356 } 357 return liNode 358 } 359 return nil 360 } 361 362 // extractText extracts text from the children of an element node, 363 // removing any HTML tags and leaving only text data. 364 func extractText(n *html.Node) string { 365 switch n.Type { //nolint:exhaustive 366 case html.TextNode: 367 return n.Data 368 case html.ElementNode: 369 sb := strings.Builder{} 370 for c := n.FirstChild; c != nil; c = c.NextSibling { 371 s := "" 372 if c.Type != html.TextNode { 373 s = extractText(c) 374 } else { 375 s = c.Data 376 } 377 if len(s) != 0 { 378 sb.WriteString(s) 379 } 380 } 381 return sb.String() 382 default: 383 return "" 384 } 385 } 386 387 func (r *Resource) scrapeDocString(n *html.Node, attrName *string, processed map[*html.Node]struct{}) string { 388 if _, ok := processed[n]; ok { 389 return "" 390 } 391 processed[n] = struct{}{} 392 393 if n.Type == html.ElementNode { 394 return r.scrapeDocString(n.FirstChild, attrName, processed) 395 } 396 397 sb := strings.Builder{} 398 if *attrName == "" { 399 *attrName = n.Data 400 if root := r.getRootPath(n); len(root) != 0 { 401 *attrName = fmt.Sprintf("%s.%s", root, *attrName) 402 } 403 } else { 404 sb.WriteString(n.Data) 405 } 406 s := n.Parent 407 for s = s.NextSibling; s != nil; s = s.NextSibling { 408 if _, ok := processed[s]; ok { 409 continue 410 } 411 processed[s] = struct{}{} 412 413 switch s.Type { //nolint:exhaustive 414 case html.TextNode: 415 sb.WriteString(s.Data) 416 case html.ElementNode: 417 if s.FirstChild == nil { 418 continue 419 } 420 sb.WriteString(r.scrapeDocString(s.FirstChild, attrName, processed)) 421 } 422 } 423 return sb.String() 424 } 425 426 func (r *Resource) scrapeImportStatements(doc *html.Node, importXPath string) { 427 nodes := htmlquery.Find(doc, importXPath) 428 for _, n := range nodes { 429 r.ImportStatements = append(r.ImportStatements, strings.TrimSpace(n.Data)) 430 } 431 } 432 433 // scrape scrapes resource metadata from the specified HTML doc. 434 // filename is not always the precise resource name, hence, 435 // it returns the resource name scraped from the doc. 436 func (r *Resource) scrape(path string, config *ScrapeConfiguration) error { 437 source, err := os.ReadFile(filepath.Clean(path)) 438 if err != nil { 439 return errors.Wrap(err, "failed to read markdown file") 440 } 441 442 var buff bytes.Buffer 443 if err := goldmark.Convert(source, &buff); err != nil { 444 return errors.Wrap(err, "failed to convert markdown") 445 } 446 447 doc, err := htmlquery.Parse(&buff) 448 if err != nil { 449 return errors.Wrap(err, "failed to parse HTML") 450 } 451 452 if err := r.scrapePrelude(doc, config.PreludeXPath); err != nil { 453 return err 454 } 455 456 r.scrapeFieldDocs(doc, config.FieldDocXPath) 457 r.scrapeImportStatements(doc, config.ImportXPath) 458 459 return r.scrapeExamples(doc, config.CodeXPath, path, config.ResourcePrefix, config.Debug) 460 } 461 462 // ScrapeConfiguration is a configurator for the scraper 463 type ScrapeConfiguration struct { 464 // Debug Output debug messages 465 Debug bool 466 // RepoPath is the path of the Terraform native provider repo 467 RepoPath string 468 // CodeXPath Code XPath expression 469 CodeXPath string 470 // PreludeXPath Prelude XPath expression 471 PreludeXPath string 472 // FieldDocXPath Field documentation XPath expression 473 FieldDocXPath string 474 // ImportXPath Import statements XPath expression 475 ImportXPath string 476 // FileExtensions extensions of the files to be scraped 477 FileExtensions []string 478 // ResourcePrefix Terraform resource name prefix for the Terraform provider 479 ResourcePrefix string 480 } 481 482 func (sc *ScrapeConfiguration) hasExpectedExtension(fileName string) bool { 483 for _, e := range sc.FileExtensions { 484 if e == filepath.Ext(fileName) { 485 return true 486 } 487 } 488 return false 489 } 490 491 // ScrapeRepo scrape metadata from the configured Terraform native provider repo 492 func (pm *ProviderMetadata) ScrapeRepo(config *ScrapeConfiguration) error { 493 return errors.Wrap(filepath.WalkDir(config.RepoPath, func(path string, d fs.DirEntry, err error) error { 494 if err != nil { 495 return errors.Wrap(err, "failed to traverse Terraform registry") 496 } 497 if d.IsDir() || !config.hasExpectedExtension(d.Name()) { 498 return nil 499 } 500 r := &Resource{} 501 // don't scrape if file is empty 502 filename := filepath.Clean(path) 503 b, err := os.ReadFile(filename) 504 if err != nil { 505 return errors.Wrap(err, "failed to read markdown file") 506 } 507 if len(b) == 1 { 508 fmt.Printf("skipping empty file: %s\n", filename) 509 return nil 510 } 511 if err := r.scrape(path, config); err != nil { 512 return errors.Wrapf(err, "failed to scrape resource metadata from path: %s", path) 513 } 514 515 pm.Resources[r.Name] = r 516 return nil 517 }), "cannot scrape Terraform registry") 518 } 519 520 // Store stores this scraped ProviderMetadata at the specified path 521 func (pm *ProviderMetadata) Store(path string) error { 522 out, err := yaml.Marshal(pm) 523 if err != nil { 524 return errors.Wrap(err, "failed to marshal provider metadata to YAML") 525 } 526 return errors.Wrapf(os.WriteFile(path, out, 0600), "failed to write provider metada file: %s", path) 527 }