github.com/crossplane/upjet@v1.3.0/pkg/registry/meta.go (about)

     1  // SPDX-FileCopyrightText: 2023 The Crossplane Authors <https://crossplane.io>
     2  //
     3  // SPDX-License-Identifier: Apache-2.0
     4  
     5  package registry
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/json"
    10  	"fmt"
    11  	"io/fs"
    12  	"os"
    13  	"path/filepath"
    14  	"regexp"
    15  	"sort"
    16  	"strings"
    17  
    18  	"github.com/antchfx/htmlquery"
    19  	"github.com/hashicorp/hcl/v2"
    20  	"github.com/hashicorp/hcl/v2/hclparse"
    21  	"github.com/hashicorp/hcl/v2/hclsyntax"
    22  	"github.com/pkg/errors"
    23  	"github.com/tmccombs/hcl2json/convert"
    24  	"github.com/yuin/goldmark"
    25  	"golang.org/x/net/html"
    26  	"gopkg.in/yaml.v3"
    27  )
    28  
// Names of the HCL block type and of the document prelude keys that the
// scraper recognizes.
const (
	// blockResource is the HCL block type kept when filtering the blocks
	// of a parsed example; blocks of any other type are dropped.
	blockResource  = "resource"
	// keySubCategory is the prelude key whose value is stored in
	// Resource.SubCategory.
	keySubCategory = "subcategory"
	// keyDescription is the prelude key whose value is stored in
	// Resource.Description.
	keyDescription = "description"
	// keyPageTitle is the prelude key whose value is stored in
	// Resource.Title.
	keyPageTitle   = "page_title"
)
    35  
var (
	// regexConfigurationBlock is matched against the lower-cased text of
	// paragraph and header nodes to detect text that introduces a nested
	// configuration block. The pattern is unanchored and its trailing
	// group is optional, so any text containing "block" matches.
	regexConfigurationBlock = regexp.MustCompile(`block.*(support)?`)
	// regexHeaderNode is matched against a node's Data to detect HTML
	// header elements (h1, h2, ...). The pattern is unanchored, so it
	// matches any Data containing an "h" followed by a digit.
	regexHeaderNode         = regexp.MustCompile(`h\d`)
)
    40  
    41  // NewProviderMetadata initializes a new ProviderMetadata for
    42  // extracting metadata from the Terraform registry.
    43  func NewProviderMetadata(name string) *ProviderMetadata {
    44  	return &ProviderMetadata{
    45  		Name:      name,
    46  		Resources: make(map[string]*Resource),
    47  	}
    48  }
    49  
    50  func (r *Resource) addExampleManifest(file *hcl.File, body *hclsyntax.Block) error {
    51  	refs, err := r.findReferences("", file, body)
    52  	if err != nil {
    53  		return err
    54  	}
    55  	r.Examples = append(r.Examples, ResourceExample{
    56  		Name:       body.Labels[1],
    57  		References: refs,
    58  	})
    59  	return nil
    60  }
    61  
    62  func getResourceNameFromPath(path, resourcePrefix string) string {
    63  	tokens := strings.Split(filepath.Base(path), ".")
    64  	if len(tokens) < 2 {
    65  		return ""
    66  	}
    67  	prefix := ""
    68  	if len(resourcePrefix) != 0 {
    69  		prefix = resourcePrefix + "_"
    70  	}
    71  	return fmt.Sprintf("%s%s", prefix, tokens[0])
    72  }
    73  
// scrapeExamples parses the data of every node selected by codeElXPath
// as an HCL configuration and records the matching resource blocks as
// examples of r via findExampleBlock. The resource name starts as
// r.Title and, if that is empty, is derived from the file name of path
// and resourcePrefix once the first parseable node is reached. The name
// that is finally used is stored in r.Name. When debug is set, HCL parse
// errors are printed to standard output; they never abort scraping.
func (r *Resource) scrapeExamples(doc *html.Node, codeElXPath string, path string, resourcePrefix string, debug bool) error { //nolint: gocyclo
	resourceName := r.Title
	nodes := htmlquery.Find(doc, codeElXPath)
	for _, n := range nodes {
		parser := hclparse.NewParser()
		f, diag := parser.ParseHCL([]byte(n.Data), "example.hcl")
		// parse errors are only reported, and only in debug mode
		if debug && diag != nil && diag.HasErrors() {
			fmt.Println(errors.Wrapf(diag, "failed to parse example Terraform configuration for %q: Configuration:\n%s", resourceName, n.Data))
		}
		// nothing usable was parsed from this node, move on to the next one
		if f == nil {
			continue
		}
		body, ok := f.Body.(*hclsyntax.Body)
		if !ok {
			return errors.Errorf("not an HCL Body: %s", n.Data)
		}
		// keep only the "resource" blocks of the example configuration
		trimmed := make(hclsyntax.Blocks, 0, len(body.Blocks))
		for _, b := range body.Blocks {
			if b.Type == blockResource {
				trimmed = append(trimmed, b)
			}
		}
		body.Blocks = trimmed
		// first try an exact match to find the example;
		// without a title, fall back to a name derived from the file name
		if len(resourceName) == 0 {
			resourceName = getResourceNameFromPath(path, resourcePrefix)
		}
		// findExampleBlock may overwrite resourceName with the label of
		// the block it settles on when no exact match exists
		if err := r.findExampleBlock(f, body.Blocks, &resourceName, true); err != nil {
			return err
		}
		r.Name = resourceName
	}

	// no node set the name (e.g. no node was selected or parseable)
	if r.Name == "" {
		r.Name = resourceName
	}
	return nil
}
   112  
   113  func (r *Resource) findReferences(parentPath string, file *hcl.File, b *hclsyntax.Block) (map[string]string, error) { //nolint: gocyclo
   114  	refs := make(map[string]string)
   115  	if parentPath == "" && b.Labels[0] != r.Name {
   116  		return refs, nil
   117  	}
   118  	for name, attr := range b.Body.Attributes {
   119  		e, ok := attr.Expr.(*hclsyntax.ScopeTraversalExpr)
   120  		if !ok {
   121  			continue
   122  		}
   123  		refName := name
   124  		if parentPath != "" {
   125  			refName = fmt.Sprintf("%s.%s", parentPath, refName)
   126  		}
   127  		ref := string(file.Bytes[e.Range().Start.Byte:e.Range().End.Byte])
   128  		if v, ok := refs[refName]; ok && v != ref {
   129  			return nil, errors.Errorf("attribute %s.%s refers to %s. New reference: %s", r.Name, refName, v, ref)
   130  		}
   131  		refs[refName] = ref
   132  	}
   133  	for _, nestedBlock := range b.Body.Blocks {
   134  		path := nestedBlock.Type
   135  		if parentPath != "" {
   136  			path = fmt.Sprintf("%s.%s", parentPath, path)
   137  		}
   138  		nestedRefs, err := r.findReferences(path, file, nestedBlock)
   139  		if err != nil {
   140  			return nil, errors.Wrapf(err, "cannot find references in nested block: %s", path)
   141  		}
   142  		for k, v := range nestedRefs {
   143  			refs[k] = v
   144  		}
   145  	}
   146  	return refs, nil
   147  }
   148  
   149  func suffixMatch(label, resourceName string, limit int) bool {
   150  	suffixParts := strings.Split(resourceName, "_")
   151  	for i := 0; i < len(suffixParts) && (limit == -1 || i <= limit); i++ {
   152  		s := strings.Join(suffixParts[i:], "_")
   153  		if strings.Contains(label, s) {
   154  			return true
   155  		}
   156  	}
   157  	return false
   158  }
   159  
   160  func convertManifest2JSON(file *hcl.File, b *hclsyntax.Block) (string, error) {
   161  	buff, err := convert.File(&hcl.File{
   162  		Body:  b.Body,
   163  		Bytes: file.Bytes,
   164  	}, convert.Options{})
   165  	if err != nil {
   166  		return "", errors.Wrap(err, "failed to format as JSON")
   167  	}
   168  	out := bytes.Buffer{}
   169  	err = json.Indent(&out, buff, "", "  ")
   170  	if err != nil {
   171  		return "", errors.Wrap(err, "unable to format JSON example manifest")
   172  	}
   173  	return out.String(), nil
   174  }
   175  
   176  func (r *Resource) findExampleBlock(file *hcl.File, blocks hclsyntax.Blocks, resourceName *string, exactMatch bool) error { //nolint:gocyclo
   177  	dependencies := make(map[string]string)
   178  	for _, b := range blocks {
   179  		depKey := fmt.Sprintf("%s.%s", b.Labels[0], b.Labels[1])
   180  		m, err := convertManifest2JSON(file, b)
   181  		if err != nil {
   182  			return errors.Wrap(err, "failed to convert example manifest to JSON")
   183  		}
   184  		if b.Labels[0] != *resourceName {
   185  			if exactMatch {
   186  				dependencies[depKey] = m
   187  				continue
   188  			}
   189  
   190  			if suffixMatch(b.Labels[0], *resourceName, 1) || (strings.Contains(*resourceName, b.Labels[0]) && strings.Count(*resourceName, "_") == strings.Count(b.Labels[0], "_")) {
   191  				*resourceName = b.Labels[0]
   192  				exactMatch = true
   193  			} else {
   194  				dependencies[depKey] = m
   195  				continue
   196  			}
   197  		}
   198  		r.Name = *resourceName
   199  		err = r.addExampleManifest(file, b)
   200  		r.Examples[len(r.Examples)-1].Manifest = m
   201  		r.Examples[len(r.Examples)-1].Dependencies = dependencies
   202  		if err != nil {
   203  			return errors.Wrap(err, "failed to add example manifest to resource")
   204  		}
   205  	}
   206  
   207  	if len(r.Examples) == 0 && exactMatch {
   208  		return r.findExampleBlock(file, blocks, resourceName, false)
   209  	}
   210  	return nil
   211  }
   212  
   213  func (r *Resource) scrapePrelude(doc *html.Node, preludeXPath string) error {
   214  	// parse prelude
   215  	nodes := htmlquery.Find(doc, preludeXPath)
   216  	if len(nodes) == 0 {
   217  		return errors.Errorf("failed to find the prelude of the document using the xpath expressions: %s", preludeXPath)
   218  	}
   219  
   220  	n := nodes[0]
   221  	lines := strings.Split(n.Data, "\n")
   222  	descIndex := -1
   223  	for i, l := range lines {
   224  		kv := strings.Split(l, ":")
   225  		if len(kv) < 2 {
   226  			continue
   227  		}
   228  		switch kv[0] {
   229  		case keyPageTitle:
   230  			r.Title = strings.TrimSpace(strings.ReplaceAll(kv[len(kv)-1], `"`, ""))
   231  
   232  		case keyDescription:
   233  			r.Description = kv[1]
   234  			descIndex = i
   235  
   236  		case keySubCategory:
   237  			r.SubCategory = strings.TrimSpace(strings.ReplaceAll(kv[1], `"`, ""))
   238  		}
   239  	}
   240  
   241  	if descIndex > -1 {
   242  		r.Description += strings.Join(lines[descIndex+1:], " ")
   243  	}
   244  	r.Description = strings.TrimSpace(strings.Replace(r.Description, "|-", "", 1))
   245  
   246  	return nil
   247  }
   248  
   249  func (r *Resource) scrapeFieldDocs(doc *html.Node, fieldXPath string) {
   250  	processed := make(map[*html.Node]struct{})
   251  	codeNodes := htmlquery.Find(doc, fieldXPath)
   252  	for _, n := range codeNodes {
   253  		attrName := ""
   254  		docStr := r.scrapeDocString(n, &attrName, processed)
   255  		if docStr == "" {
   256  			continue
   257  		}
   258  		if r.ArgumentDocs == nil {
   259  			r.ArgumentDocs = make(map[string]string)
   260  		}
   261  		if r.ArgumentDocs[attrName] != "" && r.ArgumentDocs[attrName] != strings.TrimSpace(docStr) {
   262  			continue
   263  		}
   264  		r.ArgumentDocs[attrName] = strings.TrimSpace(docStr)
   265  	}
   266  }
   267  
   268  // getRootPath extracts the root attribute name for the specified HTML node n,
   269  // from the preceding paragraph or header HTML nodes.
   270  func (r *Resource) getRootPath(n *html.Node) string {
   271  	var ulNode, pNode *html.Node
   272  	for ulNode = n.Parent; ulNode != nil && ulNode.Data != "ul"; ulNode = ulNode.Parent {
   273  	}
   274  	if ulNode == nil {
   275  		return ""
   276  	}
   277  	for pNode = ulNode.PrevSibling; pNode != nil && (pNode.Data != "p" || !regexConfigurationBlock.MatchString(strings.ToLower(extractText(pNode)))); pNode = pNode.PrevSibling {
   278  		// if it's an HTML header node
   279  		if regexHeaderNode.MatchString(pNode.Data) {
   280  			return r.extractRootFromHeader(pNode)
   281  		}
   282  	}
   283  	if pNode == nil {
   284  		return ""
   285  	}
   286  	return r.extractRootFromParagraph(pNode)
   287  }
   288  
   289  // extractRootFromHeader extracts the root Terraform attribute name
   290  // from the children of the specified header HTML node.
   291  func (r *Resource) extractRootFromHeader(pNode *html.Node) string {
   292  	headerText := extractText(pNode)
   293  	if _, ok := r.ArgumentDocs[headerText]; ok {
   294  		return headerText
   295  	}
   296  	sortedKeys := make([]string, 0, len(r.ArgumentDocs))
   297  	for k := range r.ArgumentDocs {
   298  		sortedKeys = append(sortedKeys, k)
   299  	}
   300  	sort.Strings(sortedKeys)
   301  	for _, k := range sortedKeys {
   302  		parts := strings.Split(k, ".")
   303  		if headerText == parts[len(parts)-1] {
   304  			return k
   305  		}
   306  	}
   307  	// try to convert header text to a hierarchical attribute name.
   308  	// For certain headers, the header text is attribute's relative (partial)
   309  	// hierarchical name separated with spaces.
   310  	if _, ok := r.ArgumentDocs[strings.ReplaceAll(headerText, " ", ".")]; ok {
   311  		return strings.ReplaceAll(headerText, " ", ".")
   312  	}
   313  	if regexConfigurationBlock.MatchString(strings.ToLower(extractText(pNode))) {
   314  		for _, s := range strings.Split(headerText, " ") {
   315  			if _, ok := r.ArgumentDocs[s]; ok {
   316  				return s
   317  			}
   318  		}
   319  	}
   320  	return ""
   321  }
   322  
   323  // extractRootFromParagraph extracts the root Terraform attribute name
   324  // from the children of the specified paragraph HTML node.
   325  func (r *Resource) extractRootFromParagraph(pNode *html.Node) string {
   326  	var codeNode *html.Node
   327  	for codeNode = pNode.FirstChild; codeNode != nil && codeNode.Data != "code"; codeNode = codeNode.NextSibling {
   328  		// intentionally left empty
   329  	}
   330  	if codeNode == nil || codeNode.FirstChild == nil {
   331  		return ""
   332  	}
   333  	prevLiNode := getPrevLiWithCodeText(codeNode.FirstChild.Data, pNode)
   334  	if prevLiNode == nil {
   335  		return codeNode.FirstChild.Data
   336  	}
   337  	root := r.getRootPath(prevLiNode)
   338  	if len(root) == 0 {
   339  		return codeNode.FirstChild.Data
   340  	}
   341  	return fmt.Sprintf("%s.%s", root, codeNode.FirstChild.Data)
   342  }
   343  
   344  // getPrevLiWithCodeText returns the list item node (in an UL) with
   345  // a code child with text `codeText`.
   346  func getPrevLiWithCodeText(codeText string, pNode *html.Node) *html.Node {
   347  	var ulNode, liNode *html.Node
   348  	for ulNode = pNode.PrevSibling; ulNode != nil && ulNode.Data != "ul"; ulNode = ulNode.PrevSibling {
   349  	}
   350  	if ulNode == nil {
   351  		return nil
   352  	}
   353  	for liNode = ulNode.FirstChild; liNode != nil; liNode = liNode.NextSibling {
   354  		if liNode.Data != "li" || liNode.FirstChild == nil || liNode.FirstChild.Data != "code" || liNode.FirstChild.FirstChild.Data != codeText {
   355  			continue
   356  		}
   357  		return liNode
   358  	}
   359  	return nil
   360  }
   361  
   362  // extractText extracts text from the children of an element node,
   363  // removing any HTML tags and leaving only text data.
   364  func extractText(n *html.Node) string {
   365  	switch n.Type { //nolint:exhaustive
   366  	case html.TextNode:
   367  		return n.Data
   368  	case html.ElementNode:
   369  		sb := strings.Builder{}
   370  		for c := n.FirstChild; c != nil; c = c.NextSibling {
   371  			s := ""
   372  			if c.Type != html.TextNode {
   373  				s = extractText(c)
   374  			} else {
   375  				s = c.Data
   376  			}
   377  			if len(s) != 0 {
   378  				sb.WriteString(s)
   379  			}
   380  		}
   381  		return sb.String()
   382  	default:
   383  		return ""
   384  	}
   385  }
   386  
   387  func (r *Resource) scrapeDocString(n *html.Node, attrName *string, processed map[*html.Node]struct{}) string {
   388  	if _, ok := processed[n]; ok {
   389  		return ""
   390  	}
   391  	processed[n] = struct{}{}
   392  
   393  	if n.Type == html.ElementNode {
   394  		return r.scrapeDocString(n.FirstChild, attrName, processed)
   395  	}
   396  
   397  	sb := strings.Builder{}
   398  	if *attrName == "" {
   399  		*attrName = n.Data
   400  		if root := r.getRootPath(n); len(root) != 0 {
   401  			*attrName = fmt.Sprintf("%s.%s", root, *attrName)
   402  		}
   403  	} else {
   404  		sb.WriteString(n.Data)
   405  	}
   406  	s := n.Parent
   407  	for s = s.NextSibling; s != nil; s = s.NextSibling {
   408  		if _, ok := processed[s]; ok {
   409  			continue
   410  		}
   411  		processed[s] = struct{}{}
   412  
   413  		switch s.Type { //nolint:exhaustive
   414  		case html.TextNode:
   415  			sb.WriteString(s.Data)
   416  		case html.ElementNode:
   417  			if s.FirstChild == nil {
   418  				continue
   419  			}
   420  			sb.WriteString(r.scrapeDocString(s.FirstChild, attrName, processed))
   421  		}
   422  	}
   423  	return sb.String()
   424  }
   425  
   426  func (r *Resource) scrapeImportStatements(doc *html.Node, importXPath string) {
   427  	nodes := htmlquery.Find(doc, importXPath)
   428  	for _, n := range nodes {
   429  		r.ImportStatements = append(r.ImportStatements, strings.TrimSpace(n.Data))
   430  	}
   431  }
   432  
   433  // scrape scrapes resource metadata from the specified HTML doc.
   434  // filename is not always the precise resource name, hence,
   435  // it returns the resource name scraped from the doc.
   436  func (r *Resource) scrape(path string, config *ScrapeConfiguration) error {
   437  	source, err := os.ReadFile(filepath.Clean(path))
   438  	if err != nil {
   439  		return errors.Wrap(err, "failed to read markdown file")
   440  	}
   441  
   442  	var buff bytes.Buffer
   443  	if err := goldmark.Convert(source, &buff); err != nil {
   444  		return errors.Wrap(err, "failed to convert markdown")
   445  	}
   446  
   447  	doc, err := htmlquery.Parse(&buff)
   448  	if err != nil {
   449  		return errors.Wrap(err, "failed to parse HTML")
   450  	}
   451  
   452  	if err := r.scrapePrelude(doc, config.PreludeXPath); err != nil {
   453  		return err
   454  	}
   455  
   456  	r.scrapeFieldDocs(doc, config.FieldDocXPath)
   457  	r.scrapeImportStatements(doc, config.ImportXPath)
   458  
   459  	return r.scrapeExamples(doc, config.CodeXPath, path, config.ResourcePrefix, config.Debug)
   460  }
   461  
// ScrapeConfiguration is a configurator for the scraper. It tells the
// scraper where the documentation files are, which of them to process
// and which XPath expressions select the parts of a rendered document.
type ScrapeConfiguration struct {
	// Debug enables debug messages: when set, errors met while parsing
	// example configurations are printed.
	Debug bool
	// RepoPath is the path of the Terraform native provider repo.
	// It is the root of the directory tree walked by ScrapeRepo.
	RepoPath string
	// CodeXPath is the XPath expression selecting the nodes whose data
	// is parsed as example HCL configurations.
	CodeXPath string
	// PreludeXPath is the XPath expression selecting the document
	// prelude; only the first selected node is used.
	PreludeXPath string
	// FieldDocXPath is the XPath expression selecting the nodes from
	// which field documentation is scraped.
	FieldDocXPath string
	// ImportXPath is the XPath expression selecting the nodes that hold
	// import statements.
	ImportXPath string
	// FileExtensions lists the extensions of the files to be scraped.
	// They are compared with the result of filepath.Ext, which includes
	// the leading dot.
	FileExtensions []string
	// ResourcePrefix is the Terraform resource name prefix for the
	// Terraform provider. It is prepended, followed by an underscore,
	// to resource names derived from file names.
	ResourcePrefix string
}
   481  
   482  func (sc *ScrapeConfiguration) hasExpectedExtension(fileName string) bool {
   483  	for _, e := range sc.FileExtensions {
   484  		if e == filepath.Ext(fileName) {
   485  			return true
   486  		}
   487  	}
   488  	return false
   489  }
   490  
   491  // ScrapeRepo scrape metadata from the configured Terraform native provider repo
   492  func (pm *ProviderMetadata) ScrapeRepo(config *ScrapeConfiguration) error {
   493  	return errors.Wrap(filepath.WalkDir(config.RepoPath, func(path string, d fs.DirEntry, err error) error {
   494  		if err != nil {
   495  			return errors.Wrap(err, "failed to traverse Terraform registry")
   496  		}
   497  		if d.IsDir() || !config.hasExpectedExtension(d.Name()) {
   498  			return nil
   499  		}
   500  		r := &Resource{}
   501  		// don't scrape if file is empty
   502  		filename := filepath.Clean(path)
   503  		b, err := os.ReadFile(filename)
   504  		if err != nil {
   505  			return errors.Wrap(err, "failed to read markdown file")
   506  		}
   507  		if len(b) == 1 {
   508  			fmt.Printf("skipping empty file: %s\n", filename)
   509  			return nil
   510  		}
   511  		if err := r.scrape(path, config); err != nil {
   512  			return errors.Wrapf(err, "failed to scrape resource metadata from path: %s", path)
   513  		}
   514  
   515  		pm.Resources[r.Name] = r
   516  		return nil
   517  	}), "cannot scrape Terraform registry")
   518  }
   519  
   520  // Store stores this scraped ProviderMetadata at the specified path
   521  func (pm *ProviderMetadata) Store(path string) error {
   522  	out, err := yaml.Marshal(pm)
   523  	if err != nil {
   524  		return errors.Wrap(err, "failed to marshal provider metadata to YAML")
   525  	}
   526  	return errors.Wrapf(os.WriteFile(path, out, 0600), "failed to write provider metada file: %s", path)
   527  }