github.com/anchore/syft@v1.4.2-0.20240516191711-1bec1fc5d397/syft/pkg/cataloger/internal/cpegenerate/generate.go (about)

     1  package cpegenerate
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	_ "embed"
     7  	"encoding/json"
     8  	"fmt"
     9  	"sort"
    10  	"strings"
    11  	"sync"
    12  
    13  	"github.com/scylladb/go-set/strset"
    14  
    15  	"github.com/anchore/syft/internal/log"
    16  	"github.com/anchore/syft/syft/cpe"
    17  	"github.com/anchore/syft/syft/pkg"
    18  	"github.com/anchore/syft/syft/pkg/cataloger/internal/cpegenerate/dictionary"
    19  )
    20  
// knownVendors contains vendor strings that are known to exist in
// the CPE database, so they will be preferred over other candidates:
// when candidateVendors finds one of these values among its candidates
// it returns that single vendor instead of the full candidate list.
var knownVendors = strset.New("apache")
    24  
    25  func newCPE(product, vendor, version, targetSW string) *cpe.Attributes {
    26  	c := cpe.NewWithAny()
    27  	c.Part = "a"
    28  	c.Product = product
    29  	c.Vendor = vendor
    30  	c.Version = version
    31  	c.TargetSW = targetSW
    32  	if cpe.ValidateString(c.String()) != nil {
    33  		return nil
    34  	}
    35  	return &c
    36  }
    37  
    38  //go:embed dictionary/data/cpe-index.json
    39  var indexedCPEDictionaryData []byte
    40  
    41  var indexedCPEDictionary *dictionary.Indexed
    42  var indexedCPEDictionaryOnce sync.Once
    43  
    44  func GetIndexedDictionary() (_ *dictionary.Indexed, err error) {
    45  	indexedCPEDictionaryOnce.Do(func() {
    46  		err = json.Unmarshal(indexedCPEDictionaryData, &indexedCPEDictionary)
    47  	})
    48  
    49  	if err != nil {
    50  		return
    51  	}
    52  
    53  	if indexedCPEDictionary == nil {
    54  		err = fmt.Errorf("failed to unmarshal indexed CPE dictionary")
    55  		return
    56  	}
    57  
    58  	return indexedCPEDictionary, err
    59  }
    60  
    61  func FromDictionaryFind(p pkg.Package) ([]cpe.CPE, bool) {
    62  	dict, err := GetIndexedDictionary()
    63  	parsedCPEs := []cpe.CPE{}
    64  	if err != nil {
    65  		log.Debugf("CPE dictionary lookup not available: %+v", err)
    66  		return parsedCPEs, false
    67  	}
    68  
    69  	var (
    70  		cpes *dictionary.Set
    71  		ok   bool
    72  	)
    73  
    74  	switch p.Type {
    75  	case pkg.NpmPkg:
    76  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemNPM][p.Name]
    77  
    78  	case pkg.GemPkg:
    79  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemRubyGems][p.Name]
    80  
    81  	case pkg.PythonPkg:
    82  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemPyPI][p.Name]
    83  
    84  	case pkg.JenkinsPluginPkg:
    85  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemJenkinsPlugins][p.Name]
    86  
    87  	case pkg.RustPkg:
    88  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemRustCrates][p.Name]
    89  
    90  	case pkg.PhpComposerPkg:
    91  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemPHPComposer][p.Name]
    92  
    93  	case pkg.PhpPeclPkg:
    94  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemPHPPecl][p.Name]
    95  
    96  	case pkg.GoModulePkg:
    97  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemGoModules][p.Name]
    98  
    99  	default:
   100  		// The dictionary doesn't support this package type yet.
   101  		return parsedCPEs, false
   102  	}
   103  
   104  	if !ok {
   105  		// The dictionary doesn't have a CPE for this package.
   106  		return parsedCPEs, false
   107  	}
   108  
   109  	for _, c := range cpes.List() {
   110  		parsedCPE, err := cpe.New(c, cpe.NVDDictionaryLookupSource)
   111  		if err != nil {
   112  			continue
   113  		}
   114  
   115  		parsedCPE.Attributes.Version = p.Version
   116  		parsedCPEs = append(parsedCPEs, parsedCPE)
   117  	}
   118  
   119  	if len(parsedCPEs) == 0 {
   120  		return []cpe.CPE{}, false
   121  	}
   122  
   123  	return parsedCPEs, true
   124  }
   125  
   126  // FromPackageAttributes Create a list of CPEs for a given package, trying to guess the vendor, product tuple. We should be trying to
   127  // generate the minimal set of representative CPEs, which implies that optional fields should not be included
   128  // (such as target SW).
   129  func FromPackageAttributes(p pkg.Package) []cpe.CPE {
   130  	vendors := candidateVendors(p)
   131  	products := candidateProducts(p)
   132  	if len(products) == 0 {
   133  		return nil
   134  	}
   135  
   136  	keys := strset.New()
   137  	cpes := make([]cpe.Attributes, 0)
   138  	for _, product := range products {
   139  		for _, vendor := range vendors {
   140  			// prevent duplicate entries...
   141  			key := fmt.Sprintf("%s|%s|%s", product, vendor, p.Version)
   142  			if keys.Has(key) {
   143  				continue
   144  			}
   145  			keys.Add(key)
   146  			// add a new entry...
   147  			if c := newCPE(product, vendor, p.Version, cpe.Any); c != nil {
   148  				cpes = append(cpes, *c)
   149  			}
   150  		}
   151  	}
   152  
   153  	// filter out any known combinations that don't accurately represent this package
   154  	cpes = filter(cpes, p, cpeFilters...)
   155  
   156  	sort.Sort(cpe.BySpecificity(cpes))
   157  	var result []cpe.CPE
   158  	for _, c := range cpes {
   159  		result = append(result, cpe.CPE{Attributes: c, Source: cpe.GeneratedSource})
   160  	}
   161  
   162  	return result
   163  }
   164  
   165  //nolint:funlen
   166  func candidateVendors(p pkg.Package) []string {
   167  	// in ecosystems where the packaging metadata does not have a clear field to indicate a vendor (or a field that
   168  	// could be interpreted indirectly as such) the project name tends to be a common stand in. Examples of this
   169  	// are the elasticsearch gem, xstream jar, and rack gem... all of these cases you can find vulnerabilities
   170  	// with CPEs where the vendor is the product name and doesn't appear to be derived from any available package
   171  	// metadata.
   172  	vendors := newFieldCandidateSet(candidateProducts(p)...)
   173  
   174  	switch p.Language {
   175  	case pkg.JavaScript:
   176  		// for JavaScript if we find node.js as a package then the vendor is "nodejs"
   177  		if p.Name == "node.js" {
   178  			vendors.addValue("nodejs")
   179  		}
   180  	case pkg.Ruby:
   181  		vendors.addValue("ruby-lang")
   182  	case pkg.Go:
   183  		// replace all candidates with only the golang-specific helper
   184  		vendors.clear()
   185  
   186  		vendor := candidateVendorForGo(p.Name)
   187  		if vendor != "" {
   188  			vendors.addValue(vendor)
   189  		}
   190  	}
   191  
   192  	switch p.Metadata.(type) {
   193  	case pkg.RpmDBEntry:
   194  		vendors.union(candidateVendorsForRPM(p))
   195  	case pkg.RubyGemspec:
   196  		vendors.union(candidateVendorsForRuby(p))
   197  	case pkg.PythonPackage:
   198  		vendors.union(candidateVendorsForPython(p))
   199  	case pkg.JavaArchive:
   200  		vendors.union(candidateVendorsForJava(p))
   201  	case pkg.ApkDBEntry:
   202  		vendors.union(candidateVendorsForAPK(p))
   203  	case pkg.NpmPackage:
   204  		vendors.union(candidateVendorsForJavascript(p))
   205  	case pkg.WordpressPluginEntry:
   206  		vendors.clear()
   207  		vendors.union(candidateVendorsForWordpressPlugin(p))
   208  	}
   209  
   210  	// We should no longer be generating vendor candidates with these values ["" and "*"]
   211  	// (since CPEs will match any other value)
   212  	vendors.removeByValue("")
   213  	vendors.removeByValue("*")
   214  
   215  	// try swapping hyphens for underscores, vice versa, and removing separators altogether
   216  	addDelimiterVariations(vendors)
   217  
   218  	// generate sub-selections of each candidate based on separators (e.g. jenkins-ci -> [jenkins, jenkins-ci])
   219  	addAllSubSelections(vendors)
   220  
   221  	// add more candidates based on the package info for each vendor candidate
   222  	for _, vendor := range vendors.uniqueValues() {
   223  		vendors.addValue(findAdditionalVendors(defaultCandidateAdditions, p.Type, p.Name, vendor)...)
   224  	}
   225  
   226  	// remove known mis
   227  	vendors.removeByValue(findVendorsToRemove(defaultCandidateRemovals, p.Type, p.Name)...)
   228  
   229  	uniqueVendors := vendors.uniqueValues()
   230  
   231  	// if any known vendor was detected, pick that one.
   232  	for _, vendor := range uniqueVendors {
   233  		if knownVendors.Has(vendor) {
   234  			return []string{vendor}
   235  		}
   236  	}
   237  
   238  	return uniqueVendors
   239  }
   240  
   241  func candidateProducts(p pkg.Package) []string {
   242  	products := newFieldCandidateSet(p.Name)
   243  
   244  	_, hasJavaMetadata := p.Metadata.(pkg.JavaArchive)
   245  
   246  	switch {
   247  	case p.Language == pkg.Python:
   248  		if !strings.HasPrefix(p.Name, "python") {
   249  			products.addValue("python-" + p.Name)
   250  		}
   251  	case p.Language == pkg.Java || hasJavaMetadata:
   252  		products.addValue(candidateProductsForJava(p)...)
   253  	case p.Language == pkg.Go:
   254  		// replace all candidates with only the golang-specific helper
   255  		products.clear()
   256  
   257  		prod := candidateProductForGo(p.Name)
   258  		if prod != "" {
   259  			products.addValue(prod)
   260  		}
   261  	}
   262  
   263  	if _, hasAPKMetadata := p.Metadata.(pkg.ApkDBEntry); hasAPKMetadata {
   264  		products.union(candidateProductsForAPK(p))
   265  	}
   266  
   267  	if _, hasWordpressMetadata := p.Metadata.(pkg.WordpressPluginEntry); hasWordpressMetadata {
   268  		products.clear()
   269  		products.union(candidateProductsForWordpressPlugin(p))
   270  	}
   271  
   272  	// it is never OK to have candidates with these values ["" and "*"] (since CPEs will match any other value)
   273  	products.removeByValue("")
   274  	products.removeByValue("*")
   275  
   276  	// try swapping hyphens for underscores, vice versa, and removing separators altogether
   277  	addDelimiterVariations(products)
   278  
   279  	// add known candidate additions
   280  	products.addValue(findAdditionalProducts(defaultCandidateAdditions, p.Type, p.Name)...)
   281  
   282  	// remove known candidate removals
   283  	products.removeByValue(findProductsToRemove(defaultCandidateRemovals, p.Type, p.Name)...)
   284  
   285  	return products.uniqueValues()
   286  }
   287  
   288  func addAllSubSelections(fields fieldCandidateSet) {
   289  	candidatesForVariations := fields.copy()
   290  	candidatesForVariations.removeWhere(subSelectionsDisallowed)
   291  
   292  	for _, candidate := range candidatesForVariations.values() {
   293  		fields.addValue(generateSubSelections(candidate)...)
   294  	}
   295  }
   296  
   297  // generateSubSelections attempts to split a field by hyphens and underscores and return a list of sensible sub-selections
   298  // that can be used as product or vendor candidates. E.g. jenkins-ci-tools -> [jenkins-ci-tools, jenkins-ci, jenkins].
   299  func generateSubSelections(field string) (results []string) {
   300  	scanner := bufio.NewScanner(strings.NewReader(field))
   301  	scanner.Split(scanByHyphenOrUnderscore)
   302  	var lastToken uint8
   303  	for scanner.Scan() {
   304  		rawCandidate := scanner.Text()
   305  		if len(rawCandidate) == 0 {
   306  			break
   307  		}
   308  
   309  		// trim any number of hyphen or underscore that is prefixed/suffixed on the given candidate. Since
   310  		// scanByHyphenOrUnderscore preserves delimiters (hyphens and underscores) they are guaranteed to be at least
   311  		// prefixed.
   312  		candidate := strings.TrimFunc(rawCandidate, trimHyphenOrUnderscore)
   313  
   314  		// capture the result (if there is content)
   315  		if len(candidate) > 0 {
   316  			if len(results) > 0 {
   317  				results = append(results, results[len(results)-1]+string(lastToken)+candidate)
   318  			} else {
   319  				results = append(results, candidate)
   320  			}
   321  		}
   322  
   323  		// keep track of the trailing separator for the next loop
   324  		lastToken = rawCandidate[len(rawCandidate)-1]
   325  	}
   326  	return results
   327  }
   328  
   329  // trimHyphenOrUnderscore is a character filter function for use with strings.TrimFunc in order to remove any hyphen or underscores.
   330  func trimHyphenOrUnderscore(r rune) bool {
   331  	switch r {
   332  	case '-', '_':
   333  		return true
   334  	}
   335  	return false
   336  }
   337  
   338  // scanByHyphenOrUnderscore splits on hyphen or underscore and includes the separator in the split
   339  func scanByHyphenOrUnderscore(data []byte, atEOF bool) (advance int, token []byte, err error) {
   340  	if atEOF && len(data) == 0 {
   341  		return 0, nil, nil
   342  	}
   343  	if i := bytes.IndexAny(data, "-_"); i >= 0 {
   344  		return i + 1, data[0 : i+1], nil
   345  	}
   346  
   347  	if atEOF {
   348  		return len(data), data, nil
   349  	}
   350  
   351  	return 0, nil, nil
   352  }
   353  
   354  func addDelimiterVariations(fields fieldCandidateSet) {
   355  	candidatesForVariations := fields.copy()
   356  	candidatesForVariations.removeWhere(delimiterVariationsDisallowed)
   357  
   358  	for _, candidate := range candidatesForVariations.list() {
   359  		field := candidate.value
   360  		hasHyphen := strings.Contains(field, "-")
   361  		hasUnderscore := strings.Contains(field, "_")
   362  
   363  		if hasHyphen {
   364  			// provide variations of hyphen candidates with an underscore
   365  			newValue := strings.ReplaceAll(field, "-", "_")
   366  			underscoreCandidate := candidate
   367  			underscoreCandidate.value = newValue
   368  			fields.add(underscoreCandidate)
   369  		}
   370  
   371  		if hasUnderscore {
   372  			// provide variations of underscore candidates with a hyphen
   373  			newValue := strings.ReplaceAll(field, "_", "-")
   374  			hyphenCandidate := candidate
   375  			hyphenCandidate.value = newValue
   376  			fields.add(hyphenCandidate)
   377  		}
   378  	}
   379  }