github.com/anchore/syft@v1.38.2/syft/pkg/cataloger/internal/cpegenerate/generate.go (about)

     1  package cpegenerate
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	_ "embed"
     7  	"encoding/json"
     8  	"fmt"
     9  	"regexp"
    10  	"sort"
    11  	"strings"
    12  	"sync"
    13  	"unicode"
    14  
    15  	"github.com/scylladb/go-set/strset"
    16  
    17  	"github.com/anchore/syft/internal/log"
    18  	"github.com/anchore/syft/syft/cpe"
    19  	"github.com/anchore/syft/syft/pkg"
    20  	"github.com/anchore/syft/syft/pkg/cataloger/internal/cpegenerate/dictionary"
    21  )
    22  
// knownVendors contains vendor strings that are known to exist in the CPE
// database. When candidateVendors produces one of these values, that value is
// returned as the only vendor candidate instead of the full candidate list.
var knownVendors = strset.New("apache")
    26  
    27  func newCPE(product, vendor, version, targetSW string) *cpe.Attributes {
    28  	c := cpe.NewWithAny()
    29  	c.Part = "a"
    30  	c.Product = product
    31  	c.Vendor = vendor
    32  	c.Version = version
    33  	c.TargetSW = targetSW
    34  	if cpe.ValidateString(c.String()) != nil {
    35  		return nil
    36  	}
    37  	return &c
    38  }
    39  
    40  //go:embed dictionary/data/cpe-index.json
    41  var indexedCPEDictionaryData []byte
    42  
    43  var indexedCPEDictionary *dictionary.Indexed
    44  var indexedCPEDictionaryOnce sync.Once
    45  
    46  func GetIndexedDictionary() (_ *dictionary.Indexed, err error) {
    47  	indexedCPEDictionaryOnce.Do(func() {
    48  		err = json.Unmarshal(indexedCPEDictionaryData, &indexedCPEDictionary)
    49  	})
    50  
    51  	if err != nil {
    52  		return
    53  	}
    54  
    55  	if indexedCPEDictionary == nil {
    56  		err = fmt.Errorf("failed to unmarshal indexed CPE dictionary")
    57  		return
    58  	}
    59  
    60  	return indexedCPEDictionary, err
    61  }
    62  
    63  func FromDictionaryFind(p pkg.Package) ([]cpe.CPE, bool) {
    64  	dict, err := GetIndexedDictionary()
    65  	if err != nil {
    66  		log.Debugf("CPE dictionary lookup not available: %+v", err)
    67  		return []cpe.CPE{}, false
    68  	}
    69  
    70  	var cpes *dictionary.Set
    71  	var ok bool
    72  
    73  	switch p.Type {
    74  	case pkg.NpmPkg:
    75  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemNPM][p.Name]
    76  
    77  	case pkg.GemPkg:
    78  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemRubyGems][p.Name]
    79  
    80  	case pkg.PythonPkg:
    81  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemPyPI][p.Name]
    82  
    83  	case pkg.JenkinsPluginPkg:
    84  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemJenkinsPlugins][p.Name]
    85  
    86  	case pkg.RustPkg:
    87  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemRustCrates][p.Name]
    88  
    89  	case pkg.PhpComposerPkg:
    90  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemPHPComposer][p.Name]
    91  
    92  	case pkg.PhpPeclPkg:
    93  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemPHPPecl][p.Name]
    94  
    95  	case pkg.GoModulePkg:
    96  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemGoModules][p.Name]
    97  
    98  	case pkg.WordpressPluginPkg:
    99  		metadata, valid := p.Metadata.(pkg.WordpressPluginEntry)
   100  		if !valid {
   101  			return nil, false
   102  		}
   103  		cpes, ok = dict.EcosystemPackages[dictionary.EcosystemWordpressPlugins][metadata.PluginInstallDirectory]
   104  
   105  	case pkg.ModelPkg:
   106  		// ML models should not have CPEs as they are not traditional software packages
   107  		// and don't fit the vulnerability model used for software packages.
   108  		return nil, false
   109  	default:
   110  		// The dictionary doesn't support this package type yet.
   111  		return nil, false
   112  	}
   113  
   114  	if !ok {
   115  		// The dictionary doesn't have a CPE for this package.
   116  		return []cpe.CPE{}, false
   117  	}
   118  
   119  	parsedCPEs := []cpe.CPE{}
   120  	for _, c := range cpes.List() {
   121  		parsedCPE, err := cpe.New(c, cpe.NVDDictionaryLookupSource)
   122  		if err != nil {
   123  			continue
   124  		}
   125  
   126  		parsedCPE.Attributes.Version = p.Version
   127  		parsedCPEs = append(parsedCPEs, parsedCPE)
   128  	}
   129  
   130  	if len(parsedCPEs) == 0 {
   131  		return nil, false
   132  	}
   133  
   134  	sort.Sort(cpe.BySourceThenSpecificity(parsedCPEs))
   135  	return parsedCPEs, true
   136  }
   137  
   138  // FromPackageAttributes Create a list of CPEs for a given package, trying to guess the vendor, product tuple. We should be trying to
   139  // generate the minimal set of representative CPEs, which implies that optional fields should not be included
   140  // (such as target SW).
   141  func FromPackageAttributes(p pkg.Package) []cpe.CPE {
   142  	// ML models should not have CPEs as they are not traditional software packages
   143  	// and don't fit the vulnerability model used for software packages.
   144  	if p.Type == pkg.ModelPkg {
   145  		return nil
   146  	}
   147  
   148  	vendors := candidateVendors(p)
   149  	products := candidateProducts(p)
   150  	targetSWs := candidateTargetSw(p)
   151  	if len(products) == 0 {
   152  		return nil
   153  	}
   154  
   155  	keys := strset.New()
   156  	cpes := make([]cpe.Attributes, 0)
   157  	for _, ts := range targetSWs {
   158  		for _, product := range products {
   159  			for _, vendor := range vendors {
   160  				// prevent duplicate entries...
   161  				key := fmt.Sprintf("%s|%s|%s|%s", product, vendor, p.Version, ts)
   162  				if keys.Has(key) {
   163  					continue
   164  				}
   165  				keys.Add(key)
   166  				// add a new entry...
   167  				if c := newCPE(product, vendor, p.Version, ts); c != nil {
   168  					cpes = append(cpes, *c)
   169  				}
   170  			}
   171  		}
   172  	}
   173  
   174  	// filter out any known combinations that don't accurately represent this package
   175  	cpes = filter(cpes, p, cpeFilters...)
   176  
   177  	var result []cpe.CPE
   178  	for _, c := range cpes {
   179  		result = append(result, cpe.CPE{Attributes: c, Source: cpe.GeneratedSource})
   180  	}
   181  
   182  	sort.Sort(cpe.BySourceThenSpecificity(result))
   183  	return result
   184  }
   185  
   186  func candidateTargetSw(p pkg.Package) []string {
   187  	if p.Type == pkg.WordpressPluginPkg {
   188  		return []string{"wordpress"}
   189  	}
   190  	return []string{cpe.Any}
   191  }
   192  
   193  func candidateVendors(p pkg.Package) []string {
   194  	// in ecosystems where the packaging metadata does not have a clear field to indicate a vendor (or a field that
   195  	// could be interpreted indirectly as such) the project name tends to be a common stand in. Examples of this
   196  	// are the elasticsearch gem, xstream jar, and rack gem... all of these cases you can find vulnerabilities
   197  	// with CPEs where the vendor is the product name and doesn't appear to be derived from any available package
   198  	// metadata.
   199  	vendors := newFieldCandidateSet()
   200  	vendors.union(candidateProductSet(p))
   201  
   202  	switch p.Language {
   203  	case pkg.JavaScript:
   204  		// for JavaScript if we find node.js as a package then the vendor is "nodejs"
   205  		if p.Name == "node.js" {
   206  			vendors.addValue("nodejs")
   207  		}
   208  	case pkg.Ruby:
   209  		vendors.addValue("ruby-lang")
   210  	case pkg.Go:
   211  		// replace all candidates with only the golang-specific helper
   212  		vendors.clear()
   213  
   214  		vendor := candidateVendorForGo(p.Name)
   215  		if vendor != "" {
   216  			vendors.addValue(vendor)
   217  		}
   218  	}
   219  
   220  	switch p.Metadata.(type) {
   221  	case pkg.DotnetDepsEntry, pkg.DotnetPackagesLockEntry, pkg.DotnetPortableExecutableEntry:
   222  		vendors.clear()
   223  		vendors.union(candidateVendorsForDotnet(p))
   224  	case pkg.RpmDBEntry, pkg.RpmArchive:
   225  		vendors.union(candidateVendorsForRPM(p))
   226  	case pkg.RubyGemspec:
   227  		vendors.union(candidateVendorsForRuby(p))
   228  	case pkg.PythonPackage:
   229  		vendors.union(candidateVendorsForPython(p))
   230  	case pkg.JavaArchive:
   231  		vendors.union(candidateVendorsForJava(p))
   232  	case pkg.ApkDBEntry:
   233  		vendors.union(candidateVendorsForAPK(p))
   234  	case pkg.NpmPackage:
   235  		vendors.union(candidateVendorsForJavascript(p))
   236  	case pkg.PEBinary:
   237  		// Add PE-specific vendor hints (e.g. ghostscript -> artifex)
   238  		vendors.union(candidateVendorsForPE(p))
   239  	case pkg.WordpressPluginEntry:
   240  		vendors.clear()
   241  		vendors.union(candidateVendorsForWordpressPlugin(p))
   242  	}
   243  
   244  	if p.Type == pkg.BinaryPkg && endsWithNumber(p.Name) {
   245  		// add binary package digit-suffix variations (e.g. Qt5 -> Qt)
   246  		addBinaryPackageDigitVariations(vendors)
   247  	}
   248  
   249  	// We should no longer be generating vendor candidates with these values ["" and "*"]
   250  	// (since CPEs will match any other value)
   251  	vendors.removeByValue("")
   252  	vendors.removeByValue("*")
   253  
   254  	// try swapping hyphens for underscores, vice versa, and removing separators altogether
   255  	addDelimiterVariations(vendors)
   256  
   257  	// generate sub-selections of each candidate based on separators (e.g. jenkins-ci -> [jenkins, jenkins-ci])
   258  	addAllSubSelections(vendors)
   259  
   260  	// add more candidates based on the package info for each vendor candidate
   261  	for _, vendor := range vendors.uniqueValues() {
   262  		vendors.addValue(findAdditionalVendors(defaultCandidateAdditions, p.Type, p.Name, vendor)...)
   263  	}
   264  
   265  	// remove known mis
   266  	vendors.removeByValue(findVendorsToRemove(defaultCandidateRemovals, p.Type, p.Name)...)
   267  
   268  	uniqueVendors := vendors.uniqueValues()
   269  
   270  	// if any known vendor was detected, pick that one.
   271  	for _, vendor := range uniqueVendors {
   272  		if knownVendors.Has(vendor) {
   273  			return []string{vendor}
   274  		}
   275  	}
   276  
   277  	return uniqueVendors
   278  }
   279  
   280  func candidateProducts(p pkg.Package) []string {
   281  	return candidateProductSet(p).uniqueValues()
   282  }
   283  
   284  func candidateProductSet(p pkg.Package) fieldCandidateSet {
   285  	products := newFieldCandidateSet(p.Name)
   286  
   287  	_, hasJavaMetadata := p.Metadata.(pkg.JavaArchive)
   288  
   289  	switch {
   290  	case p.Language == pkg.Dotnet || p.Type == pkg.DotnetPkg:
   291  		products.clear()
   292  		products.union(candidateProductsForDotnet(p))
   293  	case p.Language == pkg.Python || p.Type == pkg.PythonPkg:
   294  		if !strings.HasPrefix(p.Name, "python") {
   295  			products.addValue("python-" + p.Name)
   296  		}
   297  	case p.Language == pkg.Java || hasJavaMetadata || p.Type == pkg.JavaPkg:
   298  		products.addValue(candidateProductsForJava(p)...)
   299  	case p.Language == pkg.Go || p.Type == pkg.GoModulePkg:
   300  		// replace all candidates with only the golang-specific helper
   301  		products.clear()
   302  
   303  		prod := candidateProductForGo(p.Name)
   304  		if prod != "" {
   305  			products.addValue(prod)
   306  		}
   307  	case p.Type == pkg.BinaryPkg && endsWithNumber(p.Name):
   308  		// add binary package digit-suffix variations (e.g. Qt5 -> Qt)
   309  		addBinaryPackageDigitVariations(products)
   310  	}
   311  
   312  	switch p.Metadata.(type) {
   313  	case pkg.ApkDBEntry:
   314  		products.union(candidateProductsForAPK(p))
   315  	case pkg.PEBinary:
   316  		// Add PE-specific product hints (e.g. ghostscript)
   317  		products.union(candidateProductsForPE(p))
   318  	case pkg.WordpressPluginEntry:
   319  		products.clear()
   320  		products.union(candidateProductsForWordpressPlugin(p))
   321  	}
   322  
   323  	// it is never OK to have candidates with these values ["" and "*"] (since CPEs will match any other value)
   324  	products.removeByValue("")
   325  	products.removeByValue("*")
   326  
   327  	// try swapping hyphens for underscores, vice versa, and removing separators altogether
   328  	addDelimiterVariations(products)
   329  
   330  	// add known candidate additions
   331  	products.addValue(findAdditionalProducts(defaultCandidateAdditions, p.Type, p.Name)...)
   332  
   333  	// remove known candidate removals
   334  	products.removeByValue(findProductsToRemove(defaultCandidateRemovals, p.Type, p.Name)...)
   335  
   336  	return products
   337  }
   338  
   339  func addAllSubSelections(fields fieldCandidateSet) {
   340  	candidatesForVariations := fields.copy()
   341  	candidatesForVariations.removeWhere(subSelectionsDisallowed)
   342  
   343  	for _, candidate := range candidatesForVariations.values() {
   344  		fields.addValue(generateSubSelections(candidate)...)
   345  	}
   346  }
   347  
   348  // generateSubSelections attempts to split a field by hyphens and underscores and return a list of sensible sub-selections
   349  // that can be used as product or vendor candidates. E.g. jenkins-ci-tools -> [jenkins-ci-tools, jenkins-ci, jenkins].
   350  func generateSubSelections(field string) (results []string) {
   351  	scanner := bufio.NewScanner(strings.NewReader(field))
   352  	scanner.Split(scanByHyphenOrUnderscore)
   353  	var lastToken uint8
   354  	for scanner.Scan() {
   355  		rawCandidate := scanner.Text()
   356  		if len(rawCandidate) == 0 {
   357  			break
   358  		}
   359  
   360  		// trim any number of hyphen or underscore that is prefixed/suffixed on the given candidate. Since
   361  		// scanByHyphenOrUnderscore preserves delimiters (hyphens and underscores) they are guaranteed to be at least
   362  		// prefixed.
   363  		candidate := strings.TrimFunc(rawCandidate, trimHyphenOrUnderscore)
   364  
   365  		// capture the result (if there is content)
   366  		if len(candidate) > 0 {
   367  			if len(results) > 0 {
   368  				results = append(results, results[len(results)-1]+string(lastToken)+candidate)
   369  			} else {
   370  				results = append(results, candidate)
   371  			}
   372  		}
   373  
   374  		// keep track of the trailing separator for the next loop
   375  		lastToken = rawCandidate[len(rawCandidate)-1]
   376  	}
   377  	return results
   378  }
   379  
   380  // trimHyphenOrUnderscore is a character filter function for use with strings.TrimFunc in order to remove any hyphen or underscores.
   381  func trimHyphenOrUnderscore(r rune) bool {
   382  	switch r {
   383  	case '-', '_':
   384  		return true
   385  	}
   386  	return false
   387  }
   388  
   389  // scanByHyphenOrUnderscore splits on hyphen or underscore and includes the separator in the split
   390  func scanByHyphenOrUnderscore(data []byte, atEOF bool) (advance int, token []byte, err error) {
   391  	if atEOF && len(data) == 0 {
   392  		return 0, nil, nil
   393  	}
   394  	if i := bytes.IndexAny(data, "-_"); i >= 0 {
   395  		return i + 1, data[0 : i+1], nil
   396  	}
   397  
   398  	if atEOF {
   399  		return len(data), data, nil
   400  	}
   401  
   402  	return 0, nil, nil
   403  }
   404  
   405  func addDelimiterVariations(fields fieldCandidateSet) {
   406  	candidatesForVariations := fields.copy()
   407  	candidatesForVariations.removeWhere(delimiterVariationsDisallowed)
   408  
   409  	for _, candidate := range candidatesForVariations.list() {
   410  		field := candidate.value
   411  		hasHyphen := strings.Contains(field, "-")
   412  		hasUnderscore := strings.Contains(field, "_")
   413  
   414  		if hasHyphen {
   415  			// provide variations of hyphen candidates with an underscore
   416  			newValue := strings.ReplaceAll(field, "-", "_")
   417  			underscoreCandidate := candidate
   418  			underscoreCandidate.value = newValue
   419  			fields.add(underscoreCandidate)
   420  		}
   421  
   422  		if hasUnderscore {
   423  			// provide variations of underscore candidates with a hyphen
   424  			newValue := strings.ReplaceAll(field, "_", "-")
   425  			hyphenCandidate := candidate
   426  			hyphenCandidate.value = newValue
   427  			fields.add(hyphenCandidate)
   428  		}
   429  	}
   430  }
   431  
   432  // removeTrailingDigits removes all trailing digits from a string
   433  func removeTrailingDigits(s string) string {
   434  	re := regexp.MustCompile(`\d+$`)
   435  	return re.ReplaceAllString(s, "")
   436  }
   437  
   438  // addBinaryPackageDigitVariations adds variations with trailing digits removed for binary packages.For binary package types only, when the name ends with a digit, add a new variation with all suffix-digits removed (e.g. Qt5 -> Qt). This helps generate additional CPE permutations for better vulnerability matching.
   439  func addBinaryPackageDigitVariations(fields fieldCandidateSet) {
   440  	candidatesForVariations := fields.copy()
   441  	for _, candidate := range candidatesForVariations.values() {
   442  		// Check if the candidate ends with a digit
   443  		if len(candidate) > 0 && candidate[len(candidate)-1] >= '0' && candidate[len(candidate)-1] <= '9' {
   444  			// Create variation with all suffix digits removed
   445  			withoutDigits := removeTrailingDigits(candidate)
   446  			if withoutDigits != "" && withoutDigits != candidate {
   447  				fields.addValue(withoutDigits)
   448  			}
   449  		}
   450  	}
   451  }
   452  
   453  func endsWithNumber(s string) bool {
   454  	if len(s) == 0 {
   455  		return false
   456  	}
   457  	r := []rune(s)
   458  	last := r[len(r)-1]
   459  	return unicode.IsDigit(last)
   460  }