github.com/noqcks/syft@v0.0.0-20230920222752-a9e2c4e288e5/syft/pkg/cataloger/common/cpe/generate.go (about)

     1  package cpe
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	_ "embed"
     7  	"encoding/json"
     8  	"fmt"
     9  	"sort"
    10  	"strings"
    11  	"sync"
    12  
    13  	"github.com/facebookincubator/nvdtools/wfn"
    14  	"github.com/scylladb/go-set/strset"
    15  
    16  	"github.com/anchore/syft/internal"
    17  	"github.com/anchore/syft/internal/log"
    18  	"github.com/anchore/syft/syft/cpe"
    19  	"github.com/anchore/syft/syft/pkg"
    20  	"github.com/anchore/syft/syft/pkg/cataloger/common/cpe/dictionary"
    21  )
    22  
// knownVendors contains vendor strings that are known to exist in
// the CPE database, so they will be preferred over other candidates:
// when candidateVendors finds one of these values among its unique vendor
// candidates, it returns that single vendor instead of the full candidate list.
var knownVendors = strset.New("apache")
    26  
    27  func newCPE(product, vendor, version, targetSW string) *wfn.Attributes {
    28  	c := *(wfn.NewAttributesWithAny())
    29  	c.Part = "a"
    30  	c.Product = product
    31  	c.Vendor = vendor
    32  	c.Version = version
    33  	c.TargetSW = targetSW
    34  	if cpe.ValidateString(cpe.String(c)) != nil {
    35  		return nil
    36  	}
    37  	return &c
    38  }
    39  
// indexedCPEDictionaryData holds the raw bytes of the JSON CPE index that is
// embedded into the binary at build time.
//
//go:embed dictionary/data/cpe-index.json
var indexedCPEDictionaryData []byte

// indexedCPEDictionary is the decoded form of indexedCPEDictionaryData. It is
// populated by GetIndexedDictionary and stays nil until then.
var indexedCPEDictionary *dictionary.Indexed

// indexedCPEDictionaryOnce ensures the embedded index is decoded at most once.
var indexedCPEDictionaryOnce sync.Once
    45  
    46  func GetIndexedDictionary() (_ *dictionary.Indexed, err error) {
    47  	indexedCPEDictionaryOnce.Do(func() {
    48  		err = json.Unmarshal(indexedCPEDictionaryData, &indexedCPEDictionary)
    49  	})
    50  
    51  	if err != nil {
    52  		return
    53  	}
    54  
    55  	if indexedCPEDictionary == nil {
    56  		err = fmt.Errorf("failed to unmarshal indexed CPE dictionary")
    57  		return
    58  	}
    59  
    60  	return indexedCPEDictionary, err
    61  }
    62  
    63  func DictionaryFind(p pkg.Package) (cpe.CPE, bool) {
    64  	dict, err := GetIndexedDictionary()
    65  	if err != nil {
    66  		log.Debugf("dictionary CPE lookup not available: %+v", err)
    67  		return cpe.CPE{}, false
    68  	}
    69  
    70  	var (
    71  		cpeString string
    72  		ok        bool
    73  	)
    74  
    75  	switch p.Type {
    76  	case pkg.NpmPkg:
    77  		cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemNPM][p.Name]
    78  
    79  	case pkg.GemPkg:
    80  		cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemRubyGems][p.Name]
    81  
    82  	case pkg.PythonPkg:
    83  		cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemPyPI][p.Name]
    84  
    85  	case pkg.JenkinsPluginPkg:
    86  		cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemJenkinsPlugins][p.Name]
    87  
    88  	case pkg.RustPkg:
    89  		cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemRustCrates][p.Name]
    90  
    91  	default:
    92  		// The dictionary doesn't support this package type yet.
    93  		return cpe.CPE{}, false
    94  	}
    95  
    96  	if !ok {
    97  		// The dictionary doesn't have a CPE for this package.
    98  		return cpe.CPE{}, false
    99  	}
   100  
   101  	parsedCPE, err := cpe.New(cpeString)
   102  	if err != nil {
   103  		return cpe.CPE{}, false
   104  	}
   105  
   106  	parsedCPE.Version = p.Version
   107  
   108  	return parsedCPE, true
   109  }
   110  
   111  // Generate Create a list of CPEs for a given package, trying to guess the vendor, product tuple. We should be trying to
   112  // generate the minimal set of representative CPEs, which implies that optional fields should not be included
   113  // (such as target SW).
   114  func Generate(p pkg.Package) []cpe.CPE {
   115  	vendors := candidateVendors(p)
   116  	products := candidateProducts(p)
   117  	if len(products) == 0 {
   118  		return nil
   119  	}
   120  
   121  	keys := internal.NewStringSet()
   122  	cpes := make([]cpe.CPE, 0)
   123  	for _, product := range products {
   124  		for _, vendor := range vendors {
   125  			// prevent duplicate entries...
   126  			key := fmt.Sprintf("%s|%s|%s", product, vendor, p.Version)
   127  			if keys.Contains(key) {
   128  				continue
   129  			}
   130  			keys.Add(key)
   131  			// add a new entry...
   132  			if c := newCPE(product, vendor, p.Version, wfn.Any); c != nil {
   133  				cpes = append(cpes, *c)
   134  			}
   135  		}
   136  	}
   137  
   138  	// filter out any known combinations that don't accurately represent this package
   139  	cpes = filter(cpes, p, cpeFilters...)
   140  
   141  	sort.Sort(cpe.BySpecificity(cpes))
   142  
   143  	return cpes
   144  }
   145  
   146  func candidateVendors(p pkg.Package) []string {
   147  	// in ecosystems where the packaging metadata does not have a clear field to indicate a vendor (or a field that
   148  	// could be interpreted indirectly as such) the project name tends to be a common stand in. Examples of this
   149  	// are the elasticsearch gem, xstream jar, and rack gem... all of these cases you can find vulnerabilities
   150  	// with CPEs where the vendor is the product name and doesn't appear to be derived from any available package
   151  	// metadata.
   152  	vendors := newFieldCandidateSet(candidateProducts(p)...)
   153  
   154  	switch p.Language {
   155  	case pkg.JavaScript:
   156  		// for JavaScript if we find node.js as a package then the vendor is "nodejs"
   157  		if p.Name == "node.js" {
   158  			vendors.addValue("nodejs")
   159  		}
   160  	case pkg.Ruby:
   161  		vendors.addValue("ruby-lang")
   162  	case pkg.Go:
   163  		// replace all candidates with only the golang-specific helper
   164  		vendors.clear()
   165  
   166  		vendor := candidateVendorForGo(p.Name)
   167  		if vendor != "" {
   168  			vendors.addValue(vendor)
   169  		}
   170  	}
   171  
   172  	switch p.MetadataType {
   173  	case pkg.RpmMetadataType:
   174  		vendors.union(candidateVendorsForRPM(p))
   175  	case pkg.GemMetadataType:
   176  		vendors.union(candidateVendorsForRuby(p))
   177  	case pkg.PythonPackageMetadataType:
   178  		vendors.union(candidateVendorsForPython(p))
   179  	case pkg.JavaMetadataType:
   180  		vendors.union(candidateVendorsForJava(p))
   181  	case pkg.ApkMetadataType:
   182  		vendors.union(candidateVendorsForAPK(p))
   183  	case pkg.NpmPackageJSONMetadataType:
   184  		vendors.union(candidateVendorsForJavaScript(p))
   185  	}
   186  
   187  	// We should no longer be generating vendor candidates with these values ["" and "*"]
   188  	// (since CPEs will match any other value)
   189  	vendors.removeByValue("")
   190  	vendors.removeByValue("*")
   191  
   192  	// try swapping hyphens for underscores, vice versa, and removing separators altogether
   193  	addDelimiterVariations(vendors)
   194  
   195  	// generate sub-selections of each candidate based on separators (e.g. jenkins-ci -> [jenkins, jenkins-ci])
   196  	addAllSubSelections(vendors)
   197  
   198  	// add more candidates based on the package info for each vendor candidate
   199  	for _, vendor := range vendors.uniqueValues() {
   200  		vendors.addValue(findAdditionalVendors(defaultCandidateAdditions, p.Type, p.Name, vendor)...)
   201  	}
   202  
   203  	// remove known mis
   204  	vendors.removeByValue(findVendorsToRemove(defaultCandidateRemovals, p.Type, p.Name)...)
   205  
   206  	uniqueVendors := vendors.uniqueValues()
   207  
   208  	// if any known vendor was detected, pick that one.
   209  	for _, vendor := range uniqueVendors {
   210  		if knownVendors.Has(vendor) {
   211  			return []string{vendor}
   212  		}
   213  	}
   214  
   215  	return uniqueVendors
   216  }
   217  
   218  func candidateProducts(p pkg.Package) []string {
   219  	products := newFieldCandidateSet(p.Name)
   220  
   221  	switch {
   222  	case p.Language == pkg.Python:
   223  		if !strings.HasPrefix(p.Name, "python") {
   224  			products.addValue("python-" + p.Name)
   225  		}
   226  	case p.Language == pkg.Java || p.MetadataType == pkg.JavaMetadataType:
   227  		products.addValue(candidateProductsForJava(p)...)
   228  	case p.Language == pkg.Go:
   229  		// replace all candidates with only the golang-specific helper
   230  		products.clear()
   231  
   232  		prod := candidateProductForGo(p.Name)
   233  		if prod != "" {
   234  			products.addValue(prod)
   235  		}
   236  	}
   237  
   238  	if p.MetadataType == pkg.ApkMetadataType {
   239  		products.union(candidateProductsForAPK(p))
   240  	}
   241  
   242  	// it is never OK to have candidates with these values ["" and "*"] (since CPEs will match any other value)
   243  	products.removeByValue("")
   244  	products.removeByValue("*")
   245  
   246  	// try swapping hyphens for underscores, vice versa, and removing separators altogether
   247  	addDelimiterVariations(products)
   248  
   249  	// add known candidate additions
   250  	products.addValue(findAdditionalProducts(defaultCandidateAdditions, p.Type, p.Name)...)
   251  
   252  	// remove known candidate removals
   253  	products.removeByValue(findProductsToRemove(defaultCandidateRemovals, p.Type, p.Name)...)
   254  
   255  	return products.uniqueValues()
   256  }
   257  
   258  func addAllSubSelections(fields fieldCandidateSet) {
   259  	candidatesForVariations := fields.copy()
   260  	candidatesForVariations.removeWhere(subSelectionsDisallowed)
   261  
   262  	for _, candidate := range candidatesForVariations.values() {
   263  		fields.addValue(generateSubSelections(candidate)...)
   264  	}
   265  }
   266  
   267  // generateSubSelections attempts to split a field by hyphens and underscores and return a list of sensible sub-selections
   268  // that can be used as product or vendor candidates. E.g. jenkins-ci-tools -> [jenkins-ci-tools, jenkins-ci, jenkins].
   269  func generateSubSelections(field string) (results []string) {
   270  	scanner := bufio.NewScanner(strings.NewReader(field))
   271  	scanner.Split(scanByHyphenOrUnderscore)
   272  	var lastToken uint8
   273  	for scanner.Scan() {
   274  		rawCandidate := scanner.Text()
   275  		if len(rawCandidate) == 0 {
   276  			break
   277  		}
   278  
   279  		// trim any number of hyphen or underscore that is prefixed/suffixed on the given candidate. Since
   280  		// scanByHyphenOrUnderscore preserves delimiters (hyphens and underscores) they are guaranteed to be at least
   281  		// prefixed.
   282  		candidate := strings.TrimFunc(rawCandidate, trimHyphenOrUnderscore)
   283  
   284  		// capture the result (if there is content)
   285  		if len(candidate) > 0 {
   286  			if len(results) > 0 {
   287  				results = append(results, results[len(results)-1]+string(lastToken)+candidate)
   288  			} else {
   289  				results = append(results, candidate)
   290  			}
   291  		}
   292  
   293  		// keep track of the trailing separator for the next loop
   294  		lastToken = rawCandidate[len(rawCandidate)-1]
   295  	}
   296  	return results
   297  }
   298  
   299  // trimHyphenOrUnderscore is a character filter function for use with strings.TrimFunc in order to remove any hyphen or underscores.
   300  func trimHyphenOrUnderscore(r rune) bool {
   301  	switch r {
   302  	case '-', '_':
   303  		return true
   304  	}
   305  	return false
   306  }
   307  
   308  // scanByHyphenOrUnderscore splits on hyphen or underscore and includes the separator in the split
   309  func scanByHyphenOrUnderscore(data []byte, atEOF bool) (advance int, token []byte, err error) {
   310  	if atEOF && len(data) == 0 {
   311  		return 0, nil, nil
   312  	}
   313  	if i := bytes.IndexAny(data, "-_"); i >= 0 {
   314  		return i + 1, data[0 : i+1], nil
   315  	}
   316  
   317  	if atEOF {
   318  		return len(data), data, nil
   319  	}
   320  
   321  	return 0, nil, nil
   322  }
   323  
   324  func addDelimiterVariations(fields fieldCandidateSet) {
   325  	candidatesForVariations := fields.copy()
   326  	candidatesForVariations.removeWhere(delimiterVariationsDisallowed)
   327  
   328  	for _, candidate := range candidatesForVariations.list() {
   329  		field := candidate.value
   330  		hasHyphen := strings.Contains(field, "-")
   331  		hasUnderscore := strings.Contains(field, "_")
   332  
   333  		if hasHyphen {
   334  			// provide variations of hyphen candidates with an underscore
   335  			newValue := strings.ReplaceAll(field, "-", "_")
   336  			underscoreCandidate := candidate
   337  			underscoreCandidate.value = newValue
   338  			fields.add(underscoreCandidate)
   339  		}
   340  
   341  		if hasUnderscore {
   342  			// provide variations of underscore candidates with a hyphen
   343  			newValue := strings.ReplaceAll(field, "_", "-")
   344  			hyphenCandidate := candidate
   345  			hyphenCandidate.value = newValue
   346  			fields.add(hyphenCandidate)
   347  		}
   348  	}
   349  }