github.com/lineaje-labs/syft@v0.98.1-0.20231227153149-9e393f60ff1b/syft/pkg/cataloger/common/cpe/generate.go (about)

     1  package cpe
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	_ "embed"
     7  	"encoding/json"
     8  	"fmt"
     9  	"sort"
    10  	"strings"
    11  	"sync"
    12  
    13  	"github.com/facebookincubator/nvdtools/wfn"
    14  	"github.com/scylladb/go-set/strset"
    15  
    16  	"github.com/anchore/syft/syft/cpe"
    17  	"github.com/anchore/syft/syft/pkg"
    18  	"github.com/anchore/syft/syft/pkg/cataloger/common/cpe/dictionary"
    19  	"github.com/lineaje-labs/syft/internal/log"
    20  )
    21  
// knownVendors is the set of vendor strings that are known to exist in the
// CPE database. When one of them appears among a package's vendor candidates,
// candidateVendors returns that vendor alone instead of the full candidate
// list.
var knownVendors = strset.New("apache")
    25  
    26  func newCPE(product, vendor, version, targetSW string) *wfn.Attributes {
    27  	c := *(wfn.NewAttributesWithAny())
    28  	c.Part = "a"
    29  	c.Product = product
    30  	c.Vendor = vendor
    31  	c.Version = version
    32  	c.TargetSW = targetSW
    33  	if cpe.ValidateString(cpe.String(c)) != nil {
    34  		return nil
    35  	}
    36  	return &c
    37  }
    38  
// indexedCPEDictionaryData holds the raw JSON bytes of the indexed CPE
// dictionary, embedded from dictionary/data/cpe-index.json at build time.
// GetIndexedDictionary decodes it on first use.
//
//go:embed dictionary/data/cpe-index.json
var indexedCPEDictionaryData []byte
    41  
    42  var indexedCPEDictionary *dictionary.Indexed
    43  var indexedCPEDictionaryOnce sync.Once
    44  
    45  func GetIndexedDictionary() (_ *dictionary.Indexed, err error) {
    46  	indexedCPEDictionaryOnce.Do(func() {
    47  		err = json.Unmarshal(indexedCPEDictionaryData, &indexedCPEDictionary)
    48  	})
    49  
    50  	if err != nil {
    51  		return
    52  	}
    53  
    54  	if indexedCPEDictionary == nil {
    55  		err = fmt.Errorf("failed to unmarshal indexed CPE dictionary")
    56  		return
    57  	}
    58  
    59  	return indexedCPEDictionary, err
    60  }
    61  
    62  func DictionaryFind(p pkg.Package) (cpe.CPE, bool) {
    63  	dict, err := GetIndexedDictionary()
    64  	if err != nil {
    65  		log.Debugf("dictionary CPE lookup not available: %+v", err)
    66  		return cpe.CPE{}, false
    67  	}
    68  
    69  	var (
    70  		cpeString string
    71  		ok        bool
    72  	)
    73  
    74  	switch p.Type {
    75  	case pkg.NpmPkg:
    76  		cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemNPM][p.Name]
    77  
    78  	case pkg.GemPkg:
    79  		cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemRubyGems][p.Name]
    80  
    81  	case pkg.PythonPkg:
    82  		cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemPyPI][p.Name]
    83  
    84  	case pkg.JenkinsPluginPkg:
    85  		cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemJenkinsPlugins][p.Name]
    86  
    87  	case pkg.RustPkg:
    88  		cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemRustCrates][p.Name]
    89  
    90  	default:
    91  		// The dictionary doesn't support this package type yet.
    92  		return cpe.CPE{}, false
    93  	}
    94  
    95  	if !ok {
    96  		// The dictionary doesn't have a CPE for this package.
    97  		return cpe.CPE{}, false
    98  	}
    99  
   100  	parsedCPE, err := cpe.New(cpeString)
   101  	if err != nil {
   102  		return cpe.CPE{}, false
   103  	}
   104  
   105  	parsedCPE.Version = p.Version
   106  
   107  	return parsedCPE, true
   108  }
   109  
   110  // Generate Create a list of CPEs for a given package, trying to guess the vendor, product tuple. We should be trying to
   111  // generate the minimal set of representative CPEs, which implies that optional fields should not be included
   112  // (such as target SW).
   113  func Generate(p pkg.Package) []cpe.CPE {
   114  	vendors := candidateVendors(p)
   115  	products := candidateProducts(p)
   116  	if len(products) == 0 {
   117  		return nil
   118  	}
   119  
   120  	keys := strset.New()
   121  	cpes := make([]cpe.CPE, 0)
   122  	for _, product := range products {
   123  		for _, vendor := range vendors {
   124  			// prevent duplicate entries...
   125  			key := fmt.Sprintf("%s|%s|%s", product, vendor, p.Version)
   126  			if keys.Has(key) {
   127  				continue
   128  			}
   129  			keys.Add(key)
   130  			// add a new entry...
   131  			if c := newCPE(product, vendor, p.Version, wfn.Any); c != nil {
   132  				cpes = append(cpes, *c)
   133  			}
   134  		}
   135  	}
   136  
   137  	// filter out any known combinations that don't accurately represent this package
   138  	cpes = filter(cpes, p, cpeFilters...)
   139  
   140  	sort.Sort(cpe.BySpecificity(cpes))
   141  
   142  	return cpes
   143  }
   144  
   145  func candidateVendors(p pkg.Package) []string {
   146  	// in ecosystems where the packaging metadata does not have a clear field to indicate a vendor (or a field that
   147  	// could be interpreted indirectly as such) the project name tends to be a common stand in. Examples of this
   148  	// are the elasticsearch gem, xstream jar, and rack gem... all of these cases you can find vulnerabilities
   149  	// with CPEs where the vendor is the product name and doesn't appear to be derived from any available package
   150  	// metadata.
   151  	vendors := newFieldCandidateSet(candidateProducts(p)...)
   152  
   153  	switch p.Language {
   154  	case pkg.JavaScript:
   155  		// for JavaScript if we find node.js as a package then the vendor is "nodejs"
   156  		if p.Name == "node.js" {
   157  			vendors.addValue("nodejs")
   158  		}
   159  	case pkg.Ruby:
   160  		vendors.addValue("ruby-lang")
   161  	case pkg.Go:
   162  		// replace all candidates with only the golang-specific helper
   163  		vendors.clear()
   164  
   165  		vendor := candidateVendorForGo(p.Name)
   166  		if vendor != "" {
   167  			vendors.addValue(vendor)
   168  		}
   169  	}
   170  
   171  	switch p.Metadata.(type) {
   172  	case pkg.RpmDBEntry:
   173  		vendors.union(candidateVendorsForRPM(p))
   174  	case pkg.RubyGemspec:
   175  		vendors.union(candidateVendorsForRuby(p))
   176  	case pkg.PythonPackage:
   177  		vendors.union(candidateVendorsForPython(p))
   178  	case pkg.JavaArchive:
   179  		vendors.union(candidateVendorsForJava(p))
   180  	case pkg.ApkDBEntry:
   181  		vendors.union(candidateVendorsForAPK(p))
   182  	case pkg.NpmPackage:
   183  		vendors.union(candidateVendorsForJavascript(p))
   184  	}
   185  
   186  	// We should no longer be generating vendor candidates with these values ["" and "*"]
   187  	// (since CPEs will match any other value)
   188  	vendors.removeByValue("")
   189  	vendors.removeByValue("*")
   190  
   191  	// try swapping hyphens for underscores, vice versa, and removing separators altogether
   192  	addDelimiterVariations(vendors)
   193  
   194  	// generate sub-selections of each candidate based on separators (e.g. jenkins-ci -> [jenkins, jenkins-ci])
   195  	addAllSubSelections(vendors)
   196  
   197  	// add more candidates based on the package info for each vendor candidate
   198  	for _, vendor := range vendors.uniqueValues() {
   199  		vendors.addValue(findAdditionalVendors(defaultCandidateAdditions, p.Type, p.Name, vendor)...)
   200  	}
   201  
   202  	// remove known mis
   203  	vendors.removeByValue(findVendorsToRemove(defaultCandidateRemovals, p.Type, p.Name)...)
   204  
   205  	uniqueVendors := vendors.uniqueValues()
   206  
   207  	// if any known vendor was detected, pick that one.
   208  	for _, vendor := range uniqueVendors {
   209  		if knownVendors.Has(vendor) {
   210  			return []string{vendor}
   211  		}
   212  	}
   213  
   214  	return uniqueVendors
   215  }
   216  
   217  func candidateProducts(p pkg.Package) []string {
   218  	products := newFieldCandidateSet(p.Name)
   219  
   220  	_, hasJavaMetadata := p.Metadata.(pkg.JavaArchive)
   221  
   222  	switch {
   223  	case p.Language == pkg.Python:
   224  		if !strings.HasPrefix(p.Name, "python") {
   225  			products.addValue("python-" + p.Name)
   226  		}
   227  	case p.Language == pkg.Java || hasJavaMetadata:
   228  		products.addValue(candidateProductsForJava(p)...)
   229  	case p.Language == pkg.Go:
   230  		// replace all candidates with only the golang-specific helper
   231  		products.clear()
   232  
   233  		prod := candidateProductForGo(p.Name)
   234  		if prod != "" {
   235  			products.addValue(prod)
   236  		}
   237  	}
   238  
   239  	if _, hasAPKMetadata := p.Metadata.(pkg.ApkDBEntry); hasAPKMetadata {
   240  		products.union(candidateProductsForAPK(p))
   241  	}
   242  
   243  	// it is never OK to have candidates with these values ["" and "*"] (since CPEs will match any other value)
   244  	products.removeByValue("")
   245  	products.removeByValue("*")
   246  
   247  	// try swapping hyphens for underscores, vice versa, and removing separators altogether
   248  	addDelimiterVariations(products)
   249  
   250  	// add known candidate additions
   251  	products.addValue(findAdditionalProducts(defaultCandidateAdditions, p.Type, p.Name)...)
   252  
   253  	// remove known candidate removals
   254  	products.removeByValue(findProductsToRemove(defaultCandidateRemovals, p.Type, p.Name)...)
   255  
   256  	return products.uniqueValues()
   257  }
   258  
   259  func addAllSubSelections(fields fieldCandidateSet) {
   260  	candidatesForVariations := fields.copy()
   261  	candidatesForVariations.removeWhere(subSelectionsDisallowed)
   262  
   263  	for _, candidate := range candidatesForVariations.values() {
   264  		fields.addValue(generateSubSelections(candidate)...)
   265  	}
   266  }
   267  
   268  // generateSubSelections attempts to split a field by hyphens and underscores and return a list of sensible sub-selections
   269  // that can be used as product or vendor candidates. E.g. jenkins-ci-tools -> [jenkins-ci-tools, jenkins-ci, jenkins].
   270  func generateSubSelections(field string) (results []string) {
   271  	scanner := bufio.NewScanner(strings.NewReader(field))
   272  	scanner.Split(scanByHyphenOrUnderscore)
   273  	var lastToken uint8
   274  	for scanner.Scan() {
   275  		rawCandidate := scanner.Text()
   276  		if len(rawCandidate) == 0 {
   277  			break
   278  		}
   279  
   280  		// trim any number of hyphen or underscore that is prefixed/suffixed on the given candidate. Since
   281  		// scanByHyphenOrUnderscore preserves delimiters (hyphens and underscores) they are guaranteed to be at least
   282  		// prefixed.
   283  		candidate := strings.TrimFunc(rawCandidate, trimHyphenOrUnderscore)
   284  
   285  		// capture the result (if there is content)
   286  		if len(candidate) > 0 {
   287  			if len(results) > 0 {
   288  				results = append(results, results[len(results)-1]+string(lastToken)+candidate)
   289  			} else {
   290  				results = append(results, candidate)
   291  			}
   292  		}
   293  
   294  		// keep track of the trailing separator for the next loop
   295  		lastToken = rawCandidate[len(rawCandidate)-1]
   296  	}
   297  	return results
   298  }
   299  
   300  // trimHyphenOrUnderscore is a character filter function for use with strings.TrimFunc in order to remove any hyphen or underscores.
   301  func trimHyphenOrUnderscore(r rune) bool {
   302  	switch r {
   303  	case '-', '_':
   304  		return true
   305  	}
   306  	return false
   307  }
   308  
   309  // scanByHyphenOrUnderscore splits on hyphen or underscore and includes the separator in the split
   310  func scanByHyphenOrUnderscore(data []byte, atEOF bool) (advance int, token []byte, err error) {
   311  	if atEOF && len(data) == 0 {
   312  		return 0, nil, nil
   313  	}
   314  	if i := bytes.IndexAny(data, "-_"); i >= 0 {
   315  		return i + 1, data[0 : i+1], nil
   316  	}
   317  
   318  	if atEOF {
   319  		return len(data), data, nil
   320  	}
   321  
   322  	return 0, nil, nil
   323  }
   324  
   325  func addDelimiterVariations(fields fieldCandidateSet) {
   326  	candidatesForVariations := fields.copy()
   327  	candidatesForVariations.removeWhere(delimiterVariationsDisallowed)
   328  
   329  	for _, candidate := range candidatesForVariations.list() {
   330  		field := candidate.value
   331  		hasHyphen := strings.Contains(field, "-")
   332  		hasUnderscore := strings.Contains(field, "_")
   333  
   334  		if hasHyphen {
   335  			// provide variations of hyphen candidates with an underscore
   336  			newValue := strings.ReplaceAll(field, "-", "_")
   337  			underscoreCandidate := candidate
   338  			underscoreCandidate.value = newValue
   339  			fields.add(underscoreCandidate)
   340  		}
   341  
   342  		if hasUnderscore {
   343  			// provide variations of underscore candidates with a hyphen
   344  			newValue := strings.ReplaceAll(field, "_", "-")
   345  			hyphenCandidate := candidate
   346  			hyphenCandidate.value = newValue
   347  			fields.add(hyphenCandidate)
   348  		}
   349  	}
   350  }