github.com/anchore/syft@v1.38.2/syft/pkg/cataloger/internal/cpegenerate/generate.go (about) 1 package cpegenerate 2 3 import ( 4 "bufio" 5 "bytes" 6 _ "embed" 7 "encoding/json" 8 "fmt" 9 "regexp" 10 "sort" 11 "strings" 12 "sync" 13 "unicode" 14 15 "github.com/scylladb/go-set/strset" 16 17 "github.com/anchore/syft/internal/log" 18 "github.com/anchore/syft/syft/cpe" 19 "github.com/anchore/syft/syft/pkg" 20 "github.com/anchore/syft/syft/pkg/cataloger/internal/cpegenerate/dictionary" 21 ) 22 23 // knownVendors contains vendor strings that are known to exist in 24 // the CPE database, so they will be preferred over other candidates: 25 var knownVendors = strset.New("apache") 26 27 func newCPE(product, vendor, version, targetSW string) *cpe.Attributes { 28 c := cpe.NewWithAny() 29 c.Part = "a" 30 c.Product = product 31 c.Vendor = vendor 32 c.Version = version 33 c.TargetSW = targetSW 34 if cpe.ValidateString(c.String()) != nil { 35 return nil 36 } 37 return &c 38 } 39 40 //go:embed dictionary/data/cpe-index.json 41 var indexedCPEDictionaryData []byte 42 43 var indexedCPEDictionary *dictionary.Indexed 44 var indexedCPEDictionaryOnce sync.Once 45 46 func GetIndexedDictionary() (_ *dictionary.Indexed, err error) { 47 indexedCPEDictionaryOnce.Do(func() { 48 err = json.Unmarshal(indexedCPEDictionaryData, &indexedCPEDictionary) 49 }) 50 51 if err != nil { 52 return 53 } 54 55 if indexedCPEDictionary == nil { 56 err = fmt.Errorf("failed to unmarshal indexed CPE dictionary") 57 return 58 } 59 60 return indexedCPEDictionary, err 61 } 62 63 func FromDictionaryFind(p pkg.Package) ([]cpe.CPE, bool) { 64 dict, err := GetIndexedDictionary() 65 if err != nil { 66 log.Debugf("CPE dictionary lookup not available: %+v", err) 67 return []cpe.CPE{}, false 68 } 69 70 var cpes *dictionary.Set 71 var ok bool 72 73 switch p.Type { 74 case pkg.NpmPkg: 75 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemNPM][p.Name] 76 77 case pkg.GemPkg: 78 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemRubyGems][p.Name] 79 80 case pkg.PythonPkg: 81 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemPyPI][p.Name] 82 83 case pkg.JenkinsPluginPkg: 84 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemJenkinsPlugins][p.Name] 85 86 case pkg.RustPkg: 87 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemRustCrates][p.Name] 88 89 case pkg.PhpComposerPkg: 90 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemPHPComposer][p.Name] 91 92 case pkg.PhpPeclPkg: 93 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemPHPPecl][p.Name] 94 95 case pkg.GoModulePkg: 96 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemGoModules][p.Name] 97 98 case pkg.WordpressPluginPkg: 99 metadata, valid := p.Metadata.(pkg.WordpressPluginEntry) 100 if !valid { 101 return nil, false 102 } 103 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemWordpressPlugins][metadata.PluginInstallDirectory] 104 105 case pkg.ModelPkg: 106 // ML models should not have CPEs as they are not traditional software packages 107 // and don't fit the vulnerability model used for software packages. 108 return nil, false 109 default: 110 // The dictionary doesn't support this package type yet. 111 return nil, false 112 } 113 114 if !ok { 115 // The dictionary doesn't have a CPE for this package. 116 return []cpe.CPE{}, false 117 } 118 119 parsedCPEs := []cpe.CPE{} 120 for _, c := range cpes.List() { 121 parsedCPE, err := cpe.New(c, cpe.NVDDictionaryLookupSource) 122 if err != nil { 123 continue 124 } 125 126 parsedCPE.Attributes.Version = p.Version 127 parsedCPEs = append(parsedCPEs, parsedCPE) 128 } 129 130 if len(parsedCPEs) == 0 { 131 return nil, false 132 } 133 134 sort.Sort(cpe.BySourceThenSpecificity(parsedCPEs)) 135 return parsedCPEs, true 136 } 137 138 // FromPackageAttributes Create a list of CPEs for a given package, trying to guess the vendor, product tuple. We should be trying to 139 // generate the minimal set of representative CPEs, which implies that optional fields should not be included 140 // (such as target SW). 141 func FromPackageAttributes(p pkg.Package) []cpe.CPE { 142 // ML models should not have CPEs as they are not traditional software packages 143 // and don't fit the vulnerability model used for software packages. 144 if p.Type == pkg.ModelPkg { 145 return nil 146 } 147 148 vendors := candidateVendors(p) 149 products := candidateProducts(p) 150 targetSWs := candidateTargetSw(p) 151 if len(products) == 0 { 152 return nil 153 } 154 155 keys := strset.New() 156 cpes := make([]cpe.Attributes, 0) 157 for _, ts := range targetSWs { 158 for _, product := range products { 159 for _, vendor := range vendors { 160 // prevent duplicate entries... 161 key := fmt.Sprintf("%s|%s|%s|%s", product, vendor, p.Version, ts) 162 if keys.Has(key) { 163 continue 164 } 165 keys.Add(key) 166 // add a new entry... 167 if c := newCPE(product, vendor, p.Version, ts); c != nil { 168 cpes = append(cpes, *c) 169 } 170 } 171 } 172 } 173 174 // filter out any known combinations that don't accurately represent this package 175 cpes = filter(cpes, p, cpeFilters...) 176 177 var result []cpe.CPE 178 for _, c := range cpes { 179 result = append(result, cpe.CPE{Attributes: c, Source: cpe.GeneratedSource}) 180 } 181 182 sort.Sort(cpe.BySourceThenSpecificity(result)) 183 return result 184 } 185 186 func candidateTargetSw(p pkg.Package) []string { 187 if p.Type == pkg.WordpressPluginPkg { 188 return []string{"wordpress"} 189 } 190 return []string{cpe.Any} 191 } 192 193 func candidateVendors(p pkg.Package) []string { 194 // in ecosystems where the packaging metadata does not have a clear field to indicate a vendor (or a field that 195 // could be interpreted indirectly as such) the project name tends to be a common stand in. Examples of this 196 // are the elasticsearch gem, xstream jar, and rack gem... all of these cases you can find vulnerabilities 197 // with CPEs where the vendor is the product name and doesn't appear to be derived from any available package 198 // metadata. 199 vendors := newFieldCandidateSet() 200 vendors.union(candidateProductSet(p)) 201 202 switch p.Language { 203 case pkg.JavaScript: 204 // for JavaScript if we find node.js as a package then the vendor is "nodejs" 205 if p.Name == "node.js" { 206 vendors.addValue("nodejs") 207 } 208 case pkg.Ruby: 209 vendors.addValue("ruby-lang") 210 case pkg.Go: 211 // replace all candidates with only the golang-specific helper 212 vendors.clear() 213 214 vendor := candidateVendorForGo(p.Name) 215 if vendor != "" { 216 vendors.addValue(vendor) 217 } 218 } 219 220 switch p.Metadata.(type) { 221 case pkg.DotnetDepsEntry, pkg.DotnetPackagesLockEntry, pkg.DotnetPortableExecutableEntry: 222 vendors.clear() 223 vendors.union(candidateVendorsForDotnet(p)) 224 case pkg.RpmDBEntry, pkg.RpmArchive: 225 vendors.union(candidateVendorsForRPM(p)) 226 case pkg.RubyGemspec: 227 vendors.union(candidateVendorsForRuby(p)) 228 case pkg.PythonPackage: 229 vendors.union(candidateVendorsForPython(p)) 230 case pkg.JavaArchive: 231 vendors.union(candidateVendorsForJava(p)) 232 case pkg.ApkDBEntry: 233 vendors.union(candidateVendorsForAPK(p)) 234 case pkg.NpmPackage: 235 vendors.union(candidateVendorsForJavascript(p)) 236 case pkg.PEBinary: 237 // Add PE-specific vendor hints (e.g. ghostscript -> artifex) 238 vendors.union(candidateVendorsForPE(p)) 239 case pkg.WordpressPluginEntry: 240 vendors.clear() 241 vendors.union(candidateVendorsForWordpressPlugin(p)) 242 } 243 244 if p.Type == pkg.BinaryPkg && endsWithNumber(p.Name) { 245 // add binary package digit-suffix variations (e.g. Qt5 -> Qt) 246 addBinaryPackageDigitVariations(vendors) 247 } 248 249 // We should no longer be generating vendor candidates with these values ["" and "*"] 250 // (since CPEs will match any other value) 251 vendors.removeByValue("") 252 vendors.removeByValue("*") 253 254 // try swapping hyphens for underscores, vice versa, and removing separators altogether 255 addDelimiterVariations(vendors) 256 257 // generate sub-selections of each candidate based on separators (e.g. jenkins-ci -> [jenkins, jenkins-ci]) 258 addAllSubSelections(vendors) 259 260 // add more candidates based on the package info for each vendor candidate 261 for _, vendor := range vendors.uniqueValues() { 262 vendors.addValue(findAdditionalVendors(defaultCandidateAdditions, p.Type, p.Name, vendor)...) 263 } 264 265 // remove known mis 266 vendors.removeByValue(findVendorsToRemove(defaultCandidateRemovals, p.Type, p.Name)...) 267 268 uniqueVendors := vendors.uniqueValues() 269 270 // if any known vendor was detected, pick that one. 271 for _, vendor := range uniqueVendors { 272 if knownVendors.Has(vendor) { 273 return []string{vendor} 274 } 275 } 276 277 return uniqueVendors 278 } 279 280 func candidateProducts(p pkg.Package) []string { 281 return candidateProductSet(p).uniqueValues() 282 } 283 284 func candidateProductSet(p pkg.Package) fieldCandidateSet { 285 products := newFieldCandidateSet(p.Name) 286 287 _, hasJavaMetadata := p.Metadata.(pkg.JavaArchive) 288 289 switch { 290 case p.Language == pkg.Dotnet || p.Type == pkg.DotnetPkg: 291 products.clear() 292 products.union(candidateProductsForDotnet(p)) 293 case p.Language == pkg.Python || p.Type == pkg.PythonPkg: 294 if !strings.HasPrefix(p.Name, "python") { 295 products.addValue("python-" + p.Name) 296 } 297 case p.Language == pkg.Java || hasJavaMetadata || p.Type == pkg.JavaPkg: 298 products.addValue(candidateProductsForJava(p)...) 299 case p.Language == pkg.Go || p.Type == pkg.GoModulePkg: 300 // replace all candidates with only the golang-specific helper 301 products.clear() 302 303 prod := candidateProductForGo(p.Name) 304 if prod != "" { 305 products.addValue(prod) 306 } 307 case p.Type == pkg.BinaryPkg && endsWithNumber(p.Name): 308 // add binary package digit-suffix variations (e.g. Qt5 -> Qt) 309 addBinaryPackageDigitVariations(products) 310 } 311 312 switch p.Metadata.(type) { 313 case pkg.ApkDBEntry: 314 products.union(candidateProductsForAPK(p)) 315 case pkg.PEBinary: 316 // Add PE-specific product hints (e.g. ghostscript) 317 products.union(candidateProductsForPE(p)) 318 case pkg.WordpressPluginEntry: 319 products.clear() 320 products.union(candidateProductsForWordpressPlugin(p)) 321 } 322 323 // it is never OK to have candidates with these values ["" and "*"] (since CPEs will match any other value) 324 products.removeByValue("") 325 products.removeByValue("*") 326 327 // try swapping hyphens for underscores, vice versa, and removing separators altogether 328 addDelimiterVariations(products) 329 330 // add known candidate additions 331 products.addValue(findAdditionalProducts(defaultCandidateAdditions, p.Type, p.Name)...) 332 333 // remove known candidate removals 334 products.removeByValue(findProductsToRemove(defaultCandidateRemovals, p.Type, p.Name)...) 335 336 return products 337 } 338 339 func addAllSubSelections(fields fieldCandidateSet) { 340 candidatesForVariations := fields.copy() 341 candidatesForVariations.removeWhere(subSelectionsDisallowed) 342 343 for _, candidate := range candidatesForVariations.values() { 344 fields.addValue(generateSubSelections(candidate)...) 345 } 346 } 347 348 // generateSubSelections attempts to split a field by hyphens and underscores and return a list of sensible sub-selections 349 // that can be used as product or vendor candidates. E.g. jenkins-ci-tools -> [jenkins-ci-tools, jenkins-ci, jenkins]. 350 func generateSubSelections(field string) (results []string) { 351 scanner := bufio.NewScanner(strings.NewReader(field)) 352 scanner.Split(scanByHyphenOrUnderscore) 353 var lastToken uint8 354 for scanner.Scan() { 355 rawCandidate := scanner.Text() 356 if len(rawCandidate) == 0 { 357 break 358 } 359 360 // trim any number of hyphen or underscore that is prefixed/suffixed on the given candidate. Since 361 // scanByHyphenOrUnderscore preserves delimiters (hyphens and underscores) they are guaranteed to be at least 362 // prefixed. 363 candidate := strings.TrimFunc(rawCandidate, trimHyphenOrUnderscore) 364 365 // capture the result (if there is content) 366 if len(candidate) > 0 { 367 if len(results) > 0 { 368 results = append(results, results[len(results)-1]+string(lastToken)+candidate) 369 } else { 370 results = append(results, candidate) 371 } 372 } 373 374 // keep track of the trailing separator for the next loop 375 lastToken = rawCandidate[len(rawCandidate)-1] 376 } 377 return results 378 } 379 380 // trimHyphenOrUnderscore is a character filter function for use with strings.TrimFunc in order to remove any hyphen or underscores. 381 func trimHyphenOrUnderscore(r rune) bool { 382 switch r { 383 case '-', '_': 384 return true 385 } 386 return false 387 } 388 389 // scanByHyphenOrUnderscore splits on hyphen or underscore and includes the separator in the split 390 func scanByHyphenOrUnderscore(data []byte, atEOF bool) (advance int, token []byte, err error) { 391 if atEOF && len(data) == 0 { 392 return 0, nil, nil 393 } 394 if i := bytes.IndexAny(data, "-_"); i >= 0 { 395 return i + 1, data[0 : i+1], nil 396 } 397 398 if atEOF { 399 return len(data), data, nil 400 } 401 402 return 0, nil, nil 403 } 404 405 func addDelimiterVariations(fields fieldCandidateSet) { 406 candidatesForVariations := fields.copy() 407 candidatesForVariations.removeWhere(delimiterVariationsDisallowed) 408 409 for _, candidate := range candidatesForVariations.list() { 410 field := candidate.value 411 hasHyphen := strings.Contains(field, "-") 412 hasUnderscore := strings.Contains(field, "_") 413 414 if hasHyphen { 415 // provide variations of hyphen candidates with an underscore 416 newValue := strings.ReplaceAll(field, "-", "_") 417 underscoreCandidate := candidate 418 underscoreCandidate.value = newValue 419 fields.add(underscoreCandidate) 420 } 421 422 if hasUnderscore { 423 // provide variations of underscore candidates with a hyphen 424 newValue := strings.ReplaceAll(field, "_", "-") 425 hyphenCandidate := candidate 426 hyphenCandidate.value = newValue 427 fields.add(hyphenCandidate) 428 } 429 } 430 } 431 432 // removeTrailingDigits removes all trailing digits from a string 433 func removeTrailingDigits(s string) string { 434 re := regexp.MustCompile(`\d+$`) 435 return re.ReplaceAllString(s, "") 436 } 437 438 // addBinaryPackageDigitVariations adds variations with trailing digits removed for binary packages.For binary package types only, when the name ends with a digit, add a new variation with all suffix-digits removed (e.g. Qt5 -> Qt). This helps generate additional CPE permutations for better vulnerability matching. 439 func addBinaryPackageDigitVariations(fields fieldCandidateSet) { 440 candidatesForVariations := fields.copy() 441 for _, candidate := range candidatesForVariations.values() { 442 // Check if the candidate ends with a digit 443 if len(candidate) > 0 && candidate[len(candidate)-1] >= '0' && candidate[len(candidate)-1] <= '9' { 444 // Create variation with all suffix digits removed 445 withoutDigits := removeTrailingDigits(candidate) 446 if withoutDigits != "" && withoutDigits != candidate { 447 fields.addValue(withoutDigits) 448 } 449 } 450 } 451 } 452 453 func endsWithNumber(s string) bool { 454 if len(s) == 0 { 455 return false 456 } 457 r := []rune(s) 458 last := r[len(r)-1] 459 return unicode.IsDigit(last) 460 }