github.com/anchore/syft@v1.4.2-0.20240516191711-1bec1fc5d397/syft/pkg/cataloger/internal/cpegenerate/generate.go (about) 1 package cpegenerate 2 3 import ( 4 "bufio" 5 "bytes" 6 _ "embed" 7 "encoding/json" 8 "fmt" 9 "sort" 10 "strings" 11 "sync" 12 13 "github.com/scylladb/go-set/strset" 14 15 "github.com/anchore/syft/internal/log" 16 "github.com/anchore/syft/syft/cpe" 17 "github.com/anchore/syft/syft/pkg" 18 "github.com/anchore/syft/syft/pkg/cataloger/internal/cpegenerate/dictionary" 19 ) 20 21 // knownVendors contains vendor strings that are known to exist in 22 // the CPE database, so they will be preferred over other candidates: 23 var knownVendors = strset.New("apache") 24 25 func newCPE(product, vendor, version, targetSW string) *cpe.Attributes { 26 c := cpe.NewWithAny() 27 c.Part = "a" 28 c.Product = product 29 c.Vendor = vendor 30 c.Version = version 31 c.TargetSW = targetSW 32 if cpe.ValidateString(c.String()) != nil { 33 return nil 34 } 35 return &c 36 } 37 38 //go:embed dictionary/data/cpe-index.json 39 var indexedCPEDictionaryData []byte 40 41 var indexedCPEDictionary *dictionary.Indexed 42 var indexedCPEDictionaryOnce sync.Once 43 44 func GetIndexedDictionary() (_ *dictionary.Indexed, err error) { 45 indexedCPEDictionaryOnce.Do(func() { 46 err = json.Unmarshal(indexedCPEDictionaryData, &indexedCPEDictionary) 47 }) 48 49 if err != nil { 50 return 51 } 52 53 if indexedCPEDictionary == nil { 54 err = fmt.Errorf("failed to unmarshal indexed CPE dictionary") 55 return 56 } 57 58 return indexedCPEDictionary, err 59 } 60 61 func FromDictionaryFind(p pkg.Package) ([]cpe.CPE, bool) { 62 dict, err := GetIndexedDictionary() 63 parsedCPEs := []cpe.CPE{} 64 if err != nil { 65 log.Debugf("CPE dictionary lookup not available: %+v", err) 66 return parsedCPEs, false 67 } 68 69 var ( 70 cpes *dictionary.Set 71 ok bool 72 ) 73 74 switch p.Type { 75 case pkg.NpmPkg: 76 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemNPM][p.Name] 77 78 case pkg.GemPkg: 79 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemRubyGems][p.Name] 80 81 case pkg.PythonPkg: 82 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemPyPI][p.Name] 83 84 case pkg.JenkinsPluginPkg: 85 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemJenkinsPlugins][p.Name] 86 87 case pkg.RustPkg: 88 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemRustCrates][p.Name] 89 90 case pkg.PhpComposerPkg: 91 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemPHPComposer][p.Name] 92 93 case pkg.PhpPeclPkg: 94 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemPHPPecl][p.Name] 95 96 case pkg.GoModulePkg: 97 cpes, ok = dict.EcosystemPackages[dictionary.EcosystemGoModules][p.Name] 98 99 default: 100 // The dictionary doesn't support this package type yet. 101 return parsedCPEs, false 102 } 103 104 if !ok { 105 // The dictionary doesn't have a CPE for this package. 106 return parsedCPEs, false 107 } 108 109 for _, c := range cpes.List() { 110 parsedCPE, err := cpe.New(c, cpe.NVDDictionaryLookupSource) 111 if err != nil { 112 continue 113 } 114 115 parsedCPE.Attributes.Version = p.Version 116 parsedCPEs = append(parsedCPEs, parsedCPE) 117 } 118 119 if len(parsedCPEs) == 0 { 120 return []cpe.CPE{}, false 121 } 122 123 return parsedCPEs, true 124 } 125 126 // FromPackageAttributes Create a list of CPEs for a given package, trying to guess the vendor, product tuple. We should be trying to 127 // generate the minimal set of representative CPEs, which implies that optional fields should not be included 128 // (such as target SW). 129 func FromPackageAttributes(p pkg.Package) []cpe.CPE { 130 vendors := candidateVendors(p) 131 products := candidateProducts(p) 132 if len(products) == 0 { 133 return nil 134 } 135 136 keys := strset.New() 137 cpes := make([]cpe.Attributes, 0) 138 for _, product := range products { 139 for _, vendor := range vendors { 140 // prevent duplicate entries... 141 key := fmt.Sprintf("%s|%s|%s", product, vendor, p.Version) 142 if keys.Has(key) { 143 continue 144 } 145 keys.Add(key) 146 // add a new entry... 147 if c := newCPE(product, vendor, p.Version, cpe.Any); c != nil { 148 cpes = append(cpes, *c) 149 } 150 } 151 } 152 153 // filter out any known combinations that don't accurately represent this package 154 cpes = filter(cpes, p, cpeFilters...) 155 156 sort.Sort(cpe.BySpecificity(cpes)) 157 var result []cpe.CPE 158 for _, c := range cpes { 159 result = append(result, cpe.CPE{Attributes: c, Source: cpe.GeneratedSource}) 160 } 161 162 return result 163 } 164 165 //nolint:funlen 166 func candidateVendors(p pkg.Package) []string { 167 // in ecosystems where the packaging metadata does not have a clear field to indicate a vendor (or a field that 168 // could be interpreted indirectly as such) the project name tends to be a common stand in. Examples of this 169 // are the elasticsearch gem, xstream jar, and rack gem... all of these cases you can find vulnerabilities 170 // with CPEs where the vendor is the product name and doesn't appear to be derived from any available package 171 // metadata. 172 vendors := newFieldCandidateSet(candidateProducts(p)...) 173 174 switch p.Language { 175 case pkg.JavaScript: 176 // for JavaScript if we find node.js as a package then the vendor is "nodejs" 177 if p.Name == "node.js" { 178 vendors.addValue("nodejs") 179 } 180 case pkg.Ruby: 181 vendors.addValue("ruby-lang") 182 case pkg.Go: 183 // replace all candidates with only the golang-specific helper 184 vendors.clear() 185 186 vendor := candidateVendorForGo(p.Name) 187 if vendor != "" { 188 vendors.addValue(vendor) 189 } 190 } 191 192 switch p.Metadata.(type) { 193 case pkg.RpmDBEntry: 194 vendors.union(candidateVendorsForRPM(p)) 195 case pkg.RubyGemspec: 196 vendors.union(candidateVendorsForRuby(p)) 197 case pkg.PythonPackage: 198 vendors.union(candidateVendorsForPython(p)) 199 case pkg.JavaArchive: 200 vendors.union(candidateVendorsForJava(p)) 201 case pkg.ApkDBEntry: 202 vendors.union(candidateVendorsForAPK(p)) 203 case pkg.NpmPackage: 204 vendors.union(candidateVendorsForJavascript(p)) 205 case pkg.WordpressPluginEntry: 206 vendors.clear() 207 vendors.union(candidateVendorsForWordpressPlugin(p)) 208 } 209 210 // We should no longer be generating vendor candidates with these values ["" and "*"] 211 // (since CPEs will match any other value) 212 vendors.removeByValue("") 213 vendors.removeByValue("*") 214 215 // try swapping hyphens for underscores, vice versa, and removing separators altogether 216 addDelimiterVariations(vendors) 217 218 // generate sub-selections of each candidate based on separators (e.g. jenkins-ci -> [jenkins, jenkins-ci]) 219 addAllSubSelections(vendors) 220 221 // add more candidates based on the package info for each vendor candidate 222 for _, vendor := range vendors.uniqueValues() { 223 vendors.addValue(findAdditionalVendors(defaultCandidateAdditions, p.Type, p.Name, vendor)...) 224 } 225 226 // remove known mis 227 vendors.removeByValue(findVendorsToRemove(defaultCandidateRemovals, p.Type, p.Name)...) 228 229 uniqueVendors := vendors.uniqueValues() 230 231 // if any known vendor was detected, pick that one. 232 for _, vendor := range uniqueVendors { 233 if knownVendors.Has(vendor) { 234 return []string{vendor} 235 } 236 } 237 238 return uniqueVendors 239 } 240 241 func candidateProducts(p pkg.Package) []string { 242 products := newFieldCandidateSet(p.Name) 243 244 _, hasJavaMetadata := p.Metadata.(pkg.JavaArchive) 245 246 switch { 247 case p.Language == pkg.Python: 248 if !strings.HasPrefix(p.Name, "python") { 249 products.addValue("python-" + p.Name) 250 } 251 case p.Language == pkg.Java || hasJavaMetadata: 252 products.addValue(candidateProductsForJava(p)...) 253 case p.Language == pkg.Go: 254 // replace all candidates with only the golang-specific helper 255 products.clear() 256 257 prod := candidateProductForGo(p.Name) 258 if prod != "" { 259 products.addValue(prod) 260 } 261 } 262 263 if _, hasAPKMetadata := p.Metadata.(pkg.ApkDBEntry); hasAPKMetadata { 264 products.union(candidateProductsForAPK(p)) 265 } 266 267 if _, hasWordpressMetadata := p.Metadata.(pkg.WordpressPluginEntry); hasWordpressMetadata { 268 products.clear() 269 products.union(candidateProductsForWordpressPlugin(p)) 270 } 271 272 // it is never OK to have candidates with these values ["" and "*"] (since CPEs will match any other value) 273 products.removeByValue("") 274 products.removeByValue("*") 275 276 // try swapping hyphens for underscores, vice versa, and removing separators altogether 277 addDelimiterVariations(products) 278 279 // add known candidate additions 280 products.addValue(findAdditionalProducts(defaultCandidateAdditions, p.Type, p.Name)...) 281 282 // remove known candidate removals 283 products.removeByValue(findProductsToRemove(defaultCandidateRemovals, p.Type, p.Name)...) 284 285 return products.uniqueValues() 286 } 287 288 func addAllSubSelections(fields fieldCandidateSet) { 289 candidatesForVariations := fields.copy() 290 candidatesForVariations.removeWhere(subSelectionsDisallowed) 291 292 for _, candidate := range candidatesForVariations.values() { 293 fields.addValue(generateSubSelections(candidate)...) 294 } 295 } 296 297 // generateSubSelections attempts to split a field by hyphens and underscores and return a list of sensible sub-selections 298 // that can be used as product or vendor candidates. E.g. jenkins-ci-tools -> [jenkins-ci-tools, jenkins-ci, jenkins]. 299 func generateSubSelections(field string) (results []string) { 300 scanner := bufio.NewScanner(strings.NewReader(field)) 301 scanner.Split(scanByHyphenOrUnderscore) 302 var lastToken uint8 303 for scanner.Scan() { 304 rawCandidate := scanner.Text() 305 if len(rawCandidate) == 0 { 306 break 307 } 308 309 // trim any number of hyphen or underscore that is prefixed/suffixed on the given candidate. Since 310 // scanByHyphenOrUnderscore preserves delimiters (hyphens and underscores) they are guaranteed to be at least 311 // prefixed. 312 candidate := strings.TrimFunc(rawCandidate, trimHyphenOrUnderscore) 313 314 // capture the result (if there is content) 315 if len(candidate) > 0 { 316 if len(results) > 0 { 317 results = append(results, results[len(results)-1]+string(lastToken)+candidate) 318 } else { 319 results = append(results, candidate) 320 } 321 } 322 323 // keep track of the trailing separator for the next loop 324 lastToken = rawCandidate[len(rawCandidate)-1] 325 } 326 return results 327 } 328 329 // trimHyphenOrUnderscore is a character filter function for use with strings.TrimFunc in order to remove any hyphen or underscores. 330 func trimHyphenOrUnderscore(r rune) bool { 331 switch r { 332 case '-', '_': 333 return true 334 } 335 return false 336 } 337 338 // scanByHyphenOrUnderscore splits on hyphen or underscore and includes the separator in the split 339 func scanByHyphenOrUnderscore(data []byte, atEOF bool) (advance int, token []byte, err error) { 340 if atEOF && len(data) == 0 { 341 return 0, nil, nil 342 } 343 if i := bytes.IndexAny(data, "-_"); i >= 0 { 344 return i + 1, data[0 : i+1], nil 345 } 346 347 if atEOF { 348 return len(data), data, nil 349 } 350 351 return 0, nil, nil 352 } 353 354 func addDelimiterVariations(fields fieldCandidateSet) { 355 candidatesForVariations := fields.copy() 356 candidatesForVariations.removeWhere(delimiterVariationsDisallowed) 357 358 for _, candidate := range candidatesForVariations.list() { 359 field := candidate.value 360 hasHyphen := strings.Contains(field, "-") 361 hasUnderscore := strings.Contains(field, "_") 362 363 if hasHyphen { 364 // provide variations of hyphen candidates with an underscore 365 newValue := strings.ReplaceAll(field, "-", "_") 366 underscoreCandidate := candidate 367 underscoreCandidate.value = newValue 368 fields.add(underscoreCandidate) 369 } 370 371 if hasUnderscore { 372 // provide variations of underscore candidates with a hyphen 373 newValue := strings.ReplaceAll(field, "_", "-") 374 hyphenCandidate := candidate 375 hyphenCandidate.value = newValue 376 fields.add(hyphenCandidate) 377 } 378 } 379 }