github.com/noqcks/syft@v0.0.0-20230920222752-a9e2c4e288e5/syft/pkg/cataloger/common/cpe/generate.go (about) 1 package cpe 2 3 import ( 4 "bufio" 5 "bytes" 6 _ "embed" 7 "encoding/json" 8 "fmt" 9 "sort" 10 "strings" 11 "sync" 12 13 "github.com/facebookincubator/nvdtools/wfn" 14 "github.com/scylladb/go-set/strset" 15 16 "github.com/anchore/syft/internal" 17 "github.com/anchore/syft/internal/log" 18 "github.com/anchore/syft/syft/cpe" 19 "github.com/anchore/syft/syft/pkg" 20 "github.com/anchore/syft/syft/pkg/cataloger/common/cpe/dictionary" 21 ) 22 23 // knownVendors contains vendor strings that are known to exist in 24 // the CPE database, so they will be preferred over other candidates: 25 var knownVendors = strset.New("apache") 26 27 func newCPE(product, vendor, version, targetSW string) *wfn.Attributes { 28 c := *(wfn.NewAttributesWithAny()) 29 c.Part = "a" 30 c.Product = product 31 c.Vendor = vendor 32 c.Version = version 33 c.TargetSW = targetSW 34 if cpe.ValidateString(cpe.String(c)) != nil { 35 return nil 36 } 37 return &c 38 } 39 40 //go:embed dictionary/data/cpe-index.json 41 var indexedCPEDictionaryData []byte 42 43 var indexedCPEDictionary *dictionary.Indexed 44 var indexedCPEDictionaryOnce sync.Once 45 46 func GetIndexedDictionary() (_ *dictionary.Indexed, err error) { 47 indexedCPEDictionaryOnce.Do(func() { 48 err = json.Unmarshal(indexedCPEDictionaryData, &indexedCPEDictionary) 49 }) 50 51 if err != nil { 52 return 53 } 54 55 if indexedCPEDictionary == nil { 56 err = fmt.Errorf("failed to unmarshal indexed CPE dictionary") 57 return 58 } 59 60 return indexedCPEDictionary, err 61 } 62 63 func DictionaryFind(p pkg.Package) (cpe.CPE, bool) { 64 dict, err := GetIndexedDictionary() 65 if err != nil { 66 log.Debugf("dictionary CPE lookup not available: %+v", err) 67 return cpe.CPE{}, false 68 } 69 70 var ( 71 cpeString string 72 ok bool 73 ) 74 75 switch p.Type { 76 case pkg.NpmPkg: 77 cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemNPM][p.Name] 78 79 case pkg.GemPkg: 80 cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemRubyGems][p.Name] 81 82 case pkg.PythonPkg: 83 cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemPyPI][p.Name] 84 85 case pkg.JenkinsPluginPkg: 86 cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemJenkinsPlugins][p.Name] 87 88 case pkg.RustPkg: 89 cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemRustCrates][p.Name] 90 91 default: 92 // The dictionary doesn't support this package type yet. 93 return cpe.CPE{}, false 94 } 95 96 if !ok { 97 // The dictionary doesn't have a CPE for this package. 98 return cpe.CPE{}, false 99 } 100 101 parsedCPE, err := cpe.New(cpeString) 102 if err != nil { 103 return cpe.CPE{}, false 104 } 105 106 parsedCPE.Version = p.Version 107 108 return parsedCPE, true 109 } 110 111 // Generate Create a list of CPEs for a given package, trying to guess the vendor, product tuple. We should be trying to 112 // generate the minimal set of representative CPEs, which implies that optional fields should not be included 113 // (such as target SW). 114 func Generate(p pkg.Package) []cpe.CPE { 115 vendors := candidateVendors(p) 116 products := candidateProducts(p) 117 if len(products) == 0 { 118 return nil 119 } 120 121 keys := internal.NewStringSet() 122 cpes := make([]cpe.CPE, 0) 123 for _, product := range products { 124 for _, vendor := range vendors { 125 // prevent duplicate entries... 126 key := fmt.Sprintf("%s|%s|%s", product, vendor, p.Version) 127 if keys.Contains(key) { 128 continue 129 } 130 keys.Add(key) 131 // add a new entry... 132 if c := newCPE(product, vendor, p.Version, wfn.Any); c != nil { 133 cpes = append(cpes, *c) 134 } 135 } 136 } 137 138 // filter out any known combinations that don't accurately represent this package 139 cpes = filter(cpes, p, cpeFilters...) 140 141 sort.Sort(cpe.BySpecificity(cpes)) 142 143 return cpes 144 } 145 146 func candidateVendors(p pkg.Package) []string { 147 // in ecosystems where the packaging metadata does not have a clear field to indicate a vendor (or a field that 148 // could be interpreted indirectly as such) the project name tends to be a common stand in. Examples of this 149 // are the elasticsearch gem, xstream jar, and rack gem... all of these cases you can find vulnerabilities 150 // with CPEs where the vendor is the product name and doesn't appear to be derived from any available package 151 // metadata. 152 vendors := newFieldCandidateSet(candidateProducts(p)...) 153 154 switch p.Language { 155 case pkg.JavaScript: 156 // for JavaScript if we find node.js as a package then the vendor is "nodejs" 157 if p.Name == "node.js" { 158 vendors.addValue("nodejs") 159 } 160 case pkg.Ruby: 161 vendors.addValue("ruby-lang") 162 case pkg.Go: 163 // replace all candidates with only the golang-specific helper 164 vendors.clear() 165 166 vendor := candidateVendorForGo(p.Name) 167 if vendor != "" { 168 vendors.addValue(vendor) 169 } 170 } 171 172 switch p.MetadataType { 173 case pkg.RpmMetadataType: 174 vendors.union(candidateVendorsForRPM(p)) 175 case pkg.GemMetadataType: 176 vendors.union(candidateVendorsForRuby(p)) 177 case pkg.PythonPackageMetadataType: 178 vendors.union(candidateVendorsForPython(p)) 179 case pkg.JavaMetadataType: 180 vendors.union(candidateVendorsForJava(p)) 181 case pkg.ApkMetadataType: 182 vendors.union(candidateVendorsForAPK(p)) 183 case pkg.NpmPackageJSONMetadataType: 184 vendors.union(candidateVendorsForJavaScript(p)) 185 } 186 187 // We should no longer be generating vendor candidates with these values ["" and "*"] 188 // (since CPEs will match any other value) 189 vendors.removeByValue("") 190 vendors.removeByValue("*") 191 192 // try swapping hyphens for underscores, vice versa, and removing separators altogether 193 addDelimiterVariations(vendors) 194 195 // generate sub-selections of each candidate based on separators (e.g. jenkins-ci -> [jenkins, jenkins-ci]) 196 addAllSubSelections(vendors) 197 198 // add more candidates based on the package info for each vendor candidate 199 for _, vendor := range vendors.uniqueValues() { 200 vendors.addValue(findAdditionalVendors(defaultCandidateAdditions, p.Type, p.Name, vendor)...) 201 } 202 203 // remove known mis 204 vendors.removeByValue(findVendorsToRemove(defaultCandidateRemovals, p.Type, p.Name)...) 205 206 uniqueVendors := vendors.uniqueValues() 207 208 // if any known vendor was detected, pick that one. 209 for _, vendor := range uniqueVendors { 210 if knownVendors.Has(vendor) { 211 return []string{vendor} 212 } 213 } 214 215 return uniqueVendors 216 } 217 218 func candidateProducts(p pkg.Package) []string { 219 products := newFieldCandidateSet(p.Name) 220 221 switch { 222 case p.Language == pkg.Python: 223 if !strings.HasPrefix(p.Name, "python") { 224 products.addValue("python-" + p.Name) 225 } 226 case p.Language == pkg.Java || p.MetadataType == pkg.JavaMetadataType: 227 products.addValue(candidateProductsForJava(p)...) 228 case p.Language == pkg.Go: 229 // replace all candidates with only the golang-specific helper 230 products.clear() 231 232 prod := candidateProductForGo(p.Name) 233 if prod != "" { 234 products.addValue(prod) 235 } 236 } 237 238 if p.MetadataType == pkg.ApkMetadataType { 239 products.union(candidateProductsForAPK(p)) 240 } 241 242 // it is never OK to have candidates with these values ["" and "*"] (since CPEs will match any other value) 243 products.removeByValue("") 244 products.removeByValue("*") 245 246 // try swapping hyphens for underscores, vice versa, and removing separators altogether 247 addDelimiterVariations(products) 248 249 // add known candidate additions 250 products.addValue(findAdditionalProducts(defaultCandidateAdditions, p.Type, p.Name)...) 251 252 // remove known candidate removals 253 products.removeByValue(findProductsToRemove(defaultCandidateRemovals, p.Type, p.Name)...) 254 255 return products.uniqueValues() 256 } 257 258 func addAllSubSelections(fields fieldCandidateSet) { 259 candidatesForVariations := fields.copy() 260 candidatesForVariations.removeWhere(subSelectionsDisallowed) 261 262 for _, candidate := range candidatesForVariations.values() { 263 fields.addValue(generateSubSelections(candidate)...) 264 } 265 } 266 267 // generateSubSelections attempts to split a field by hyphens and underscores and return a list of sensible sub-selections 268 // that can be used as product or vendor candidates. E.g. jenkins-ci-tools -> [jenkins-ci-tools, jenkins-ci, jenkins]. 269 func generateSubSelections(field string) (results []string) { 270 scanner := bufio.NewScanner(strings.NewReader(field)) 271 scanner.Split(scanByHyphenOrUnderscore) 272 var lastToken uint8 273 for scanner.Scan() { 274 rawCandidate := scanner.Text() 275 if len(rawCandidate) == 0 { 276 break 277 } 278 279 // trim any number of hyphen or underscore that is prefixed/suffixed on the given candidate. Since 280 // scanByHyphenOrUnderscore preserves delimiters (hyphens and underscores) they are guaranteed to be at least 281 // prefixed. 282 candidate := strings.TrimFunc(rawCandidate, trimHyphenOrUnderscore) 283 284 // capture the result (if there is content) 285 if len(candidate) > 0 { 286 if len(results) > 0 { 287 results = append(results, results[len(results)-1]+string(lastToken)+candidate) 288 } else { 289 results = append(results, candidate) 290 } 291 } 292 293 // keep track of the trailing separator for the next loop 294 lastToken = rawCandidate[len(rawCandidate)-1] 295 } 296 return results 297 } 298 299 // trimHyphenOrUnderscore is a character filter function for use with strings.TrimFunc in order to remove any hyphen or underscores. 300 func trimHyphenOrUnderscore(r rune) bool { 301 switch r { 302 case '-', '_': 303 return true 304 } 305 return false 306 } 307 308 // scanByHyphenOrUnderscore splits on hyphen or underscore and includes the separator in the split 309 func scanByHyphenOrUnderscore(data []byte, atEOF bool) (advance int, token []byte, err error) { 310 if atEOF && len(data) == 0 { 311 return 0, nil, nil 312 } 313 if i := bytes.IndexAny(data, "-_"); i >= 0 { 314 return i + 1, data[0 : i+1], nil 315 } 316 317 if atEOF { 318 return len(data), data, nil 319 } 320 321 return 0, nil, nil 322 } 323 324 func addDelimiterVariations(fields fieldCandidateSet) { 325 candidatesForVariations := fields.copy() 326 candidatesForVariations.removeWhere(delimiterVariationsDisallowed) 327 328 for _, candidate := range candidatesForVariations.list() { 329 field := candidate.value 330 hasHyphen := strings.Contains(field, "-") 331 hasUnderscore := strings.Contains(field, "_") 332 333 if hasHyphen { 334 // provide variations of hyphen candidates with an underscore 335 newValue := strings.ReplaceAll(field, "-", "_") 336 underscoreCandidate := candidate 337 underscoreCandidate.value = newValue 338 fields.add(underscoreCandidate) 339 } 340 341 if hasUnderscore { 342 // provide variations of underscore candidates with a hyphen 343 newValue := strings.ReplaceAll(field, "_", "-") 344 hyphenCandidate := candidate 345 hyphenCandidate.value = newValue 346 fields.add(hyphenCandidate) 347 } 348 } 349 }