github.com/lineaje-labs/syft@v0.98.1-0.20231227153149-9e393f60ff1b/syft/pkg/cataloger/common/cpe/generate.go (about) 1 package cpe 2 3 import ( 4 "bufio" 5 "bytes" 6 _ "embed" 7 "encoding/json" 8 "fmt" 9 "sort" 10 "strings" 11 "sync" 12 13 "github.com/facebookincubator/nvdtools/wfn" 14 "github.com/scylladb/go-set/strset" 15 16 "github.com/anchore/syft/syft/cpe" 17 "github.com/anchore/syft/syft/pkg" 18 "github.com/anchore/syft/syft/pkg/cataloger/common/cpe/dictionary" 19 "github.com/lineaje-labs/syft/internal/log" 20 ) 21 22 // knownVendors contains vendor strings that are known to exist in 23 // the CPE database, so they will be preferred over other candidates: 24 var knownVendors = strset.New("apache") 25 26 func newCPE(product, vendor, version, targetSW string) *wfn.Attributes { 27 c := *(wfn.NewAttributesWithAny()) 28 c.Part = "a" 29 c.Product = product 30 c.Vendor = vendor 31 c.Version = version 32 c.TargetSW = targetSW 33 if cpe.ValidateString(cpe.String(c)) != nil { 34 return nil 35 } 36 return &c 37 } 38 39 //go:embed dictionary/data/cpe-index.json 40 var indexedCPEDictionaryData []byte 41 42 var indexedCPEDictionary *dictionary.Indexed 43 var indexedCPEDictionaryOnce sync.Once 44 45 func GetIndexedDictionary() (_ *dictionary.Indexed, err error) { 46 indexedCPEDictionaryOnce.Do(func() { 47 err = json.Unmarshal(indexedCPEDictionaryData, &indexedCPEDictionary) 48 }) 49 50 if err != nil { 51 return 52 } 53 54 if indexedCPEDictionary == nil { 55 err = fmt.Errorf("failed to unmarshal indexed CPE dictionary") 56 return 57 } 58 59 return indexedCPEDictionary, err 60 } 61 62 func DictionaryFind(p pkg.Package) (cpe.CPE, bool) { 63 dict, err := GetIndexedDictionary() 64 if err != nil { 65 log.Debugf("dictionary CPE lookup not available: %+v", err) 66 return cpe.CPE{}, false 67 } 68 69 var ( 70 cpeString string 71 ok bool 72 ) 73 74 switch p.Type { 75 case pkg.NpmPkg: 76 cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemNPM][p.Name] 77 78 case pkg.GemPkg: 79 cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemRubyGems][p.Name] 80 81 case pkg.PythonPkg: 82 cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemPyPI][p.Name] 83 84 case pkg.JenkinsPluginPkg: 85 cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemJenkinsPlugins][p.Name] 86 87 case pkg.RustPkg: 88 cpeString, ok = dict.EcosystemPackages[dictionary.EcosystemRustCrates][p.Name] 89 90 default: 91 // The dictionary doesn't support this package type yet. 92 return cpe.CPE{}, false 93 } 94 95 if !ok { 96 // The dictionary doesn't have a CPE for this package. 97 return cpe.CPE{}, false 98 } 99 100 parsedCPE, err := cpe.New(cpeString) 101 if err != nil { 102 return cpe.CPE{}, false 103 } 104 105 parsedCPE.Version = p.Version 106 107 return parsedCPE, true 108 } 109 110 // Generate Create a list of CPEs for a given package, trying to guess the vendor, product tuple. We should be trying to 111 // generate the minimal set of representative CPEs, which implies that optional fields should not be included 112 // (such as target SW). 113 func Generate(p pkg.Package) []cpe.CPE { 114 vendors := candidateVendors(p) 115 products := candidateProducts(p) 116 if len(products) == 0 { 117 return nil 118 } 119 120 keys := strset.New() 121 cpes := make([]cpe.CPE, 0) 122 for _, product := range products { 123 for _, vendor := range vendors { 124 // prevent duplicate entries... 125 key := fmt.Sprintf("%s|%s|%s", product, vendor, p.Version) 126 if keys.Has(key) { 127 continue 128 } 129 keys.Add(key) 130 // add a new entry... 131 if c := newCPE(product, vendor, p.Version, wfn.Any); c != nil { 132 cpes = append(cpes, *c) 133 } 134 } 135 } 136 137 // filter out any known combinations that don't accurately represent this package 138 cpes = filter(cpes, p, cpeFilters...) 139 140 sort.Sort(cpe.BySpecificity(cpes)) 141 142 return cpes 143 } 144 145 func candidateVendors(p pkg.Package) []string { 146 // in ecosystems where the packaging metadata does not have a clear field to indicate a vendor (or a field that 147 // could be interpreted indirectly as such) the project name tends to be a common stand in. Examples of this 148 // are the elasticsearch gem, xstream jar, and rack gem... all of these cases you can find vulnerabilities 149 // with CPEs where the vendor is the product name and doesn't appear to be derived from any available package 150 // metadata. 151 vendors := newFieldCandidateSet(candidateProducts(p)...) 152 153 switch p.Language { 154 case pkg.JavaScript: 155 // for JavaScript if we find node.js as a package then the vendor is "nodejs" 156 if p.Name == "node.js" { 157 vendors.addValue("nodejs") 158 } 159 case pkg.Ruby: 160 vendors.addValue("ruby-lang") 161 case pkg.Go: 162 // replace all candidates with only the golang-specific helper 163 vendors.clear() 164 165 vendor := candidateVendorForGo(p.Name) 166 if vendor != "" { 167 vendors.addValue(vendor) 168 } 169 } 170 171 switch p.Metadata.(type) { 172 case pkg.RpmDBEntry: 173 vendors.union(candidateVendorsForRPM(p)) 174 case pkg.RubyGemspec: 175 vendors.union(candidateVendorsForRuby(p)) 176 case pkg.PythonPackage: 177 vendors.union(candidateVendorsForPython(p)) 178 case pkg.JavaArchive: 179 vendors.union(candidateVendorsForJava(p)) 180 case pkg.ApkDBEntry: 181 vendors.union(candidateVendorsForAPK(p)) 182 case pkg.NpmPackage: 183 vendors.union(candidateVendorsForJavascript(p)) 184 } 185 186 // We should no longer be generating vendor candidates with these values ["" and "*"] 187 // (since CPEs will match any other value) 188 vendors.removeByValue("") 189 vendors.removeByValue("*") 190 191 // try swapping hyphens for underscores, vice versa, and removing separators altogether 192 addDelimiterVariations(vendors) 193 194 // generate sub-selections of each candidate based on separators (e.g. jenkins-ci -> [jenkins, jenkins-ci]) 195 addAllSubSelections(vendors) 196 197 // add more candidates based on the package info for each vendor candidate 198 for _, vendor := range vendors.uniqueValues() { 199 vendors.addValue(findAdditionalVendors(defaultCandidateAdditions, p.Type, p.Name, vendor)...) 200 } 201 202 // remove known mis 203 vendors.removeByValue(findVendorsToRemove(defaultCandidateRemovals, p.Type, p.Name)...) 204 205 uniqueVendors := vendors.uniqueValues() 206 207 // if any known vendor was detected, pick that one. 208 for _, vendor := range uniqueVendors { 209 if knownVendors.Has(vendor) { 210 return []string{vendor} 211 } 212 } 213 214 return uniqueVendors 215 } 216 217 func candidateProducts(p pkg.Package) []string { 218 products := newFieldCandidateSet(p.Name) 219 220 _, hasJavaMetadata := p.Metadata.(pkg.JavaArchive) 221 222 switch { 223 case p.Language == pkg.Python: 224 if !strings.HasPrefix(p.Name, "python") { 225 products.addValue("python-" + p.Name) 226 } 227 case p.Language == pkg.Java || hasJavaMetadata: 228 products.addValue(candidateProductsForJava(p)...) 229 case p.Language == pkg.Go: 230 // replace all candidates with only the golang-specific helper 231 products.clear() 232 233 prod := candidateProductForGo(p.Name) 234 if prod != "" { 235 products.addValue(prod) 236 } 237 } 238 239 if _, hasAPKMetadata := p.Metadata.(pkg.ApkDBEntry); hasAPKMetadata { 240 products.union(candidateProductsForAPK(p)) 241 } 242 243 // it is never OK to have candidates with these values ["" and "*"] (since CPEs will match any other value) 244 products.removeByValue("") 245 products.removeByValue("*") 246 247 // try swapping hyphens for underscores, vice versa, and removing separators altogether 248 addDelimiterVariations(products) 249 250 // add known candidate additions 251 products.addValue(findAdditionalProducts(defaultCandidateAdditions, p.Type, p.Name)...) 252 253 // remove known candidate removals 254 products.removeByValue(findProductsToRemove(defaultCandidateRemovals, p.Type, p.Name)...) 255 256 return products.uniqueValues() 257 } 258 259 func addAllSubSelections(fields fieldCandidateSet) { 260 candidatesForVariations := fields.copy() 261 candidatesForVariations.removeWhere(subSelectionsDisallowed) 262 263 for _, candidate := range candidatesForVariations.values() { 264 fields.addValue(generateSubSelections(candidate)...) 265 } 266 } 267 268 // generateSubSelections attempts to split a field by hyphens and underscores and return a list of sensible sub-selections 269 // that can be used as product or vendor candidates. E.g. jenkins-ci-tools -> [jenkins-ci-tools, jenkins-ci, jenkins]. 270 func generateSubSelections(field string) (results []string) { 271 scanner := bufio.NewScanner(strings.NewReader(field)) 272 scanner.Split(scanByHyphenOrUnderscore) 273 var lastToken uint8 274 for scanner.Scan() { 275 rawCandidate := scanner.Text() 276 if len(rawCandidate) == 0 { 277 break 278 } 279 280 // trim any number of hyphen or underscore that is prefixed/suffixed on the given candidate. Since 281 // scanByHyphenOrUnderscore preserves delimiters (hyphens and underscores) they are guaranteed to be at least 282 // prefixed. 283 candidate := strings.TrimFunc(rawCandidate, trimHyphenOrUnderscore) 284 285 // capture the result (if there is content) 286 if len(candidate) > 0 { 287 if len(results) > 0 { 288 results = append(results, results[len(results)-1]+string(lastToken)+candidate) 289 } else { 290 results = append(results, candidate) 291 } 292 } 293 294 // keep track of the trailing separator for the next loop 295 lastToken = rawCandidate[len(rawCandidate)-1] 296 } 297 return results 298 } 299 300 // trimHyphenOrUnderscore is a character filter function for use with strings.TrimFunc in order to remove any hyphen or underscores. 301 func trimHyphenOrUnderscore(r rune) bool { 302 switch r { 303 case '-', '_': 304 return true 305 } 306 return false 307 } 308 309 // scanByHyphenOrUnderscore splits on hyphen or underscore and includes the separator in the split 310 func scanByHyphenOrUnderscore(data []byte, atEOF bool) (advance int, token []byte, err error) { 311 if atEOF && len(data) == 0 { 312 return 0, nil, nil 313 } 314 if i := bytes.IndexAny(data, "-_"); i >= 0 { 315 return i + 1, data[0 : i+1], nil 316 } 317 318 if atEOF { 319 return len(data), data, nil 320 } 321 322 return 0, nil, nil 323 } 324 325 func addDelimiterVariations(fields fieldCandidateSet) { 326 candidatesForVariations := fields.copy() 327 candidatesForVariations.removeWhere(delimiterVariationsDisallowed) 328 329 for _, candidate := range candidatesForVariations.list() { 330 field := candidate.value 331 hasHyphen := strings.Contains(field, "-") 332 hasUnderscore := strings.Contains(field, "_") 333 334 if hasHyphen { 335 // provide variations of hyphen candidates with an underscore 336 newValue := strings.ReplaceAll(field, "-", "_") 337 underscoreCandidate := candidate 338 underscoreCandidate.value = newValue 339 fields.add(underscoreCandidate) 340 } 341 342 if hasUnderscore { 343 // provide variations of underscore candidates with a hyphen 344 newValue := strings.ReplaceAll(field, "_", "-") 345 hyphenCandidate := candidate 346 hyphenCandidate.value = newValue 347 fields.add(hyphenCandidate) 348 } 349 } 350 }