github.com/anchore/syft@v1.38.2/syft/pkg/cataloger/python/parse_wheel_egg.go (about) 1 package python 2 3 import ( 4 "bufio" 5 "context" 6 "encoding/json" 7 "io" 8 "path" 9 "path/filepath" 10 "strings" 11 12 "github.com/scylladb/go-set/strset" 13 14 "github.com/anchore/syft/internal" 15 "github.com/anchore/syft/internal/log" 16 "github.com/anchore/syft/internal/unknown" 17 "github.com/anchore/syft/syft/artifact" 18 "github.com/anchore/syft/syft/file" 19 "github.com/anchore/syft/syft/pkg" 20 "github.com/anchore/syft/syft/pkg/cataloger/generic" 21 "github.com/anchore/syft/syft/pkg/cataloger/internal/licenses" 22 ) 23 24 // parseWheelOrEgg takes the primary metadata file reference and returns the python package it represents. Contained 25 // fields are governed by the PyPA core metadata specification (https://packaging.python.org/en/latest/specifications/core-metadata/). 26 func parseWheelOrEgg(ctx context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { 27 pd, sources, err := assembleEggOrWheelMetadata(resolver, reader.Location) 28 29 if pd == nil { 30 return nil, nil, err 31 } 32 33 // This can happen for Python 2.7 where it is reported from an egg-info, but Python is 34 // the actual runtime, it isn't a "package". The special-casing here allows to skip it 35 if pd.Name == "Python" { 36 return nil, nil, err 37 } 38 39 pkgs := []pkg.Package{ 40 newPackageForPackage( 41 *pd, 42 findLicenses(ctx, resolver, *pd), 43 sources..., 44 ), 45 } 46 47 return pkgs, nil, err 48 } 49 50 // fetchInstalledFiles finds a corresponding installed-files.txt file for the given python package metadata file and returns the set of file records contained. 51 func fetchInstalledFiles(resolver file.Resolver, metadataLocation file.Location, sitePackagesRootPath string) (files []pkg.PythonFileRecord, sources []file.Location, retErr error) { 52 // we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory 53 // or for an image... for an image the METADATA file may be present within multiple layers, so it is important 54 // to reconcile the installed-files.txt path to the same layer (or the next adjacent lower layer). 55 56 // find the installed-files.txt file relative to the directory where the METADATA file resides (in path AND layer structure) 57 installedFilesPath := filepath.Join(filepath.Dir(metadataLocation.RealPath), "installed-files.txt") 58 installedFilesRef := resolver.RelativeFileByPath(metadataLocation, installedFilesPath) 59 60 if installedFilesRef != nil { 61 sources = append(sources, installedFilesRef.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) 62 63 installedFilesContents, err := resolver.FileContentsByLocation(*installedFilesRef) 64 if err != nil { 65 return nil, nil, err 66 } 67 defer internal.CloseAndLogError(installedFilesContents, installedFilesPath) 68 69 // parse the installed-files contents 70 installedFiles, err := parseInstalledFiles(installedFilesContents, metadataLocation.RealPath, sitePackagesRootPath) 71 if err != nil { 72 retErr = unknown.Newf(*installedFilesRef, "unable to parse installed-files.txt for python package: %w", retErr) 73 } 74 75 files = append(files, installedFiles...) 76 } 77 return files, sources, nil 78 } 79 80 // fetchRecordFiles finds a corresponding RECORD file for the given python package metadata file and returns the set of file records contained. 81 func fetchRecordFiles(resolver file.Resolver, metadataLocation file.Location) (files []pkg.PythonFileRecord, sources []file.Location, retErr error) { 82 // we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory 83 // or for an image... for an image the METADATA file may be present within multiple layers, so it is important 84 // to reconcile the RECORD path to the same layer (or the next adjacent lower layer). 85 86 // find the RECORD file relative to the directory where the METADATA file resides (in path AND layer structure) 87 recordPath := filepath.Join(filepath.Dir(metadataLocation.RealPath), "RECORD") 88 recordRef := resolver.RelativeFileByPath(metadataLocation, recordPath) 89 90 if recordRef != nil { 91 sources = append(sources, recordRef.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) 92 93 recordContents, err := resolver.FileContentsByLocation(*recordRef) 94 if err != nil { 95 return nil, nil, err 96 } 97 defer internal.CloseAndLogError(recordContents, recordPath) 98 99 // parse the record contents 100 var records []pkg.PythonFileRecord 101 records, retErr = parseWheelOrEggRecord(file.NewLocationReadCloser(*recordRef, recordContents)) 102 103 files = append(files, records...) 104 } 105 return files, sources, retErr 106 } 107 108 // fetchTopLevelPackages finds a corresponding top_level.txt file for the given python package metadata file and returns the set of package names contained. 109 func fetchTopLevelPackages(resolver file.Resolver, metadataLocation file.Location) (pkgs []string, sources []file.Location, err error) { 110 // a top_level.txt file specifies the python top-level packages (provided by this python package) installed into site-packages 111 parentDir := filepath.Dir(metadataLocation.RealPath) 112 topLevelPath := filepath.Join(parentDir, "top_level.txt") 113 topLevelLocation := resolver.RelativeFileByPath(metadataLocation, topLevelPath) 114 115 if topLevelLocation == nil { 116 return nil, nil, nil 117 } 118 119 sources = append(sources, topLevelLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) 120 121 topLevelContents, err := resolver.FileContentsByLocation(*topLevelLocation) 122 if err != nil { 123 return nil, nil, err 124 } 125 defer internal.CloseAndLogError(topLevelContents, topLevelLocation.AccessPath) 126 127 scanner := bufio.NewScanner(topLevelContents) 128 for scanner.Scan() { 129 pkgs = append(pkgs, scanner.Text()) 130 } 131 132 if err := scanner.Err(); err != nil { 133 return nil, nil, err 134 } 135 136 return pkgs, sources, nil 137 } 138 139 type directURLOrigin struct { 140 URL string `json:"url"` 141 VCSInfo vcsInfo `json:"vcs_info"` 142 ArchiveInfo archiveInfo `json:"archive_info"` 143 DirInfo dirInfo `json:"dir_info"` 144 } 145 146 type dirInfo struct { 147 Editable bool `json:"editable"` 148 } 149 150 type archiveInfo struct { 151 Hash string `json:"hash"` 152 } 153 154 type vcsInfo struct { 155 CommitID string `json:"commit_id"` 156 VCS string `json:"vcs"` 157 RequestedRevision string `json:"requested_revision"` 158 } 159 160 func fetchDirectURLData(resolver file.Resolver, metadataLocation file.Location) (d *pkg.PythonDirectURLOriginInfo, sources []file.Location, err error) { 161 parentDir := filepath.Dir(metadataLocation.RealPath) 162 directURLPath := filepath.Join(parentDir, "direct_url.json") 163 directURLLocation := resolver.RelativeFileByPath(metadataLocation, directURLPath) 164 165 if directURLLocation == nil { 166 return nil, nil, nil 167 } 168 169 sources = append(sources, directURLLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) 170 171 directURLContents, err := resolver.FileContentsByLocation(*directURLLocation) 172 if err != nil { 173 return nil, nil, err 174 } 175 defer internal.CloseAndLogError(directURLContents, directURLLocation.AccessPath) 176 177 buffer, err := io.ReadAll(directURLContents) 178 if err != nil { 179 return nil, nil, err 180 } 181 182 var directURLJson directURLOrigin 183 if err := json.Unmarshal(buffer, &directURLJson); err != nil { 184 return nil, nil, err 185 } 186 187 return &pkg.PythonDirectURLOriginInfo{ 188 URL: directURLJson.URL, 189 CommitID: directURLJson.VCSInfo.CommitID, 190 VCS: directURLJson.VCSInfo.VCS, 191 }, sources, nil 192 } 193 194 // assembleEggOrWheelMetadata discovers and accumulates python package metadata from multiple file sources and returns a single metadata object as well as a list of files where the metadata was derived from. 195 func assembleEggOrWheelMetadata(resolver file.Resolver, metadataLocation file.Location) (*parsedData, []file.Location, error) { 196 var sources = []file.Location{ 197 metadataLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), 198 } 199 200 metadataContents, err := resolver.FileContentsByLocation(metadataLocation) 201 if err != nil { 202 return nil, nil, err 203 } 204 defer internal.CloseAndLogError(metadataContents, metadataLocation.AccessPath) 205 206 pd, err := parseWheelOrEggMetadata(file.NewLocationReadCloser(metadataLocation, metadataContents)) 207 if err != nil { 208 return nil, nil, err 209 } 210 211 if pd.Name == "" { 212 return nil, nil, nil 213 } 214 215 // attach any python files found for the given wheel/egg installation 216 var errs error 217 r, s, err := fetchRecordFiles(resolver, metadataLocation) 218 if err != nil { 219 errs = unknown.Joinf(errs, "could not read python package RECORD file: %w", err) 220 } 221 if len(r) == 0 { 222 r, s, err = fetchInstalledFiles(resolver, metadataLocation, pd.SitePackagesRootPath) 223 if err != nil { 224 errs = unknown.Joinf(errs, "could not read python package installed-files.txt: %w", err) 225 } 226 } 227 228 sources = append(sources, s...) 229 pd.Files = r 230 231 // attach any top-level package names found for the given wheel/egg installation 232 p, s, err := fetchTopLevelPackages(resolver, metadataLocation) 233 if err != nil { 234 errs = unknown.Joinf(errs, "could not read python package top_level.txt: %w", err) 235 } 236 sources = append(sources, s...) 237 pd.TopLevelPackages = p 238 239 // attach any direct-url package data found for the given wheel/egg installation 240 d, s, err := fetchDirectURLData(resolver, metadataLocation) 241 if err != nil { 242 errs = unknown.Joinf(errs, "could not read python package direct_url.json: %w", err) 243 } 244 245 sources = append(sources, s...) 246 pd.DirectURLOrigin = d 247 return &pd, sources, errs 248 } 249 250 func findLicenses(ctx context.Context, resolver file.Resolver, m parsedData) pkg.LicenseSet { 251 var out []pkg.License 252 253 licenseLocations := file.NewLocationSet() 254 if m.LicenseFilePath != "" { 255 locs, err := resolver.FilesByPath(m.LicenseFilePath) 256 if err != nil { 257 log.WithFields("error", err, "path", m.LicenseFilePath).Trace("unable to resolve python license file") 258 } else { 259 licenseLocations.Add(locs...) 260 } 261 } 262 263 switch { 264 case m.LicenseExpression != "" || m.Licenses != "": 265 out = licenses.NewFromValues(ctx, licenseLocations.ToSlice(), m.LicenseExpression, m.Licenses) 266 case !licenseLocations.Empty(): 267 out = licenses.FindAtLocations(ctx, resolver, licenseLocations.ToSlice()...) 268 269 default: 270 // search for known license paths from RECORDS file 271 parent := path.Base(path.Dir(m.DistInfoLocation.Path())) 272 candidatePaths := strset.New() 273 for _, f := range m.Files { 274 if !strings.HasPrefix(f.Path, parent) || strings.Count(f.Path, "/") > 1 { 275 continue 276 } 277 278 if licenses.IsLicenseFile(filepath.Base(f.Path)) { 279 candidatePaths.Add(path.Join(m.SitePackagesRootPath, f.Path)) 280 } 281 } 282 283 out = licenses.FindAtPaths(ctx, resolver, candidatePaths.List()...) 284 } 285 return pkg.NewLicenseSet(out...) 286 }