github.com/anchore/syft@v1.4.2-0.20240516191711-1bec1fc5d397/syft/pkg/cataloger/python/parse_wheel_egg.go (about) 1 package python 2 3 import ( 4 "bufio" 5 "context" 6 "encoding/json" 7 "fmt" 8 "io" 9 "path/filepath" 10 11 "github.com/anchore/syft/internal" 12 "github.com/anchore/syft/internal/log" 13 "github.com/anchore/syft/syft/artifact" 14 "github.com/anchore/syft/syft/file" 15 "github.com/anchore/syft/syft/pkg" 16 "github.com/anchore/syft/syft/pkg/cataloger/generic" 17 ) 18 19 // parseWheelOrEgg takes the primary metadata file reference and returns the python package it represents. Contained 20 // fields are governed by the PyPA core metadata specification (https://packaging.python.org/en/latest/specifications/core-metadata/). 21 func parseWheelOrEgg(_ context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { 22 pd, sources, err := assembleEggOrWheelMetadata(resolver, reader.Location) 23 if err != nil { 24 return nil, nil, err 25 } 26 if pd == nil { 27 return nil, nil, nil 28 } 29 30 // This can happen for Python 2.7 where it is reported from an egg-info, but Python is 31 // the actual runtime, it isn't a "package". The special-casing here allows to skip it 32 if pd.Name == "Python" { 33 return nil, nil, nil 34 } 35 36 pkgs := []pkg.Package{newPackageForPackage(resolver, *pd, sources...)} 37 38 return pkgs, nil, nil 39 } 40 41 // fetchInstalledFiles finds a corresponding installed-files.txt file for the given python package metadata file and returns the set of file records contained. 42 func fetchInstalledFiles(resolver file.Resolver, metadataLocation file.Location, sitePackagesRootPath string) (files []pkg.PythonFileRecord, sources []file.Location, err error) { 43 // we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory 44 // or for an image... for an image the METADATA file may be present within multiple layers, so it is important 45 // to reconcile the installed-files.txt path to the same layer (or the next adjacent lower layer). 46 47 // find the installed-files.txt file relative to the directory where the METADATA file resides (in path AND layer structure) 48 installedFilesPath := filepath.Join(filepath.Dir(metadataLocation.RealPath), "installed-files.txt") 49 installedFilesRef := resolver.RelativeFileByPath(metadataLocation, installedFilesPath) 50 51 if installedFilesRef != nil { 52 sources = append(sources, installedFilesRef.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) 53 54 installedFilesContents, err := resolver.FileContentsByLocation(*installedFilesRef) 55 if err != nil { 56 return nil, nil, err 57 } 58 defer internal.CloseAndLogError(installedFilesContents, installedFilesPath) 59 60 // parse the installed-files contents 61 installedFiles, err := parseInstalledFiles(installedFilesContents, metadataLocation.RealPath, sitePackagesRootPath) 62 if err != nil { 63 log.Warnf("unable to parse installed-files.txt for python package=%+v: %w", metadataLocation.RealPath, err) 64 return files, sources, nil 65 } 66 67 files = append(files, installedFiles...) 68 } 69 return files, sources, nil 70 } 71 72 // fetchRecordFiles finds a corresponding RECORD file for the given python package metadata file and returns the set of file records contained. 73 func fetchRecordFiles(resolver file.Resolver, metadataLocation file.Location) (files []pkg.PythonFileRecord, sources []file.Location, err error) { 74 // we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory 75 // or for an image... for an image the METADATA file may be present within multiple layers, so it is important 76 // to reconcile the RECORD path to the same layer (or the next adjacent lower layer). 77 78 // find the RECORD file relative to the directory where the METADATA file resides (in path AND layer structure) 79 recordPath := filepath.Join(filepath.Dir(metadataLocation.RealPath), "RECORD") 80 recordRef := resolver.RelativeFileByPath(metadataLocation, recordPath) 81 82 if recordRef != nil { 83 sources = append(sources, recordRef.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) 84 85 recordContents, err := resolver.FileContentsByLocation(*recordRef) 86 if err != nil { 87 return nil, nil, err 88 } 89 defer internal.CloseAndLogError(recordContents, recordPath) 90 91 // parse the record contents 92 records := parseWheelOrEggRecord(recordContents) 93 94 files = append(files, records...) 95 } 96 return files, sources, nil 97 } 98 99 // fetchTopLevelPackages finds a corresponding top_level.txt file for the given python package metadata file and returns the set of package names contained. 100 func fetchTopLevelPackages(resolver file.Resolver, metadataLocation file.Location) (pkgs []string, sources []file.Location, err error) { 101 // a top_level.txt file specifies the python top-level packages (provided by this python package) installed into site-packages 102 parentDir := filepath.Dir(metadataLocation.RealPath) 103 topLevelPath := filepath.Join(parentDir, "top_level.txt") 104 topLevelLocation := resolver.RelativeFileByPath(metadataLocation, topLevelPath) 105 106 if topLevelLocation == nil { 107 return nil, nil, nil 108 } 109 110 sources = append(sources, topLevelLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) 111 112 topLevelContents, err := resolver.FileContentsByLocation(*topLevelLocation) 113 if err != nil { 114 return nil, nil, err 115 } 116 defer internal.CloseAndLogError(topLevelContents, topLevelLocation.AccessPath) 117 118 scanner := bufio.NewScanner(topLevelContents) 119 for scanner.Scan() { 120 pkgs = append(pkgs, scanner.Text()) 121 } 122 123 if err := scanner.Err(); err != nil { 124 return nil, nil, fmt.Errorf("could not read python package top_level.txt: %w", err) 125 } 126 127 return pkgs, sources, nil 128 } 129 130 type directURLOrigin struct { 131 URL string `json:"url"` 132 VCSInfo vcsInfo `json:"vcs_info"` 133 ArchiveInfo archiveInfo `json:"archive_info"` 134 DirInfo dirInfo `json:"dir_info"` 135 } 136 137 type dirInfo struct { 138 Editable bool `json:"editable"` 139 } 140 141 type archiveInfo struct { 142 Hash string `json:"hash"` 143 } 144 145 type vcsInfo struct { 146 CommitID string `json:"commit_id"` 147 VCS string `json:"vcs"` 148 RequestedRevision string `json:"requested_revision"` 149 } 150 151 func fetchDirectURLData(resolver file.Resolver, metadataLocation file.Location) (d *pkg.PythonDirectURLOriginInfo, sources []file.Location, err error) { 152 parentDir := filepath.Dir(metadataLocation.RealPath) 153 directURLPath := filepath.Join(parentDir, "direct_url.json") 154 directURLLocation := resolver.RelativeFileByPath(metadataLocation, directURLPath) 155 156 if directURLLocation == nil { 157 return nil, nil, nil 158 } 159 160 sources = append(sources, directURLLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) 161 162 directURLContents, err := resolver.FileContentsByLocation(*directURLLocation) 163 if err != nil { 164 return nil, nil, err 165 } 166 defer internal.CloseAndLogError(directURLContents, directURLLocation.AccessPath) 167 168 buffer, err := io.ReadAll(directURLContents) 169 if err != nil { 170 return nil, nil, err 171 } 172 173 var directURLJson directURLOrigin 174 if err := json.Unmarshal(buffer, &directURLJson); err != nil { 175 return nil, nil, err 176 } 177 178 return &pkg.PythonDirectURLOriginInfo{ 179 URL: directURLJson.URL, 180 CommitID: directURLJson.VCSInfo.CommitID, 181 VCS: directURLJson.VCSInfo.VCS, 182 }, sources, nil 183 } 184 185 // assembleEggOrWheelMetadata discovers and accumulates python package metadata from multiple file sources and returns a single metadata object as well as a list of files where the metadata was derived from. 186 func assembleEggOrWheelMetadata(resolver file.Resolver, metadataLocation file.Location) (*parsedData, []file.Location, error) { 187 var sources = []file.Location{ 188 metadataLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), 189 } 190 191 metadataContents, err := resolver.FileContentsByLocation(metadataLocation) 192 if err != nil { 193 return nil, nil, err 194 } 195 defer internal.CloseAndLogError(metadataContents, metadataLocation.AccessPath) 196 197 pd, err := parseWheelOrEggMetadata(metadataLocation.RealPath, metadataContents) 198 if err != nil { 199 return nil, nil, err 200 } 201 202 if pd.Name == "" { 203 return nil, nil, nil 204 } 205 206 // attach any python files found for the given wheel/egg installation 207 r, s, err := fetchRecordFiles(resolver, metadataLocation) 208 if err != nil { 209 return nil, nil, err 210 } 211 if len(r) == 0 { 212 r, s, err = fetchInstalledFiles(resolver, metadataLocation, pd.SitePackagesRootPath) 213 if err != nil { 214 return nil, nil, err 215 } 216 } 217 218 sources = append(sources, s...) 219 pd.Files = r 220 221 // attach any top-level package names found for the given wheel/egg installation 222 p, s, err := fetchTopLevelPackages(resolver, metadataLocation) 223 if err != nil { 224 return nil, nil, err 225 } 226 sources = append(sources, s...) 227 pd.TopLevelPackages = p 228 229 // attach any direct-url package data found for the given wheel/egg installation 230 d, s, err := fetchDirectURLData(resolver, metadataLocation) 231 if err != nil { 232 return nil, nil, err 233 } 234 235 sources = append(sources, s...) 236 pd.DirectURLOrigin = d 237 return &pd, sources, nil 238 }