github.com/lineaje-labs/syft@v0.98.1-0.20231227153149-9e393f60ff1b/syft/pkg/cataloger/python/parse_wheel_egg.go (about) 1 package python 2 3 import ( 4 "bufio" 5 "encoding/json" 6 "fmt" 7 "io" 8 "path/filepath" 9 10 "github.com/anchore/syft/syft/artifact" 11 "github.com/anchore/syft/syft/file" 12 "github.com/anchore/syft/syft/pkg" 13 "github.com/anchore/syft/syft/pkg/cataloger/generic" 14 "github.com/lineaje-labs/syft/internal" 15 "github.com/lineaje-labs/syft/internal/log" 16 ) 17 18 // parseWheelOrEgg takes the primary metadata file reference and returns the python package it represents. Contained 19 // fields are governed by the PyPA core metadata specification (https://packaging.python.org/en/latest/specifications/core-metadata/). 20 func parseWheelOrEgg( 21 resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser, 22 ) ([]pkg.Package, []artifact.Relationship, error) { 23 pd, sources, err := assembleEggOrWheelMetadata(resolver, reader.Location) 24 if err != nil { 25 return nil, nil, err 26 } 27 if pd == nil { 28 return nil, nil, nil 29 } 30 31 // This can happen for Python 2.7 where it is reported from an egg-info, but Python is 32 // the actual runtime, it isn't a "package". The special-casing here allows to skip it 33 if pd.Name == "Python" { 34 return nil, nil, nil 35 } 36 37 pkgs := []pkg.Package{newPackageForPackage(*pd, sources...)} 38 39 return pkgs, nil, nil 40 } 41 42 // fetchRecordFiles finds a corresponding installed-files.txt file for the given python package metadata file and returns the set of file records contained. 43 func fetchInstalledFiles( 44 resolver file.Resolver, metadataLocation file.Location, sitePackagesRootPath string, 45 ) (files []pkg.PythonFileRecord, sources []file.Location, err error) { 46 // we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory 47 // or for an image... for an image the METADATA file may be present within multiple layers, so it is important 48 // to reconcile the installed-files.txt path to the same layer (or the next adjacent lower layer). 49 50 // find the installed-files.txt file relative to the directory where the METADATA file resides (in path AND layer structure) 51 installedFilesPath := filepath.Join(filepath.Dir(metadataLocation.RealPath), "installed-files.txt") 52 installedFilesRef := resolver.RelativeFileByPath(metadataLocation, installedFilesPath) 53 54 if installedFilesRef != nil { 55 sources = append(sources, installedFilesRef.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) 56 57 installedFilesContents, err := resolver.FileContentsByLocation(*installedFilesRef) 58 if err != nil { 59 return nil, nil, err 60 } 61 defer internal.CloseAndLogError(installedFilesContents, installedFilesPath) 62 63 // parse the installed-files contents 64 installedFiles, err := parseInstalledFiles(installedFilesContents, metadataLocation.RealPath, sitePackagesRootPath) 65 if err != nil { 66 log.Warnf("unable to parse installed-files.txt for python package=%+v: %w", metadataLocation.RealPath, err) 67 return files, sources, nil 68 } 69 70 files = append(files, installedFiles...) 71 } 72 return files, sources, nil 73 } 74 75 // fetchRecordFiles finds a corresponding RECORD file for the given python package metadata file and returns the set of file records contained. 76 func fetchRecordFiles( 77 resolver file.Resolver, metadataLocation file.Location, 78 ) (files []pkg.PythonFileRecord, sources []file.Location, err error) { 79 // we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory 80 // or for an image... for an image the METADATA file may be present within multiple layers, so it is important 81 // to reconcile the RECORD path to the same layer (or the next adjacent lower layer). 82 83 // find the RECORD file relative to the directory where the METADATA file resides (in path AND layer structure) 84 recordPath := filepath.Join(filepath.Dir(metadataLocation.RealPath), "RECORD") 85 recordRef := resolver.RelativeFileByPath(metadataLocation, recordPath) 86 87 if recordRef != nil { 88 sources = append(sources, recordRef.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) 89 90 recordContents, err := resolver.FileContentsByLocation(*recordRef) 91 if err != nil { 92 return nil, nil, err 93 } 94 defer internal.CloseAndLogError(recordContents, recordPath) 95 96 // parse the record contents 97 records := parseWheelOrEggRecord(recordContents) 98 99 files = append(files, records...) 100 } 101 return files, sources, nil 102 } 103 104 // fetchTopLevelPackages finds a corresponding top_level.txt file for the given python package metadata file and returns the set of package names contained. 105 func fetchTopLevelPackages( 106 resolver file.Resolver, metadataLocation file.Location, 107 ) (pkgs []string, sources []file.Location, err error) { 108 // a top_level.txt file specifies the python top-level packages (provided by this python package) installed into site-packages 109 parentDir := filepath.Dir(metadataLocation.RealPath) 110 topLevelPath := filepath.Join(parentDir, "top_level.txt") 111 topLevelLocation := resolver.RelativeFileByPath(metadataLocation, topLevelPath) 112 113 if topLevelLocation == nil { 114 return nil, nil, nil 115 } 116 117 sources = append(sources, topLevelLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) 118 119 topLevelContents, err := resolver.FileContentsByLocation(*topLevelLocation) 120 if err != nil { 121 return nil, nil, err 122 } 123 defer internal.CloseAndLogError(topLevelContents, topLevelLocation.AccessPath) 124 125 scanner := bufio.NewScanner(topLevelContents) 126 for scanner.Scan() { 127 pkgs = append(pkgs, scanner.Text()) 128 } 129 130 if err := scanner.Err(); err != nil { 131 return nil, nil, fmt.Errorf("could not read python package top_level.txt: %w", err) 132 } 133 134 return pkgs, sources, nil 135 } 136 137 type directURLOrigin struct { 138 URL string `json:"url"` 139 VCSInfo vcsInfo `json:"vcs_info"` 140 ArchiveInfo archiveInfo `json:"archive_info"` 141 DirInfo dirInfo `json:"dir_info"` 142 } 143 144 type dirInfo struct { 145 Editable bool `json:"editable"` 146 } 147 148 type archiveInfo struct { 149 Hash string `json:"hash"` 150 } 151 152 type vcsInfo struct { 153 CommitID string `json:"commit_id"` 154 VCS string `json:"vcs"` 155 RequestedRevision string `json:"requested_revision"` 156 } 157 158 func fetchDirectURLData( 159 resolver file.Resolver, metadataLocation file.Location, 160 ) (d *pkg.PythonDirectURLOriginInfo, sources []file.Location, err error) { 161 parentDir := filepath.Dir(metadataLocation.RealPath) 162 directURLPath := filepath.Join(parentDir, "direct_url.json") 163 directURLLocation := resolver.RelativeFileByPath(metadataLocation, directURLPath) 164 165 if directURLLocation == nil { 166 return nil, nil, nil 167 } 168 169 sources = append(sources, directURLLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) 170 171 directURLContents, err := resolver.FileContentsByLocation(*directURLLocation) 172 if err != nil { 173 return nil, nil, err 174 } 175 defer internal.CloseAndLogError(directURLContents, directURLLocation.AccessPath) 176 177 buffer, err := io.ReadAll(directURLContents) 178 if err != nil { 179 return nil, nil, err 180 } 181 182 var directURLJson directURLOrigin 183 if err := json.Unmarshal(buffer, &directURLJson); err != nil { 184 return nil, nil, err 185 } 186 187 return &pkg.PythonDirectURLOriginInfo{ 188 URL: directURLJson.URL, 189 CommitID: directURLJson.VCSInfo.CommitID, 190 VCS: directURLJson.VCSInfo.VCS, 191 }, sources, nil 192 } 193 194 // assembleEggOrWheelMetadata discovers and accumulates python package metadata from multiple file sources and returns a single metadata object as well as a list of files where the metadata was derived from. 195 func assembleEggOrWheelMetadata( 196 resolver file.Resolver, metadataLocation file.Location, 197 ) (*parsedData, []file.Location, error) { 198 var sources = []file.Location{ 199 metadataLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), 200 } 201 202 metadataContents, err := resolver.FileContentsByLocation(metadataLocation) 203 if err != nil { 204 return nil, nil, err 205 } 206 defer internal.CloseAndLogError(metadataContents, metadataLocation.AccessPath) 207 208 pd, err := parseWheelOrEggMetadata(metadataLocation.RealPath, metadataContents) 209 if err != nil { 210 return nil, nil, err 211 } 212 213 if pd.Name == "" { 214 return nil, nil, nil 215 } 216 217 // attach any python files found for the given wheel/egg installation 218 r, s, err := fetchRecordFiles(resolver, metadataLocation) 219 if err != nil { 220 return nil, nil, err 221 } 222 if len(r) == 0 { 223 r, s, err = fetchInstalledFiles(resolver, metadataLocation, pd.SitePackagesRootPath) 224 if err != nil { 225 return nil, nil, err 226 } 227 } 228 229 sources = append(sources, s...) 230 pd.Files = r 231 232 // attach any top-level package names found for the given wheel/egg installation 233 p, s, err := fetchTopLevelPackages(resolver, metadataLocation) 234 if err != nil { 235 return nil, nil, err 236 } 237 sources = append(sources, s...) 238 pd.TopLevelPackages = p 239 240 // attach any direct-url package data found for the given wheel/egg installation 241 d, s, err := fetchDirectURLData(resolver, metadataLocation) 242 if err != nil { 243 return nil, nil, err 244 } 245 246 sources = append(sources, s...) 247 pd.DirectURLOrigin = d 248 return &pd, sources, nil 249 }