github.com/anchore/syft@v1.4.2-0.20240516191711-1bec1fc5d397/syft/pkg/cataloger/python/parse_wheel_egg.go (about)

     1  package python
     2  
     3  import (
     4  	"bufio"
     5  	"context"
     6  	"encoding/json"
     7  	"fmt"
     8  	"io"
     9  	"path/filepath"
    10  
    11  	"github.com/anchore/syft/internal"
    12  	"github.com/anchore/syft/internal/log"
    13  	"github.com/anchore/syft/syft/artifact"
    14  	"github.com/anchore/syft/syft/file"
    15  	"github.com/anchore/syft/syft/pkg"
    16  	"github.com/anchore/syft/syft/pkg/cataloger/generic"
    17  )
    18  
    19  // parseWheelOrEgg takes the primary metadata file reference and returns the python package it represents. Contained
    20  // fields are governed by the PyPA core metadata specification (https://packaging.python.org/en/latest/specifications/core-metadata/).
    21  func parseWheelOrEgg(_ context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
    22  	pd, sources, err := assembleEggOrWheelMetadata(resolver, reader.Location)
    23  	if err != nil {
    24  		return nil, nil, err
    25  	}
    26  	if pd == nil {
    27  		return nil, nil, nil
    28  	}
    29  
    30  	// This can happen for Python 2.7 where it is reported from an egg-info, but Python is
    31  	// the actual runtime, it isn't a "package". The special-casing here allows to skip it
    32  	if pd.Name == "Python" {
    33  		return nil, nil, nil
    34  	}
    35  
    36  	pkgs := []pkg.Package{newPackageForPackage(resolver, *pd, sources...)}
    37  
    38  	return pkgs, nil, nil
    39  }
    40  
    41  // fetchInstalledFiles finds a corresponding installed-files.txt file for the given python package metadata file and returns the set of file records contained.
    42  func fetchInstalledFiles(resolver file.Resolver, metadataLocation file.Location, sitePackagesRootPath string) (files []pkg.PythonFileRecord, sources []file.Location, err error) {
    43  	// we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory
    44  	// or for an image... for an image the METADATA file may be present within multiple layers, so it is important
    45  	// to reconcile the installed-files.txt path to the same layer (or the next adjacent lower layer).
    46  
    47  	// find the installed-files.txt file relative to the directory where the METADATA file resides (in path AND layer structure)
    48  	installedFilesPath := filepath.Join(filepath.Dir(metadataLocation.RealPath), "installed-files.txt")
    49  	installedFilesRef := resolver.RelativeFileByPath(metadataLocation, installedFilesPath)
    50  
    51  	if installedFilesRef != nil {
    52  		sources = append(sources, installedFilesRef.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
    53  
    54  		installedFilesContents, err := resolver.FileContentsByLocation(*installedFilesRef)
    55  		if err != nil {
    56  			return nil, nil, err
    57  		}
    58  		defer internal.CloseAndLogError(installedFilesContents, installedFilesPath)
    59  
    60  		// parse the installed-files contents
    61  		installedFiles, err := parseInstalledFiles(installedFilesContents, metadataLocation.RealPath, sitePackagesRootPath)
    62  		if err != nil {
    63  			log.Warnf("unable to parse installed-files.txt for python package=%+v: %w", metadataLocation.RealPath, err)
    64  			return files, sources, nil
    65  		}
    66  
    67  		files = append(files, installedFiles...)
    68  	}
    69  	return files, sources, nil
    70  }
    71  
    72  // fetchRecordFiles finds a corresponding RECORD file for the given python package metadata file and returns the set of file records contained.
    73  func fetchRecordFiles(resolver file.Resolver, metadataLocation file.Location) (files []pkg.PythonFileRecord, sources []file.Location, err error) {
    74  	// we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory
    75  	// or for an image... for an image the METADATA file may be present within multiple layers, so it is important
    76  	// to reconcile the RECORD path to the same layer (or the next adjacent lower layer).
    77  
    78  	// find the RECORD file relative to the directory where the METADATA file resides (in path AND layer structure)
    79  	recordPath := filepath.Join(filepath.Dir(metadataLocation.RealPath), "RECORD")
    80  	recordRef := resolver.RelativeFileByPath(metadataLocation, recordPath)
    81  
    82  	if recordRef != nil {
    83  		sources = append(sources, recordRef.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
    84  
    85  		recordContents, err := resolver.FileContentsByLocation(*recordRef)
    86  		if err != nil {
    87  			return nil, nil, err
    88  		}
    89  		defer internal.CloseAndLogError(recordContents, recordPath)
    90  
    91  		// parse the record contents
    92  		records := parseWheelOrEggRecord(recordContents)
    93  
    94  		files = append(files, records...)
    95  	}
    96  	return files, sources, nil
    97  }
    98  
    99  // fetchTopLevelPackages finds a corresponding top_level.txt file for the given python package metadata file and returns the set of package names contained.
   100  func fetchTopLevelPackages(resolver file.Resolver, metadataLocation file.Location) (pkgs []string, sources []file.Location, err error) {
   101  	// a top_level.txt file specifies the python top-level packages (provided by this python package) installed into site-packages
   102  	parentDir := filepath.Dir(metadataLocation.RealPath)
   103  	topLevelPath := filepath.Join(parentDir, "top_level.txt")
   104  	topLevelLocation := resolver.RelativeFileByPath(metadataLocation, topLevelPath)
   105  
   106  	if topLevelLocation == nil {
   107  		return nil, nil, nil
   108  	}
   109  
   110  	sources = append(sources, topLevelLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
   111  
   112  	topLevelContents, err := resolver.FileContentsByLocation(*topLevelLocation)
   113  	if err != nil {
   114  		return nil, nil, err
   115  	}
   116  	defer internal.CloseAndLogError(topLevelContents, topLevelLocation.AccessPath)
   117  
   118  	scanner := bufio.NewScanner(topLevelContents)
   119  	for scanner.Scan() {
   120  		pkgs = append(pkgs, scanner.Text())
   121  	}
   122  
   123  	if err := scanner.Err(); err != nil {
   124  		return nil, nil, fmt.Errorf("could not read python package top_level.txt: %w", err)
   125  	}
   126  
   127  	return pkgs, sources, nil
   128  }
   129  
   130  type directURLOrigin struct {
   131  	URL         string      `json:"url"`
   132  	VCSInfo     vcsInfo     `json:"vcs_info"`
   133  	ArchiveInfo archiveInfo `json:"archive_info"`
   134  	DirInfo     dirInfo     `json:"dir_info"`
   135  }
   136  
   137  type dirInfo struct {
   138  	Editable bool `json:"editable"`
   139  }
   140  
   141  type archiveInfo struct {
   142  	Hash string `json:"hash"`
   143  }
   144  
   145  type vcsInfo struct {
   146  	CommitID          string `json:"commit_id"`
   147  	VCS               string `json:"vcs"`
   148  	RequestedRevision string `json:"requested_revision"`
   149  }
   150  
   151  func fetchDirectURLData(resolver file.Resolver, metadataLocation file.Location) (d *pkg.PythonDirectURLOriginInfo, sources []file.Location, err error) {
   152  	parentDir := filepath.Dir(metadataLocation.RealPath)
   153  	directURLPath := filepath.Join(parentDir, "direct_url.json")
   154  	directURLLocation := resolver.RelativeFileByPath(metadataLocation, directURLPath)
   155  
   156  	if directURLLocation == nil {
   157  		return nil, nil, nil
   158  	}
   159  
   160  	sources = append(sources, directURLLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
   161  
   162  	directURLContents, err := resolver.FileContentsByLocation(*directURLLocation)
   163  	if err != nil {
   164  		return nil, nil, err
   165  	}
   166  	defer internal.CloseAndLogError(directURLContents, directURLLocation.AccessPath)
   167  
   168  	buffer, err := io.ReadAll(directURLContents)
   169  	if err != nil {
   170  		return nil, nil, err
   171  	}
   172  
   173  	var directURLJson directURLOrigin
   174  	if err := json.Unmarshal(buffer, &directURLJson); err != nil {
   175  		return nil, nil, err
   176  	}
   177  
   178  	return &pkg.PythonDirectURLOriginInfo{
   179  		URL:      directURLJson.URL,
   180  		CommitID: directURLJson.VCSInfo.CommitID,
   181  		VCS:      directURLJson.VCSInfo.VCS,
   182  	}, sources, nil
   183  }
   184  
   185  // assembleEggOrWheelMetadata discovers and accumulates python package metadata from multiple file sources and returns a single metadata object as well as a list of files where the metadata was derived from.
   186  func assembleEggOrWheelMetadata(resolver file.Resolver, metadataLocation file.Location) (*parsedData, []file.Location, error) {
   187  	var sources = []file.Location{
   188  		metadataLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
   189  	}
   190  
   191  	metadataContents, err := resolver.FileContentsByLocation(metadataLocation)
   192  	if err != nil {
   193  		return nil, nil, err
   194  	}
   195  	defer internal.CloseAndLogError(metadataContents, metadataLocation.AccessPath)
   196  
   197  	pd, err := parseWheelOrEggMetadata(metadataLocation.RealPath, metadataContents)
   198  	if err != nil {
   199  		return nil, nil, err
   200  	}
   201  
   202  	if pd.Name == "" {
   203  		return nil, nil, nil
   204  	}
   205  
   206  	// attach any python files found for the given wheel/egg installation
   207  	r, s, err := fetchRecordFiles(resolver, metadataLocation)
   208  	if err != nil {
   209  		return nil, nil, err
   210  	}
   211  	if len(r) == 0 {
   212  		r, s, err = fetchInstalledFiles(resolver, metadataLocation, pd.SitePackagesRootPath)
   213  		if err != nil {
   214  			return nil, nil, err
   215  		}
   216  	}
   217  
   218  	sources = append(sources, s...)
   219  	pd.Files = r
   220  
   221  	// attach any top-level package names found for the given wheel/egg installation
   222  	p, s, err := fetchTopLevelPackages(resolver, metadataLocation)
   223  	if err != nil {
   224  		return nil, nil, err
   225  	}
   226  	sources = append(sources, s...)
   227  	pd.TopLevelPackages = p
   228  
   229  	// attach any direct-url package data found for the given wheel/egg installation
   230  	d, s, err := fetchDirectURLData(resolver, metadataLocation)
   231  	if err != nil {
   232  		return nil, nil, err
   233  	}
   234  
   235  	sources = append(sources, s...)
   236  	pd.DirectURLOrigin = d
   237  	return &pd, sources, nil
   238  }