github.com/anchore/syft@v1.38.2/syft/pkg/cataloger/python/parse_wheel_egg.go (about)

     1  package python
     2  
     3  import (
     4  	"bufio"
     5  	"context"
     6  	"encoding/json"
     7  	"io"
     8  	"path"
     9  	"path/filepath"
    10  	"strings"
    11  
    12  	"github.com/scylladb/go-set/strset"
    13  
    14  	"github.com/anchore/syft/internal"
    15  	"github.com/anchore/syft/internal/log"
    16  	"github.com/anchore/syft/internal/unknown"
    17  	"github.com/anchore/syft/syft/artifact"
    18  	"github.com/anchore/syft/syft/file"
    19  	"github.com/anchore/syft/syft/pkg"
    20  	"github.com/anchore/syft/syft/pkg/cataloger/generic"
    21  	"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
    22  )
    23  
    24  // parseWheelOrEgg takes the primary metadata file reference and returns the python package it represents. Contained
    25  // fields are governed by the PyPA core metadata specification (https://packaging.python.org/en/latest/specifications/core-metadata/).
    26  func parseWheelOrEgg(ctx context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
    27  	pd, sources, err := assembleEggOrWheelMetadata(resolver, reader.Location)
    28  
    29  	if pd == nil {
    30  		return nil, nil, err
    31  	}
    32  
    33  	// This can happen for Python 2.7 where it is reported from an egg-info, but Python is
    34  	// the actual runtime, it isn't a "package". The special-casing here allows to skip it
    35  	if pd.Name == "Python" {
    36  		return nil, nil, err
    37  	}
    38  
    39  	pkgs := []pkg.Package{
    40  		newPackageForPackage(
    41  			*pd,
    42  			findLicenses(ctx, resolver, *pd),
    43  			sources...,
    44  		),
    45  	}
    46  
    47  	return pkgs, nil, err
    48  }
    49  
    50  // fetchInstalledFiles finds a corresponding installed-files.txt file for the given python package metadata file and returns the set of file records contained.
    51  func fetchInstalledFiles(resolver file.Resolver, metadataLocation file.Location, sitePackagesRootPath string) (files []pkg.PythonFileRecord, sources []file.Location, retErr error) {
    52  	// we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory
    53  	// or for an image... for an image the METADATA file may be present within multiple layers, so it is important
    54  	// to reconcile the installed-files.txt path to the same layer (or the next adjacent lower layer).
    55  
    56  	// find the installed-files.txt file relative to the directory where the METADATA file resides (in path AND layer structure)
    57  	installedFilesPath := filepath.Join(filepath.Dir(metadataLocation.RealPath), "installed-files.txt")
    58  	installedFilesRef := resolver.RelativeFileByPath(metadataLocation, installedFilesPath)
    59  
    60  	if installedFilesRef != nil {
    61  		sources = append(sources, installedFilesRef.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
    62  
    63  		installedFilesContents, err := resolver.FileContentsByLocation(*installedFilesRef)
    64  		if err != nil {
    65  			return nil, nil, err
    66  		}
    67  		defer internal.CloseAndLogError(installedFilesContents, installedFilesPath)
    68  
    69  		// parse the installed-files contents
    70  		installedFiles, err := parseInstalledFiles(installedFilesContents, metadataLocation.RealPath, sitePackagesRootPath)
    71  		if err != nil {
    72  			retErr = unknown.Newf(*installedFilesRef, "unable to parse installed-files.txt for python package: %w", retErr)
    73  		}
    74  
    75  		files = append(files, installedFiles...)
    76  	}
    77  	return files, sources, nil
    78  }
    79  
    80  // fetchRecordFiles finds a corresponding RECORD file for the given python package metadata file and returns the set of file records contained.
    81  func fetchRecordFiles(resolver file.Resolver, metadataLocation file.Location) (files []pkg.PythonFileRecord, sources []file.Location, retErr error) {
    82  	// we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory
    83  	// or for an image... for an image the METADATA file may be present within multiple layers, so it is important
    84  	// to reconcile the RECORD path to the same layer (or the next adjacent lower layer).
    85  
    86  	// find the RECORD file relative to the directory where the METADATA file resides (in path AND layer structure)
    87  	recordPath := filepath.Join(filepath.Dir(metadataLocation.RealPath), "RECORD")
    88  	recordRef := resolver.RelativeFileByPath(metadataLocation, recordPath)
    89  
    90  	if recordRef != nil {
    91  		sources = append(sources, recordRef.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
    92  
    93  		recordContents, err := resolver.FileContentsByLocation(*recordRef)
    94  		if err != nil {
    95  			return nil, nil, err
    96  		}
    97  		defer internal.CloseAndLogError(recordContents, recordPath)
    98  
    99  		// parse the record contents
   100  		var records []pkg.PythonFileRecord
   101  		records, retErr = parseWheelOrEggRecord(file.NewLocationReadCloser(*recordRef, recordContents))
   102  
   103  		files = append(files, records...)
   104  	}
   105  	return files, sources, retErr
   106  }
   107  
   108  // fetchTopLevelPackages finds a corresponding top_level.txt file for the given python package metadata file and returns the set of package names contained.
   109  func fetchTopLevelPackages(resolver file.Resolver, metadataLocation file.Location) (pkgs []string, sources []file.Location, err error) {
   110  	// a top_level.txt file specifies the python top-level packages (provided by this python package) installed into site-packages
   111  	parentDir := filepath.Dir(metadataLocation.RealPath)
   112  	topLevelPath := filepath.Join(parentDir, "top_level.txt")
   113  	topLevelLocation := resolver.RelativeFileByPath(metadataLocation, topLevelPath)
   114  
   115  	if topLevelLocation == nil {
   116  		return nil, nil, nil
   117  	}
   118  
   119  	sources = append(sources, topLevelLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
   120  
   121  	topLevelContents, err := resolver.FileContentsByLocation(*topLevelLocation)
   122  	if err != nil {
   123  		return nil, nil, err
   124  	}
   125  	defer internal.CloseAndLogError(topLevelContents, topLevelLocation.AccessPath)
   126  
   127  	scanner := bufio.NewScanner(topLevelContents)
   128  	for scanner.Scan() {
   129  		pkgs = append(pkgs, scanner.Text())
   130  	}
   131  
   132  	if err := scanner.Err(); err != nil {
   133  		return nil, nil, err
   134  	}
   135  
   136  	return pkgs, sources, nil
   137  }
   138  
   139  type directURLOrigin struct {
   140  	URL         string      `json:"url"`
   141  	VCSInfo     vcsInfo     `json:"vcs_info"`
   142  	ArchiveInfo archiveInfo `json:"archive_info"`
   143  	DirInfo     dirInfo     `json:"dir_info"`
   144  }
   145  
   146  type dirInfo struct {
   147  	Editable bool `json:"editable"`
   148  }
   149  
   150  type archiveInfo struct {
   151  	Hash string `json:"hash"`
   152  }
   153  
   154  type vcsInfo struct {
   155  	CommitID          string `json:"commit_id"`
   156  	VCS               string `json:"vcs"`
   157  	RequestedRevision string `json:"requested_revision"`
   158  }
   159  
   160  func fetchDirectURLData(resolver file.Resolver, metadataLocation file.Location) (d *pkg.PythonDirectURLOriginInfo, sources []file.Location, err error) {
   161  	parentDir := filepath.Dir(metadataLocation.RealPath)
   162  	directURLPath := filepath.Join(parentDir, "direct_url.json")
   163  	directURLLocation := resolver.RelativeFileByPath(metadataLocation, directURLPath)
   164  
   165  	if directURLLocation == nil {
   166  		return nil, nil, nil
   167  	}
   168  
   169  	sources = append(sources, directURLLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
   170  
   171  	directURLContents, err := resolver.FileContentsByLocation(*directURLLocation)
   172  	if err != nil {
   173  		return nil, nil, err
   174  	}
   175  	defer internal.CloseAndLogError(directURLContents, directURLLocation.AccessPath)
   176  
   177  	buffer, err := io.ReadAll(directURLContents)
   178  	if err != nil {
   179  		return nil, nil, err
   180  	}
   181  
   182  	var directURLJson directURLOrigin
   183  	if err := json.Unmarshal(buffer, &directURLJson); err != nil {
   184  		return nil, nil, err
   185  	}
   186  
   187  	return &pkg.PythonDirectURLOriginInfo{
   188  		URL:      directURLJson.URL,
   189  		CommitID: directURLJson.VCSInfo.CommitID,
   190  		VCS:      directURLJson.VCSInfo.VCS,
   191  	}, sources, nil
   192  }
   193  
   194  // assembleEggOrWheelMetadata discovers and accumulates python package metadata from multiple file sources and returns a single metadata object as well as a list of files where the metadata was derived from.
   195  func assembleEggOrWheelMetadata(resolver file.Resolver, metadataLocation file.Location) (*parsedData, []file.Location, error) {
   196  	var sources = []file.Location{
   197  		metadataLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
   198  	}
   199  
   200  	metadataContents, err := resolver.FileContentsByLocation(metadataLocation)
   201  	if err != nil {
   202  		return nil, nil, err
   203  	}
   204  	defer internal.CloseAndLogError(metadataContents, metadataLocation.AccessPath)
   205  
   206  	pd, err := parseWheelOrEggMetadata(file.NewLocationReadCloser(metadataLocation, metadataContents))
   207  	if err != nil {
   208  		return nil, nil, err
   209  	}
   210  
   211  	if pd.Name == "" {
   212  		return nil, nil, nil
   213  	}
   214  
   215  	// attach any python files found for the given wheel/egg installation
   216  	var errs error
   217  	r, s, err := fetchRecordFiles(resolver, metadataLocation)
   218  	if err != nil {
   219  		errs = unknown.Joinf(errs, "could not read python package RECORD file: %w", err)
   220  	}
   221  	if len(r) == 0 {
   222  		r, s, err = fetchInstalledFiles(resolver, metadataLocation, pd.SitePackagesRootPath)
   223  		if err != nil {
   224  			errs = unknown.Joinf(errs, "could not read python package installed-files.txt: %w", err)
   225  		}
   226  	}
   227  
   228  	sources = append(sources, s...)
   229  	pd.Files = r
   230  
   231  	// attach any top-level package names found for the given wheel/egg installation
   232  	p, s, err := fetchTopLevelPackages(resolver, metadataLocation)
   233  	if err != nil {
   234  		errs = unknown.Joinf(errs, "could not read python package top_level.txt: %w", err)
   235  	}
   236  	sources = append(sources, s...)
   237  	pd.TopLevelPackages = p
   238  
   239  	// attach any direct-url package data found for the given wheel/egg installation
   240  	d, s, err := fetchDirectURLData(resolver, metadataLocation)
   241  	if err != nil {
   242  		errs = unknown.Joinf(errs, "could not read python package direct_url.json: %w", err)
   243  	}
   244  
   245  	sources = append(sources, s...)
   246  	pd.DirectURLOrigin = d
   247  	return &pd, sources, errs
   248  }
   249  
   250  func findLicenses(ctx context.Context, resolver file.Resolver, m parsedData) pkg.LicenseSet {
   251  	var out []pkg.License
   252  
   253  	licenseLocations := file.NewLocationSet()
   254  	if m.LicenseFilePath != "" {
   255  		locs, err := resolver.FilesByPath(m.LicenseFilePath)
   256  		if err != nil {
   257  			log.WithFields("error", err, "path", m.LicenseFilePath).Trace("unable to resolve python license file")
   258  		} else {
   259  			licenseLocations.Add(locs...)
   260  		}
   261  	}
   262  
   263  	switch {
   264  	case m.LicenseExpression != "" || m.Licenses != "":
   265  		out = licenses.NewFromValues(ctx, licenseLocations.ToSlice(), m.LicenseExpression, m.Licenses)
   266  	case !licenseLocations.Empty():
   267  		out = licenses.FindAtLocations(ctx, resolver, licenseLocations.ToSlice()...)
   268  
   269  	default:
   270  		// search for known license paths from RECORDS file
   271  		parent := path.Base(path.Dir(m.DistInfoLocation.Path()))
   272  		candidatePaths := strset.New()
   273  		for _, f := range m.Files {
   274  			if !strings.HasPrefix(f.Path, parent) || strings.Count(f.Path, "/") > 1 {
   275  				continue
   276  			}
   277  
   278  			if licenses.IsLicenseFile(filepath.Base(f.Path)) {
   279  				candidatePaths.Add(path.Join(m.SitePackagesRootPath, f.Path))
   280  			}
   281  		}
   282  
   283  		out = licenses.FindAtPaths(ctx, resolver, candidatePaths.List()...)
   284  	}
   285  	return pkg.NewLicenseSet(out...)
   286  }