github.com/lineaje-labs/syft@v0.98.1-0.20231227153149-9e393f60ff1b/syft/pkg/cataloger/python/parse_wheel_egg.go (about)

     1  package python
     2  
     3  import (
     4  	"bufio"
     5  	"encoding/json"
     6  	"fmt"
     7  	"io"
     8  	"path/filepath"
     9  
    10  	"github.com/anchore/syft/syft/artifact"
    11  	"github.com/anchore/syft/syft/file"
    12  	"github.com/anchore/syft/syft/pkg"
    13  	"github.com/anchore/syft/syft/pkg/cataloger/generic"
    14  	"github.com/lineaje-labs/syft/internal"
    15  	"github.com/lineaje-labs/syft/internal/log"
    16  )
    17  
    18  // parseWheelOrEgg takes the primary metadata file reference and returns the python package it represents. Contained
    19  // fields are governed by the PyPA core metadata specification (https://packaging.python.org/en/latest/specifications/core-metadata/).
    20  func parseWheelOrEgg(
    21  	resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser,
    22  ) ([]pkg.Package, []artifact.Relationship, error) {
    23  	pd, sources, err := assembleEggOrWheelMetadata(resolver, reader.Location)
    24  	if err != nil {
    25  		return nil, nil, err
    26  	}
    27  	if pd == nil {
    28  		return nil, nil, nil
    29  	}
    30  
    31  	// This can happen for Python 2.7 where it is reported from an egg-info, but Python is
    32  	// the actual runtime, it isn't a "package". The special-casing here allows to skip it
    33  	if pd.Name == "Python" {
    34  		return nil, nil, nil
    35  	}
    36  
    37  	pkgs := []pkg.Package{newPackageForPackage(*pd, sources...)}
    38  
    39  	return pkgs, nil, nil
    40  }
    41  
    42  // fetchRecordFiles finds a corresponding installed-files.txt file for the given python package metadata file and returns the set of file records contained.
    43  func fetchInstalledFiles(
    44  	resolver file.Resolver, metadataLocation file.Location, sitePackagesRootPath string,
    45  ) (files []pkg.PythonFileRecord, sources []file.Location, err error) {
    46  	// we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory
    47  	// or for an image... for an image the METADATA file may be present within multiple layers, so it is important
    48  	// to reconcile the installed-files.txt path to the same layer (or the next adjacent lower layer).
    49  
    50  	// find the installed-files.txt file relative to the directory where the METADATA file resides (in path AND layer structure)
    51  	installedFilesPath := filepath.Join(filepath.Dir(metadataLocation.RealPath), "installed-files.txt")
    52  	installedFilesRef := resolver.RelativeFileByPath(metadataLocation, installedFilesPath)
    53  
    54  	if installedFilesRef != nil {
    55  		sources = append(sources, installedFilesRef.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
    56  
    57  		installedFilesContents, err := resolver.FileContentsByLocation(*installedFilesRef)
    58  		if err != nil {
    59  			return nil, nil, err
    60  		}
    61  		defer internal.CloseAndLogError(installedFilesContents, installedFilesPath)
    62  
    63  		// parse the installed-files contents
    64  		installedFiles, err := parseInstalledFiles(installedFilesContents, metadataLocation.RealPath, sitePackagesRootPath)
    65  		if err != nil {
    66  			log.Warnf("unable to parse installed-files.txt for python package=%+v: %w", metadataLocation.RealPath, err)
    67  			return files, sources, nil
    68  		}
    69  
    70  		files = append(files, installedFiles...)
    71  	}
    72  	return files, sources, nil
    73  }
    74  
    75  // fetchRecordFiles finds a corresponding RECORD file for the given python package metadata file and returns the set of file records contained.
    76  func fetchRecordFiles(
    77  	resolver file.Resolver, metadataLocation file.Location,
    78  ) (files []pkg.PythonFileRecord, sources []file.Location, err error) {
    79  	// we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory
    80  	// or for an image... for an image the METADATA file may be present within multiple layers, so it is important
    81  	// to reconcile the RECORD path to the same layer (or the next adjacent lower layer).
    82  
    83  	// find the RECORD file relative to the directory where the METADATA file resides (in path AND layer structure)
    84  	recordPath := filepath.Join(filepath.Dir(metadataLocation.RealPath), "RECORD")
    85  	recordRef := resolver.RelativeFileByPath(metadataLocation, recordPath)
    86  
    87  	if recordRef != nil {
    88  		sources = append(sources, recordRef.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
    89  
    90  		recordContents, err := resolver.FileContentsByLocation(*recordRef)
    91  		if err != nil {
    92  			return nil, nil, err
    93  		}
    94  		defer internal.CloseAndLogError(recordContents, recordPath)
    95  
    96  		// parse the record contents
    97  		records := parseWheelOrEggRecord(recordContents)
    98  
    99  		files = append(files, records...)
   100  	}
   101  	return files, sources, nil
   102  }
   103  
   104  // fetchTopLevelPackages finds a corresponding top_level.txt file for the given python package metadata file and returns the set of package names contained.
   105  func fetchTopLevelPackages(
   106  	resolver file.Resolver, metadataLocation file.Location,
   107  ) (pkgs []string, sources []file.Location, err error) {
   108  	// a top_level.txt file specifies the python top-level packages (provided by this python package) installed into site-packages
   109  	parentDir := filepath.Dir(metadataLocation.RealPath)
   110  	topLevelPath := filepath.Join(parentDir, "top_level.txt")
   111  	topLevelLocation := resolver.RelativeFileByPath(metadataLocation, topLevelPath)
   112  
   113  	if topLevelLocation == nil {
   114  		return nil, nil, nil
   115  	}
   116  
   117  	sources = append(sources, topLevelLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
   118  
   119  	topLevelContents, err := resolver.FileContentsByLocation(*topLevelLocation)
   120  	if err != nil {
   121  		return nil, nil, err
   122  	}
   123  	defer internal.CloseAndLogError(topLevelContents, topLevelLocation.AccessPath)
   124  
   125  	scanner := bufio.NewScanner(topLevelContents)
   126  	for scanner.Scan() {
   127  		pkgs = append(pkgs, scanner.Text())
   128  	}
   129  
   130  	if err := scanner.Err(); err != nil {
   131  		return nil, nil, fmt.Errorf("could not read python package top_level.txt: %w", err)
   132  	}
   133  
   134  	return pkgs, sources, nil
   135  }
   136  
   137  type directURLOrigin struct {
   138  	URL         string      `json:"url"`
   139  	VCSInfo     vcsInfo     `json:"vcs_info"`
   140  	ArchiveInfo archiveInfo `json:"archive_info"`
   141  	DirInfo     dirInfo     `json:"dir_info"`
   142  }
   143  
   144  type dirInfo struct {
   145  	Editable bool `json:"editable"`
   146  }
   147  
   148  type archiveInfo struct {
   149  	Hash string `json:"hash"`
   150  }
   151  
   152  type vcsInfo struct {
   153  	CommitID          string `json:"commit_id"`
   154  	VCS               string `json:"vcs"`
   155  	RequestedRevision string `json:"requested_revision"`
   156  }
   157  
   158  func fetchDirectURLData(
   159  	resolver file.Resolver, metadataLocation file.Location,
   160  ) (d *pkg.PythonDirectURLOriginInfo, sources []file.Location, err error) {
   161  	parentDir := filepath.Dir(metadataLocation.RealPath)
   162  	directURLPath := filepath.Join(parentDir, "direct_url.json")
   163  	directURLLocation := resolver.RelativeFileByPath(metadataLocation, directURLPath)
   164  
   165  	if directURLLocation == nil {
   166  		return nil, nil, nil
   167  	}
   168  
   169  	sources = append(sources, directURLLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
   170  
   171  	directURLContents, err := resolver.FileContentsByLocation(*directURLLocation)
   172  	if err != nil {
   173  		return nil, nil, err
   174  	}
   175  	defer internal.CloseAndLogError(directURLContents, directURLLocation.AccessPath)
   176  
   177  	buffer, err := io.ReadAll(directURLContents)
   178  	if err != nil {
   179  		return nil, nil, err
   180  	}
   181  
   182  	var directURLJson directURLOrigin
   183  	if err := json.Unmarshal(buffer, &directURLJson); err != nil {
   184  		return nil, nil, err
   185  	}
   186  
   187  	return &pkg.PythonDirectURLOriginInfo{
   188  		URL:      directURLJson.URL,
   189  		CommitID: directURLJson.VCSInfo.CommitID,
   190  		VCS:      directURLJson.VCSInfo.VCS,
   191  	}, sources, nil
   192  }
   193  
   194  // assembleEggOrWheelMetadata discovers and accumulates python package metadata from multiple file sources and returns a single metadata object as well as a list of files where the metadata was derived from.
   195  func assembleEggOrWheelMetadata(
   196  	resolver file.Resolver, metadataLocation file.Location,
   197  ) (*parsedData, []file.Location, error) {
   198  	var sources = []file.Location{
   199  		metadataLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
   200  	}
   201  
   202  	metadataContents, err := resolver.FileContentsByLocation(metadataLocation)
   203  	if err != nil {
   204  		return nil, nil, err
   205  	}
   206  	defer internal.CloseAndLogError(metadataContents, metadataLocation.AccessPath)
   207  
   208  	pd, err := parseWheelOrEggMetadata(metadataLocation.RealPath, metadataContents)
   209  	if err != nil {
   210  		return nil, nil, err
   211  	}
   212  
   213  	if pd.Name == "" {
   214  		return nil, nil, nil
   215  	}
   216  
   217  	// attach any python files found for the given wheel/egg installation
   218  	r, s, err := fetchRecordFiles(resolver, metadataLocation)
   219  	if err != nil {
   220  		return nil, nil, err
   221  	}
   222  	if len(r) == 0 {
   223  		r, s, err = fetchInstalledFiles(resolver, metadataLocation, pd.SitePackagesRootPath)
   224  		if err != nil {
   225  			return nil, nil, err
   226  		}
   227  	}
   228  
   229  	sources = append(sources, s...)
   230  	pd.Files = r
   231  
   232  	// attach any top-level package names found for the given wheel/egg installation
   233  	p, s, err := fetchTopLevelPackages(resolver, metadataLocation)
   234  	if err != nil {
   235  		return nil, nil, err
   236  	}
   237  	sources = append(sources, s...)
   238  	pd.TopLevelPackages = p
   239  
   240  	// attach any direct-url package data found for the given wheel/egg installation
   241  	d, s, err := fetchDirectURLData(resolver, metadataLocation)
   242  	if err != nil {
   243  		return nil, nil, err
   244  	}
   245  
   246  	sources = append(sources, s...)
   247  	pd.DirectURLOrigin = d
   248  	return &pd, sources, nil
   249  }