github.com/google/osv-scalibr@v0.4.1/extractor/filesystem/language/python/wheelegg/wheelegg.go (about)

     1  // Copyright 2025 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package wheelegg extracts wheel and egg files.
    16  package wheelegg
    17  
    18  import (
    19  	"archive/zip"
    20  	"bufio"
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"io"
    25  	"net/textproto"
    26  	"path/filepath"
    27  	"strings"
    28  
    29  	"github.com/google/osv-scalibr/extractor"
    30  	"github.com/google/osv-scalibr/extractor/filesystem"
    31  	"github.com/google/osv-scalibr/extractor/filesystem/internal/units"
    32  	"github.com/google/osv-scalibr/extractor/filesystem/simplefileapi"
    33  	scalibrfs "github.com/google/osv-scalibr/fs"
    34  	"github.com/google/osv-scalibr/inventory"
    35  	"github.com/google/osv-scalibr/plugin"
    36  	"github.com/google/osv-scalibr/purl"
    37  	"github.com/google/osv-scalibr/stats"
    38  )
    39  
const (
	// Name is the unique name of this extractor.
	Name = "python/wheelegg"

	// defaultMaxFileSizeBytes is the file size limit used by DefaultConfig.
	// FileRequired returns false for matching files that are larger than the
	// configured limit, so such files are never handed to Extract.
	defaultMaxFileSizeBytes = 100 * units.MiB
)
    48  
// Extractor extracts python packages from wheel/egg files.
//
// maxFileSizeBytes is the size limit enforced by FileRequired; a value of zero
// or less disables the check. stats receives FileRequired and Extract metrics
// and may be nil, in which case nothing is reported.
type Extractor struct {
	maxFileSizeBytes int64
	stats            stats.Collector
}
    54  
// Config is the configuration for the Extractor.
type Config struct {
	// MaxFileSizeBytes is the maximum file size this extractor will unmarshal. If
	// `FileRequired` gets a bigger file, it will return false. A value of zero or
	// less disables the size check.
	MaxFileSizeBytes int64
	// Stats is a stats collector for reporting metrics. It may be nil, in which
	// case no metrics are reported.
	Stats stats.Collector
}
    63  
    64  // DefaultConfig returns the default configuration for the wheel/egg extractor.
    65  func DefaultConfig() Config {
    66  	return Config{
    67  		MaxFileSizeBytes: defaultMaxFileSizeBytes,
    68  		Stats:            nil,
    69  	}
    70  }
    71  
    72  // New returns a wheel/egg extractor.
    73  //
    74  // For most use cases, initialize with:
    75  // ```
    76  // e := New(DefaultConfig())
    77  // ```
    78  func New(cfg Config) *Extractor {
    79  	return &Extractor{
    80  		maxFileSizeBytes: cfg.MaxFileSizeBytes,
    81  		stats:            cfg.Stats,
    82  	}
    83  }
    84  
    85  // NewDefault returns an extractor with the default config settings.
    86  func NewDefault() filesystem.Extractor { return New(DefaultConfig()) }
    87  
    88  // Name of the extractor.
    89  func (e Extractor) Name() string { return Name }
    90  
    91  // Version of the extractor.
    92  func (e Extractor) Version() int { return 0 }
    93  
    94  // Requirements of the extractor.
    95  func (e Extractor) Requirements() *plugin.Capabilities { return &plugin.Capabilities{} }
    96  
    97  var (
    98  	requiredFiles = []string{
    99  		// Metadata format
   100  		"EGG-INFO/PKG-INFO",
   101  		".egg-info",
   102  		".egg-info/PKG-INFO",
   103  		".dist-info/METADATA",
   104  		// zip file with Metadata files inside.
   105  		".egg",
   106  		".whl",
   107  	}
   108  )
   109  
   110  // FileRequired returns true if the specified file matches python Metadata file
   111  // patterns.
   112  func (e Extractor) FileRequired(api filesystem.FileAPI) bool {
   113  	path := api.Path()
   114  	// For Windows
   115  	normalizedPath := filepath.ToSlash(path)
   116  
   117  	for _, r := range requiredFiles {
   118  		if strings.HasSuffix(normalizedPath, r) {
   119  			fileinfo, err := api.Stat()
   120  			if err != nil {
   121  				return false
   122  			}
   123  
   124  			// We only want to skip the file for being too large if it is a relevant
   125  			// file at all, so we check the file size after checking the file suffix.
   126  			if e.maxFileSizeBytes > 0 && fileinfo.Size() > e.maxFileSizeBytes {
   127  				e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultSizeLimitExceeded)
   128  				return false
   129  			}
   130  
   131  			e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultOK)
   132  			return true
   133  		}
   134  	}
   135  	return false
   136  }
   137  
   138  func (e Extractor) reportFileRequired(path string, fileSizeBytes int64, result stats.FileRequiredResult) {
   139  	if e.stats == nil {
   140  		return
   141  	}
   142  	e.stats.AfterFileRequired(e.Name(), &stats.FileRequiredStats{
   143  		Path:          path,
   144  		Result:        result,
   145  		FileSizeBytes: fileSizeBytes,
   146  	})
   147  }
   148  
   149  // Extract extracts packages from wheel/egg files passed through the scan input.
   150  // For .egg files, input.Info.Size() is required to unzip the file.
   151  func (e Extractor) Extract(ctx context.Context, input *filesystem.ScanInput) (inventory.Inventory, error) {
   152  	var err error
   153  	var pkgs []*extractor.Package
   154  	if strings.HasSuffix(input.Path, ".egg") || strings.HasSuffix(input.Path, ".whl") {
   155  		// TODO(b/280417821): In case extractZip returns no packages, we could parse the filename.
   156  		pkgs, err = e.extractZip(ctx, input)
   157  	} else {
   158  		var p *extractor.Package
   159  		if p, err = e.extractSingleFile(input.Reader, input.Path); p != nil {
   160  			pkgs = []*extractor.Package{p}
   161  		}
   162  	}
   163  
   164  	if e.stats != nil {
   165  		var fileSizeBytes int64
   166  		if input.Info != nil {
   167  			fileSizeBytes = input.Info.Size()
   168  		}
   169  		e.stats.AfterFileExtracted(e.Name(), &stats.FileExtractedStats{
   170  			Path:          input.Path,
   171  			Result:        filesystem.ExtractorErrorToFileExtractedResult(err),
   172  			FileSizeBytes: fileSizeBytes,
   173  		})
   174  	}
   175  	return inventory.Inventory{Packages: pkgs}, err
   176  }
   177  
// ErrSizeNotSet is returned when a .egg or .whl archive is extracted but
// input.Info is nil, so Info.Size(), which is needed to open the zip, is not
// available.
var ErrSizeNotSet = errors.New("input.Info is nil, but should have Size set")
   180  
   181  func (e Extractor) extractZip(ctx context.Context, input *filesystem.ScanInput) ([]*extractor.Package, error) {
   182  	r, err := scalibrfs.NewReaderAt(input.Reader)
   183  	if err != nil {
   184  		return nil, fmt.Errorf("newReaderAt: %w", err)
   185  	}
   186  
   187  	if input.Info == nil {
   188  		return nil, ErrSizeNotSet
   189  	}
   190  	s := input.Info.Size()
   191  	zr, err := zip.NewReader(r, s)
   192  	if err != nil {
   193  		return nil, fmt.Errorf("zip.NewReader: %w", err)
   194  	}
   195  	pkgs := []*extractor.Package{}
   196  	for _, f := range zr.File {
   197  		if ctx.Err() != nil {
   198  			return nil, ctx.Err()
   199  		}
   200  
   201  		if !e.FileRequired(simplefileapi.New(f.Name, f.FileInfo())) {
   202  			continue
   203  		}
   204  		p, err := e.openAndExtract(f, input)
   205  		if err != nil {
   206  			return pkgs, err
   207  		}
   208  		pkgs = append(pkgs, p)
   209  	}
   210  	return pkgs, nil
   211  }
   212  
   213  func (e Extractor) openAndExtract(f *zip.File, input *filesystem.ScanInput) (*extractor.Package, error) {
   214  	r, err := f.Open()
   215  	if err != nil {
   216  		return nil, fmt.Errorf("f.Open(%s): %w", f.Name, err)
   217  	}
   218  	defer r.Close()
   219  
   220  	// TODO(b/280438976): Store the path inside the zip file.
   221  	p, err := e.extractSingleFile(r, input.Path)
   222  	if err != nil {
   223  		return nil, err
   224  	}
   225  
   226  	return p, nil
   227  }
   228  
   229  func (e Extractor) extractSingleFile(r io.Reader, path string) (*extractor.Package, error) {
   230  	p, err := parse(r)
   231  	if err != nil {
   232  		return nil, fmt.Errorf("wheelegg.parse: %w", err)
   233  	}
   234  
   235  	p.Locations = []string{path}
   236  	return p, nil
   237  }
   238  
   239  func parse(r io.Reader) (*extractor.Package, error) {
   240  	rd := textproto.NewReader(bufio.NewReader(r))
   241  	h, err := rd.ReadMIMEHeader()
   242  	name := h.Get("Name")
   243  	version := h.Get("version")
   244  	if name == "" || version == "" {
   245  		// In case we got name and version but also an error, we ignore the error. This can happen in
   246  		// malformed files like passlib 1.7.4.
   247  		if err != nil {
   248  			return nil, fmt.Errorf("ReadMIMEHeader(): %w %s %s", err, h.Get("Name"), h.Get("version"))
   249  		}
   250  		return nil, fmt.Errorf("Name or version is empty (name: %q, version: %q)", name, version)
   251  	}
   252  
   253  	return &extractor.Package{
   254  		Name:     name,
   255  		Version:  version,
   256  		PURLType: purl.TypePyPi,
   257  		Metadata: &PythonPackageMetadata{
   258  			Author:      h.Get("Author"),
   259  			AuthorEmail: h.Get("Author-email"),
   260  		},
   261  	}, nil
   262  }