github.com/google/osv-scalibr@v0.4.1/extractor/filesystem/language/python/requirements/requirements.go (about)

     1  // Copyright 2025 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package requirements extracts requirements files.
    16  package requirements
    17  
    18  import (
    19  	"bufio"
    20  	"context"
    21  	"io"
    22  	"path/filepath"
    23  	"regexp"
    24  	"strings"
    25  
    26  	"github.com/google/osv-scalibr/extractor"
    27  	"github.com/google/osv-scalibr/extractor/filesystem"
    28  	scalibrfs "github.com/google/osv-scalibr/fs"
    29  	"github.com/google/osv-scalibr/inventory"
    30  	"github.com/google/osv-scalibr/log"
    31  	"github.com/google/osv-scalibr/plugin"
    32  	"github.com/google/osv-scalibr/purl"
    33  	"github.com/google/osv-scalibr/stats"
    34  )
    35  
    36  const (
    37  	// Name is the unique name of this extractor.
    38  	Name = "python/requirements"
    39  )
    40  
    41  var (
    42  	// Regex matching comments in requirements files.
    43  	// https://github.com/pypa/pip/blob/72a32e/src/pip/_internal/req/req_file.py#L492
    44  	reComment = regexp.MustCompile(`(^|\s+)#.*$`)
    45  	// We currently don't handle the following constraints.
    46  	// * Version wildcards (*)
    47  	// * Less than (<)
    48  	// * Not equal to (!=)
    49  	// * Multiple constraints (,)
    50  	reUnsupportedConstraints        = regexp.MustCompile(`\*|<[^=]|,|!=`)
    51  	reWhitespace                    = regexp.MustCompile(`[ \t\r]`)
    52  	reValidPkg                      = regexp.MustCompile(`^\w(\w|-)+$`)
    53  	reEnvVar                        = regexp.MustCompile(`(?P<var>\$\{(?P<name>[A-Z0-9_]+)\})`)
    54  	reExtras                        = regexp.MustCompile(`\[[^\[\]]*\]`)
    55  	reTextAfterFirstOptionInclusive = regexp.MustCompile(`(?:--hash|--global-option|--config-settings|-C).*`)
    56  	reHashOption                    = regexp.MustCompile(`--hash=(.+?)(?:$|\s)`)
    57  )
    58  
// Config is the configuration for the Extractor.
type Config struct {
	// Stats is a stats collector for reporting metrics. It may be nil, in which
	// case no metrics are reported.
	Stats stats.Collector
	// MaxFileSizeBytes is the maximum file size this extractor will process. If
	// `FileRequired` gets a bigger file, it will return false. The limit is only
	// enforced when the value is greater than 0.
	MaxFileSizeBytes int64
}
    67  
    68  // DefaultConfig returns the default configuration for the extractor.
    69  func DefaultConfig() Config {
    70  	return Config{
    71  		Stats:            nil,
    72  		MaxFileSizeBytes: 0,
    73  	}
    74  }
    75  
// Extractor extracts python packages from requirements.txt files.
//
// stats is the optional collector that receives metrics (nil disables
// reporting) and maxFileSizeBytes is the size limit checked in FileRequired
// (only enforced when greater than 0).
type Extractor struct {
	stats            stats.Collector
	maxFileSizeBytes int64
}
    81  
    82  // New returns a requirements.txt extractor.
    83  //
    84  // For most use cases, initialize with:
    85  // ```
    86  // e := New(DefaultConfig())
    87  // ```
    88  func New(cfg Config) *Extractor {
    89  	return &Extractor{
    90  		stats:            cfg.Stats,
    91  		maxFileSizeBytes: cfg.MaxFileSizeBytes,
    92  	}
    93  }
    94  
    95  // NewDefault returns an extractor with the default config settings.
    96  func NewDefault() filesystem.Extractor { return New(DefaultConfig()) }
    97  
    98  // Name of the extractor.
    99  func (e Extractor) Name() string { return Name }
   100  
   101  // Version of the extractor.
   102  func (e Extractor) Version() int { return 0 }
   103  
   104  // Requirements of the extractor.
   105  func (e Extractor) Requirements() *plugin.Capabilities {
   106  	return &plugin.Capabilities{}
   107  }
   108  
   109  // FileRequired returns true if the specified file matches python Metadata file
   110  // patterns.
   111  func (e Extractor) FileRequired(api filesystem.FileAPI) bool {
   112  	path := api.Path()
   113  	if filepath.Ext(path) != ".txt" || !strings.Contains(filepath.Base(path), "requirements") {
   114  		return false
   115  	}
   116  
   117  	fileinfo, err := api.Stat()
   118  	if err != nil {
   119  		return false
   120  	}
   121  	if e.maxFileSizeBytes > 0 && fileinfo.Size() > e.maxFileSizeBytes {
   122  		e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultSizeLimitExceeded)
   123  		return false
   124  	}
   125  
   126  	e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultOK)
   127  	return true
   128  }
   129  
   130  func (e Extractor) reportFileRequired(path string, fileSizeBytes int64, result stats.FileRequiredResult) {
   131  	if e.stats == nil {
   132  		return
   133  	}
   134  	e.stats.AfterFileRequired(e.Name(), &stats.FileRequiredStats{
   135  		Path:          path,
   136  		Result:        result,
   137  		FileSizeBytes: fileSizeBytes,
   138  	})
   139  }
   140  
// pathQueue is a list of paths to requirements files that still have to be
// processed. Paths are consumed from the front and new ones are appended to
// the back.
type pathQueue []string
   142  
   143  // Extract extracts packages from requirements files passed through the scan input.
   144  func (e Extractor) Extract(ctx context.Context, input *filesystem.ScanInput) (inventory.Inventory, error) {
   145  	// Additional paths to recursive files found during extraction.
   146  	var extraPaths pathQueue
   147  	var pkgs []*extractor.Package
   148  	newRepos, newPaths, err := extractFromPath(input.Reader, input.Path)
   149  	if err != nil {
   150  		return inventory.Inventory{}, err
   151  	}
   152  	if e.stats != nil {
   153  		e.exportStats(input, err)
   154  	}
   155  	extraPaths = append(extraPaths, newPaths...)
   156  	pkgs = append(pkgs, newRepos...)
   157  
   158  	// Process all the recursive files that we found.
   159  	extraPKG := extractFromExtraPaths(input.Path, extraPaths, input.FS)
   160  	pkgs = append(pkgs, extraPKG...)
   161  
   162  	return inventory.Inventory{Packages: pkgs}, nil
   163  }
   164  
   165  func extractFromExtraPaths(initPath string, extraPaths pathQueue, fs scalibrfs.FS) []*extractor.Package {
   166  	// File paths with packages already found in this extraction.
   167  	// We store these to remove duplicates in diamond dependency cases and prevent
   168  	// infinite loops in misconfigured lockfiles with cyclical deps.
   169  	var found = map[string]bool{initPath: true}
   170  	var pkgs []*extractor.Package
   171  
   172  	for len(extraPaths) > 0 {
   173  		path := extraPaths[0]
   174  		extraPaths = extraPaths[1:]
   175  		if _, exists := found[path]; exists {
   176  			continue
   177  		}
   178  		newPKG, newPaths, err := openAndExtractFromFile(path, fs)
   179  		if err != nil {
   180  			log.Warnf("openAndExtractFromFile(%s): %v", path, err)
   181  			continue
   182  		}
   183  		found[path] = true
   184  		extraPaths = append(extraPaths, newPaths...)
   185  		for _, p := range newPKG {
   186  			// Note the path through which we refer to this requirements.txt file.
   187  			p.Locations = append([]string{initPath}, p.Locations...)
   188  		}
   189  		pkgs = append(pkgs, newPKG...)
   190  	}
   191  
   192  	return pkgs
   193  }
   194  
   195  func openAndExtractFromFile(path string, fs scalibrfs.FS) ([]*extractor.Package, pathQueue, error) {
   196  	reader, err := fs.Open(filepath.ToSlash(path))
   197  	if err != nil {
   198  		return nil, nil, err
   199  	}
   200  	defer reader.Close()
   201  	return extractFromPath(reader, path)
   202  }
   203  
   204  func extractFromPath(reader io.Reader, path string) ([]*extractor.Package, pathQueue, error) {
   205  	var pkgs []*extractor.Package
   206  	var extraPaths pathQueue
   207  	s := bufio.NewScanner(reader)
   208  	for s.Scan() {
   209  		l := readLine(s, &strings.Builder{})
   210  		// Per-requirement options may be present. We extract the --hash options, and discard the others.
   211  		l, hashOptions := splitPerRequirementOptions(l)
   212  		requirement := strings.TrimSpace(l)
   213  
   214  		l = removeWhiteSpaces(l)
   215  		l = ignorePythonSpecifier(l)
   216  		l = removeExtras(l)
   217  
   218  		if len(l) == 0 {
   219  			continue
   220  		}
   221  
   222  		// Extract paths to referenced requirements.txt files for further processing.
   223  		if after, ok := strings.CutPrefix(l, "-r"); ok {
   224  			// Path is relative to the current requirement file's dir.
   225  			extraPaths = append(extraPaths, filepath.Join(filepath.Dir(path), after))
   226  		}
   227  
   228  		if strings.HasPrefix(l, "-") {
   229  			// Global options other than -r are not implemented.
   230  			// https://pip.pypa.io/en/stable/reference/requirements-file-format/#global-options
   231  			// TODO(b/286213823): Implement metric
   232  			continue
   233  		}
   234  
   235  		name, version, comp := getLowestVersion(l)
   236  		if name == "" {
   237  			continue
   238  		}
   239  		if version == "" && comp != "" {
   240  			// Version should be non-empty if there is comparator
   241  			continue
   242  		}
   243  		if !isValidPackage(name) {
   244  			// TODO(b/286213823): Implement Metric
   245  			continue
   246  		}
   247  
   248  		pkgs = append(pkgs, &extractor.Package{
   249  			Name:      name,
   250  			Version:   version,
   251  			PURLType:  purl.TypePyPi,
   252  			Locations: []string{filepath.ToSlash(path)},
   253  			Metadata: &Metadata{
   254  				HashCheckingModeValues: hashOptions,
   255  				VersionComparator:      comp,
   256  				Requirement:            requirement,
   257  			},
   258  		})
   259  	}
   260  
   261  	return pkgs, extraPaths, s.Err()
   262  }
   263  
   264  // readLine reads a line from the scanner, removes comments and joins it with
   265  // the next line if it ends with a backslash.
   266  func readLine(scanner *bufio.Scanner, builder *strings.Builder) string {
   267  	l := scanner.Text()
   268  	l = removeComments(l)
   269  
   270  	if hasEnvVariable(l) {
   271  		// Ignore env variables
   272  		// https://github.com/pypa/pip/blob/72a32e/src/pip/_internal/req/req_file.py#L503
   273  		// TODO(b/286213823): Implement metric
   274  		return ""
   275  	}
   276  
   277  	if strings.HasSuffix(l, `\`) {
   278  		builder.WriteString(l[:len(l)-1])
   279  		scanner.Scan()
   280  		return readLine(scanner, builder)
   281  	}
   282  
   283  	builder.WriteString(l)
   284  
   285  	return builder.String()
   286  }
   287  
   288  func (e Extractor) exportStats(input *filesystem.ScanInput, err error) {
   289  	var fileSizeBytes int64
   290  	if input.Info != nil {
   291  		fileSizeBytes = input.Info.Size()
   292  	}
   293  	e.stats.AfterFileExtracted(e.Name(), &stats.FileExtractedStats{
   294  		Path:          input.Path,
   295  		Result:        filesystem.ExtractorErrorToFileExtractedResult(err),
   296  		FileSizeBytes: fileSizeBytes,
   297  	})
   298  }
   299  
   300  func nameFromRequirement(s string) string {
   301  	for _, sep := range []string{"===", "==", ">=", "<=", "~=", "!=", "<"} {
   302  		s, _, _ = strings.Cut(s, sep)
   303  	}
   304  	return s
   305  }
   306  
   307  func getLowestVersion(s string) (name, version, comparator string) {
   308  	// TODO(b/286213823): Implement metric
   309  	if reUnsupportedConstraints.FindString(s) != "" {
   310  		// Return the name so the package will be in the list for dependency resolution.
   311  		return nameFromRequirement(s), "", ""
   312  	}
   313  
   314  	t := []string{}
   315  	separators := []string{"===", "==", ">=", "<=", "~="}
   316  	comp := ""
   317  	for _, sep := range separators {
   318  		if strings.Contains(s, sep) {
   319  			t = strings.SplitN(s, sep, 2)
   320  			comp = sep
   321  			break
   322  		}
   323  	}
   324  
   325  	if len(t) == 0 {
   326  		// Length of t being 0 indicates that there is no separator.
   327  		return s, "", ""
   328  	}
   329  	if len(t) != 2 {
   330  		return "", "", ""
   331  	}
   332  
   333  	// For all other separators the lowest version is the one we found.
   334  	return t[0], t[1], comp
   335  }
   336  
   337  func removeComments(s string) string {
   338  	return reComment.ReplaceAllString(s, "")
   339  }
   340  
   341  func removeWhiteSpaces(s string) string {
   342  	return reWhitespace.ReplaceAllString(s, "")
   343  }
   344  
   345  func ignorePythonSpecifier(s string) string {
   346  	return strings.SplitN(s, ";", 2)[0]
   347  }
   348  
   349  func isValidPackage(s string) bool {
   350  	return reValidPkg.MatchString(s)
   351  }
   352  
   353  func removeExtras(s string) string {
   354  	return reExtras.ReplaceAllString(s, "")
   355  }
   356  
   357  func hasEnvVariable(s string) bool {
   358  	return reEnvVar.FindString(s) != ""
   359  }
   360  
   361  // splitPerRequirementOptions removes from the input all text after the first per requirement option
   362  // and returns the remaining input along with the values of the --hash options. See the documentation
   363  // in https://pip.pypa.io/en/stable/reference/requirements-file-format/#per-requirement-options.
   364  func splitPerRequirementOptions(s string) (string, []string) {
   365  	hashes := []string{}
   366  	for _, hashOptionMatch := range reHashOption.FindAllStringSubmatch(s, -1) {
   367  		hashes = append(hashes, hashOptionMatch[1])
   368  	}
   369  	return reTextAfterFirstOptionInclusive.ReplaceAllString(s, ""), hashes
   370  }