github.com/google/osv-scalibr@v0.4.1/extractor/filesystem/language/python/requirements/requirements.go (about) 1 // Copyright 2025 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package requirements extracts requirements files. 16 package requirements 17 18 import ( 19 "bufio" 20 "context" 21 "io" 22 "path/filepath" 23 "regexp" 24 "strings" 25 26 "github.com/google/osv-scalibr/extractor" 27 "github.com/google/osv-scalibr/extractor/filesystem" 28 scalibrfs "github.com/google/osv-scalibr/fs" 29 "github.com/google/osv-scalibr/inventory" 30 "github.com/google/osv-scalibr/log" 31 "github.com/google/osv-scalibr/plugin" 32 "github.com/google/osv-scalibr/purl" 33 "github.com/google/osv-scalibr/stats" 34 ) 35 36 const ( 37 // Name is the unique name of this extractor. 38 Name = "python/requirements" 39 ) 40 41 var ( 42 // Regex matching comments in requirements files. 43 // https://github.com/pypa/pip/blob/72a32e/src/pip/_internal/req/req_file.py#L492 44 reComment = regexp.MustCompile(`(^|\s+)#.*$`) 45 // We currently don't handle the following constraints. 46 // * Version wildcards (*) 47 // * Less than (<) 48 // * Not equal to (!=) 49 // * Multiple constraints (,) 50 reUnsupportedConstraints = regexp.MustCompile(`\*|<[^=]|,|!=`) 51 reWhitespace = regexp.MustCompile(`[ \t\r]`) 52 reValidPkg = regexp.MustCompile(`^\w(\w|-)+$`) 53 reEnvVar = regexp.MustCompile(`(?P<var>\$\{(?P<name>[A-Z0-9_]+)\})`) 54 reExtras = regexp.MustCompile(`\[[^\[\]]*\]`) 55 reTextAfterFirstOptionInclusive = regexp.MustCompile(`(?:--hash|--global-option|--config-settings|-C).*`) 56 reHashOption = regexp.MustCompile(`--hash=(.+?)(?:$|\s)`) 57 ) 58 59 // Config is the configuration for the Extractor. 60 type Config struct { 61 // Stats is a stats collector for reporting metrics. 62 Stats stats.Collector 63 // MaxFileSizeBytes is the maximum file size this extractor will unmarshal. If 64 // `FileRequired` gets a bigger file, it will return false, 65 MaxFileSizeBytes int64 66 } 67 68 // DefaultConfig returns the default configuration for the extractor. 69 func DefaultConfig() Config { 70 return Config{ 71 Stats: nil, 72 MaxFileSizeBytes: 0, 73 } 74 } 75 76 // Extractor extracts python packages from requirements.txt files. 77 type Extractor struct { 78 stats stats.Collector 79 maxFileSizeBytes int64 80 } 81 82 // New returns a requirements.txt extractor. 83 // 84 // For most use cases, initialize with: 85 // ``` 86 // e := New(DefaultConfig()) 87 // ``` 88 func New(cfg Config) *Extractor { 89 return &Extractor{ 90 stats: cfg.Stats, 91 maxFileSizeBytes: cfg.MaxFileSizeBytes, 92 } 93 } 94 95 // NewDefault returns an extractor with the default config settings. 96 func NewDefault() filesystem.Extractor { return New(DefaultConfig()) } 97 98 // Name of the extractor. 99 func (e Extractor) Name() string { return Name } 100 101 // Version of the extractor. 102 func (e Extractor) Version() int { return 0 } 103 104 // Requirements of the extractor. 105 func (e Extractor) Requirements() *plugin.Capabilities { 106 return &plugin.Capabilities{} 107 } 108 109 // FileRequired returns true if the specified file matches python Metadata file 110 // patterns. 111 func (e Extractor) FileRequired(api filesystem.FileAPI) bool { 112 path := api.Path() 113 if filepath.Ext(path) != ".txt" || !strings.Contains(filepath.Base(path), "requirements") { 114 return false 115 } 116 117 fileinfo, err := api.Stat() 118 if err != nil { 119 return false 120 } 121 if e.maxFileSizeBytes > 0 && fileinfo.Size() > e.maxFileSizeBytes { 122 e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultSizeLimitExceeded) 123 return false 124 } 125 126 e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultOK) 127 return true 128 } 129 130 func (e Extractor) reportFileRequired(path string, fileSizeBytes int64, result stats.FileRequiredResult) { 131 if e.stats == nil { 132 return 133 } 134 e.stats.AfterFileRequired(e.Name(), &stats.FileRequiredStats{ 135 Path: path, 136 Result: result, 137 FileSizeBytes: fileSizeBytes, 138 }) 139 } 140 141 type pathQueue []string 142 143 // Extract extracts packages from requirements files passed through the scan input. 144 func (e Extractor) Extract(ctx context.Context, input *filesystem.ScanInput) (inventory.Inventory, error) { 145 // Additional paths to recursive files found during extraction. 146 var extraPaths pathQueue 147 var pkgs []*extractor.Package 148 newRepos, newPaths, err := extractFromPath(input.Reader, input.Path) 149 if err != nil { 150 return inventory.Inventory{}, err 151 } 152 if e.stats != nil { 153 e.exportStats(input, err) 154 } 155 extraPaths = append(extraPaths, newPaths...) 156 pkgs = append(pkgs, newRepos...) 157 158 // Process all the recursive files that we found. 159 extraPKG := extractFromExtraPaths(input.Path, extraPaths, input.FS) 160 pkgs = append(pkgs, extraPKG...) 161 162 return inventory.Inventory{Packages: pkgs}, nil 163 } 164 165 func extractFromExtraPaths(initPath string, extraPaths pathQueue, fs scalibrfs.FS) []*extractor.Package { 166 // File paths with packages already found in this extraction. 167 // We store these to remove duplicates in diamond dependency cases and prevent 168 // infinite loops in misconfigured lockfiles with cyclical deps. 169 var found = map[string]bool{initPath: true} 170 var pkgs []*extractor.Package 171 172 for len(extraPaths) > 0 { 173 path := extraPaths[0] 174 extraPaths = extraPaths[1:] 175 if _, exists := found[path]; exists { 176 continue 177 } 178 newPKG, newPaths, err := openAndExtractFromFile(path, fs) 179 if err != nil { 180 log.Warnf("openAndExtractFromFile(%s): %v", path, err) 181 continue 182 } 183 found[path] = true 184 extraPaths = append(extraPaths, newPaths...) 185 for _, p := range newPKG { 186 // Note the path through which we refer to this requirements.txt file. 187 p.Locations = append([]string{initPath}, p.Locations...) 188 } 189 pkgs = append(pkgs, newPKG...) 190 } 191 192 return pkgs 193 } 194 195 func openAndExtractFromFile(path string, fs scalibrfs.FS) ([]*extractor.Package, pathQueue, error) { 196 reader, err := fs.Open(filepath.ToSlash(path)) 197 if err != nil { 198 return nil, nil, err 199 } 200 defer reader.Close() 201 return extractFromPath(reader, path) 202 } 203 204 func extractFromPath(reader io.Reader, path string) ([]*extractor.Package, pathQueue, error) { 205 var pkgs []*extractor.Package 206 var extraPaths pathQueue 207 s := bufio.NewScanner(reader) 208 for s.Scan() { 209 l := readLine(s, &strings.Builder{}) 210 // Per-requirement options may be present. We extract the --hash options, and discard the others. 211 l, hashOptions := splitPerRequirementOptions(l) 212 requirement := strings.TrimSpace(l) 213 214 l = removeWhiteSpaces(l) 215 l = ignorePythonSpecifier(l) 216 l = removeExtras(l) 217 218 if len(l) == 0 { 219 continue 220 } 221 222 // Extract paths to referenced requirements.txt files for further processing. 223 if after, ok := strings.CutPrefix(l, "-r"); ok { 224 // Path is relative to the current requirement file's dir. 225 extraPaths = append(extraPaths, filepath.Join(filepath.Dir(path), after)) 226 } 227 228 if strings.HasPrefix(l, "-") { 229 // Global options other than -r are not implemented. 230 // https://pip.pypa.io/en/stable/reference/requirements-file-format/#global-options 231 // TODO(b/286213823): Implement metric 232 continue 233 } 234 235 name, version, comp := getLowestVersion(l) 236 if name == "" { 237 continue 238 } 239 if version == "" && comp != "" { 240 // Version should be non-empty if there is comparator 241 continue 242 } 243 if !isValidPackage(name) { 244 // TODO(b/286213823): Implement Metric 245 continue 246 } 247 248 pkgs = append(pkgs, &extractor.Package{ 249 Name: name, 250 Version: version, 251 PURLType: purl.TypePyPi, 252 Locations: []string{filepath.ToSlash(path)}, 253 Metadata: &Metadata{ 254 HashCheckingModeValues: hashOptions, 255 VersionComparator: comp, 256 Requirement: requirement, 257 }, 258 }) 259 } 260 261 return pkgs, extraPaths, s.Err() 262 } 263 264 // readLine reads a line from the scanner, removes comments and joins it with 265 // the next line if it ends with a backslash. 266 func readLine(scanner *bufio.Scanner, builder *strings.Builder) string { 267 l := scanner.Text() 268 l = removeComments(l) 269 270 if hasEnvVariable(l) { 271 // Ignore env variables 272 // https://github.com/pypa/pip/blob/72a32e/src/pip/_internal/req/req_file.py#L503 273 // TODO(b/286213823): Implement metric 274 return "" 275 } 276 277 if strings.HasSuffix(l, `\`) { 278 builder.WriteString(l[:len(l)-1]) 279 scanner.Scan() 280 return readLine(scanner, builder) 281 } 282 283 builder.WriteString(l) 284 285 return builder.String() 286 } 287 288 func (e Extractor) exportStats(input *filesystem.ScanInput, err error) { 289 var fileSizeBytes int64 290 if input.Info != nil { 291 fileSizeBytes = input.Info.Size() 292 } 293 e.stats.AfterFileExtracted(e.Name(), &stats.FileExtractedStats{ 294 Path: input.Path, 295 Result: filesystem.ExtractorErrorToFileExtractedResult(err), 296 FileSizeBytes: fileSizeBytes, 297 }) 298 } 299 300 func nameFromRequirement(s string) string { 301 for _, sep := range []string{"===", "==", ">=", "<=", "~=", "!=", "<"} { 302 s, _, _ = strings.Cut(s, sep) 303 } 304 return s 305 } 306 307 func getLowestVersion(s string) (name, version, comparator string) { 308 // TODO(b/286213823): Implement metric 309 if reUnsupportedConstraints.FindString(s) != "" { 310 // Return the name so the package will be in the list for dependency resolution. 311 return nameFromRequirement(s), "", "" 312 } 313 314 t := []string{} 315 separators := []string{"===", "==", ">=", "<=", "~="} 316 comp := "" 317 for _, sep := range separators { 318 if strings.Contains(s, sep) { 319 t = strings.SplitN(s, sep, 2) 320 comp = sep 321 break 322 } 323 } 324 325 if len(t) == 0 { 326 // Length of t being 0 indicates that there is no separator. 327 return s, "", "" 328 } 329 if len(t) != 2 { 330 return "", "", "" 331 } 332 333 // For all other separators the lowest version is the one we found. 334 return t[0], t[1], comp 335 } 336 337 func removeComments(s string) string { 338 return reComment.ReplaceAllString(s, "") 339 } 340 341 func removeWhiteSpaces(s string) string { 342 return reWhitespace.ReplaceAllString(s, "") 343 } 344 345 func ignorePythonSpecifier(s string) string { 346 return strings.SplitN(s, ";", 2)[0] 347 } 348 349 func isValidPackage(s string) bool { 350 return reValidPkg.MatchString(s) 351 } 352 353 func removeExtras(s string) string { 354 return reExtras.ReplaceAllString(s, "") 355 } 356 357 func hasEnvVariable(s string) bool { 358 return reEnvVar.FindString(s) != "" 359 } 360 361 // splitPerRequirementOptions removes from the input all text after the first per requirement option 362 // and returns the remaining input along with the values of the --hash options. See the documentation 363 // in https://pip.pypa.io/en/stable/reference/requirements-file-format/#per-requirement-options. 364 func splitPerRequirementOptions(s string) (string, []string) { 365 hashes := []string{} 366 for _, hashOptionMatch := range reHashOption.FindAllStringSubmatch(s, -1) { 367 hashes = append(hashes, hashOptionMatch[1]) 368 } 369 return reTextAfterFirstOptionInclusive.ReplaceAllString(s, ""), hashes 370 }