github.com/google/osv-scalibr@v0.4.1/extractor/filesystem/language/python/wheelegg/wheelegg.go (about) 1 // Copyright 2025 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package wheelegg extracts wheel and egg files. 16 package wheelegg 17 18 import ( 19 "archive/zip" 20 "bufio" 21 "context" 22 "errors" 23 "fmt" 24 "io" 25 "net/textproto" 26 "path/filepath" 27 "strings" 28 29 "github.com/google/osv-scalibr/extractor" 30 "github.com/google/osv-scalibr/extractor/filesystem" 31 "github.com/google/osv-scalibr/extractor/filesystem/internal/units" 32 "github.com/google/osv-scalibr/extractor/filesystem/simplefileapi" 33 scalibrfs "github.com/google/osv-scalibr/fs" 34 "github.com/google/osv-scalibr/inventory" 35 "github.com/google/osv-scalibr/plugin" 36 "github.com/google/osv-scalibr/purl" 37 "github.com/google/osv-scalibr/stats" 38 ) 39 40 const ( 41 // Name is the unique name of this extractor. 42 Name = "python/wheelegg" 43 44 // defaultMaxFileSizeBytes is the maximum file size an extractor will unmarshal. 45 // If Extract gets a bigger file, it will return an error. 46 defaultMaxFileSizeBytes = 100 * units.MiB 47 ) 48 49 // Extractor extracts python packages from wheel/egg files. 50 type Extractor struct { 51 maxFileSizeBytes int64 52 stats stats.Collector 53 } 54 55 // Config is the configuration for the Extractor. 56 type Config struct { 57 // MaxFileSizeBytes is the maximum file size this extractor will unmarshal. If 58 // `FileRequired` gets a bigger file, it will return false, 59 MaxFileSizeBytes int64 60 // Stats is a stats collector for reporting metrics. 61 Stats stats.Collector 62 } 63 64 // DefaultConfig returns the default configuration for the wheel/egg extractor. 65 func DefaultConfig() Config { 66 return Config{ 67 MaxFileSizeBytes: defaultMaxFileSizeBytes, 68 Stats: nil, 69 } 70 } 71 72 // New returns a wheel/egg extractor. 73 // 74 // For most use cases, initialize with: 75 // ``` 76 // e := New(DefaultConfig()) 77 // ``` 78 func New(cfg Config) *Extractor { 79 return &Extractor{ 80 maxFileSizeBytes: cfg.MaxFileSizeBytes, 81 stats: cfg.Stats, 82 } 83 } 84 85 // NewDefault returns an extractor with the default config settings. 86 func NewDefault() filesystem.Extractor { return New(DefaultConfig()) } 87 88 // Name of the extractor. 89 func (e Extractor) Name() string { return Name } 90 91 // Version of the extractor. 92 func (e Extractor) Version() int { return 0 } 93 94 // Requirements of the extractor. 95 func (e Extractor) Requirements() *plugin.Capabilities { return &plugin.Capabilities{} } 96 97 var ( 98 requiredFiles = []string{ 99 // Metadata format 100 "EGG-INFO/PKG-INFO", 101 ".egg-info", 102 ".egg-info/PKG-INFO", 103 ".dist-info/METADATA", 104 // zip file with Metadata files inside. 105 ".egg", 106 ".whl", 107 } 108 ) 109 110 // FileRequired returns true if the specified file matches python Metadata file 111 // patterns. 112 func (e Extractor) FileRequired(api filesystem.FileAPI) bool { 113 path := api.Path() 114 // For Windows 115 normalizedPath := filepath.ToSlash(path) 116 117 for _, r := range requiredFiles { 118 if strings.HasSuffix(normalizedPath, r) { 119 fileinfo, err := api.Stat() 120 if err != nil { 121 return false 122 } 123 124 // We only want to skip the file for being too large if it is a relevant 125 // file at all, so we check the file size after checking the file suffix. 126 if e.maxFileSizeBytes > 0 && fileinfo.Size() > e.maxFileSizeBytes { 127 e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultSizeLimitExceeded) 128 return false 129 } 130 131 e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultOK) 132 return true 133 } 134 } 135 return false 136 } 137 138 func (e Extractor) reportFileRequired(path string, fileSizeBytes int64, result stats.FileRequiredResult) { 139 if e.stats == nil { 140 return 141 } 142 e.stats.AfterFileRequired(e.Name(), &stats.FileRequiredStats{ 143 Path: path, 144 Result: result, 145 FileSizeBytes: fileSizeBytes, 146 }) 147 } 148 149 // Extract extracts packages from wheel/egg files passed through the scan input. 150 // For .egg files, input.Info.Size() is required to unzip the file. 151 func (e Extractor) Extract(ctx context.Context, input *filesystem.ScanInput) (inventory.Inventory, error) { 152 var err error 153 var pkgs []*extractor.Package 154 if strings.HasSuffix(input.Path, ".egg") || strings.HasSuffix(input.Path, ".whl") { 155 // TODO(b/280417821): In case extractZip returns no packages, we could parse the filename. 156 pkgs, err = e.extractZip(ctx, input) 157 } else { 158 var p *extractor.Package 159 if p, err = e.extractSingleFile(input.Reader, input.Path); p != nil { 160 pkgs = []*extractor.Package{p} 161 } 162 } 163 164 if e.stats != nil { 165 var fileSizeBytes int64 166 if input.Info != nil { 167 fileSizeBytes = input.Info.Size() 168 } 169 e.stats.AfterFileExtracted(e.Name(), &stats.FileExtractedStats{ 170 Path: input.Path, 171 Result: filesystem.ExtractorErrorToFileExtractedResult(err), 172 FileSizeBytes: fileSizeBytes, 173 }) 174 } 175 return inventory.Inventory{Packages: pkgs}, err 176 } 177 178 // ErrSizeNotSet will trigger when Info.Size() is not set. 179 var ErrSizeNotSet = errors.New("input.Info is nil, but should have Size set") 180 181 func (e Extractor) extractZip(ctx context.Context, input *filesystem.ScanInput) ([]*extractor.Package, error) { 182 r, err := scalibrfs.NewReaderAt(input.Reader) 183 if err != nil { 184 return nil, fmt.Errorf("newReaderAt: %w", err) 185 } 186 187 if input.Info == nil { 188 return nil, ErrSizeNotSet 189 } 190 s := input.Info.Size() 191 zr, err := zip.NewReader(r, s) 192 if err != nil { 193 return nil, fmt.Errorf("zip.NewReader: %w", err) 194 } 195 pkgs := []*extractor.Package{} 196 for _, f := range zr.File { 197 if ctx.Err() != nil { 198 return nil, ctx.Err() 199 } 200 201 if !e.FileRequired(simplefileapi.New(f.Name, f.FileInfo())) { 202 continue 203 } 204 p, err := e.openAndExtract(f, input) 205 if err != nil { 206 return pkgs, err 207 } 208 pkgs = append(pkgs, p) 209 } 210 return pkgs, nil 211 } 212 213 func (e Extractor) openAndExtract(f *zip.File, input *filesystem.ScanInput) (*extractor.Package, error) { 214 r, err := f.Open() 215 if err != nil { 216 return nil, fmt.Errorf("f.Open(%s): %w", f.Name, err) 217 } 218 defer r.Close() 219 220 // TODO(b/280438976): Store the path inside the zip file. 221 p, err := e.extractSingleFile(r, input.Path) 222 if err != nil { 223 return nil, err 224 } 225 226 return p, nil 227 } 228 229 func (e Extractor) extractSingleFile(r io.Reader, path string) (*extractor.Package, error) { 230 p, err := parse(r) 231 if err != nil { 232 return nil, fmt.Errorf("wheelegg.parse: %w", err) 233 } 234 235 p.Locations = []string{path} 236 return p, nil 237 } 238 239 func parse(r io.Reader) (*extractor.Package, error) { 240 rd := textproto.NewReader(bufio.NewReader(r)) 241 h, err := rd.ReadMIMEHeader() 242 name := h.Get("Name") 243 version := h.Get("version") 244 if name == "" || version == "" { 245 // In case we got name and version but also an error, we ignore the error. This can happen in 246 // malformed files like passlib 1.7.4. 247 if err != nil { 248 return nil, fmt.Errorf("ReadMIMEHeader(): %w %s %s", err, h.Get("Name"), h.Get("version")) 249 } 250 return nil, fmt.Errorf("Name or version is empty (name: %q, version: %q)", name, version) 251 } 252 253 return &extractor.Package{ 254 Name: name, 255 Version: version, 256 PURLType: purl.TypePyPi, 257 Metadata: &PythonPackageMetadata{ 258 Author: h.Get("Author"), 259 AuthorEmail: h.Get("Author-email"), 260 }, 261 }, nil 262 }