// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package archive extracts Java archive files.
package archive

import (
	"archive/zip"
	"bytes"
	"context"
	"crypto/sha1"
	"encoding/base64"
	"errors"
	"fmt"
	"io"
	"path/filepath"
	"strings"

	"github.com/google/osv-scalibr/extractor"
	"github.com/google/osv-scalibr/extractor/filesystem"
	"github.com/google/osv-scalibr/extractor/filesystem/internal/units"
	archivemeta "github.com/google/osv-scalibr/extractor/filesystem/language/java/archive/metadata"
	"github.com/google/osv-scalibr/inventory"
	"github.com/google/osv-scalibr/log"
	"github.com/google/osv-scalibr/plugin"
	"github.com/google/osv-scalibr/purl"
	"github.com/google/osv-scalibr/stats"
	"go.uber.org/multierr"
)

const (
	// Name is the unique name of this extractor.
	Name = "java/archive"

	// defaultMaxZipDepth is the maximum number of inner zip files within an archive the default extractor will unzip.
	// Once reached, no more inner zip files will be explored during extraction.
	defaultMaxZipDepth = 16
	// defaultMaxZipBytes is the maximum number of bytes recursively read from an archive file.
	// If this limit is reached, the default extractor is halted and results so far are returned.
	defaultMaxZipBytes = 4 * units.GiB
	// defaultMinZipBytes is slightly larger than an empty zip file which is 22 bytes.
	// https://en.wikipedia.org/wiki/ZIP_(file_format)#:~:text=Viewed%20as%20an%20ASCII%20string,file%20are%20usually%20%22PK%22.
	defaultMinZipBytes = 30
)

var (
	// archiveExtensions lists the file extensions (lowercase, with leading dot)
	// that are treated as Java archives; matching is case-insensitive.
	archiveExtensions = []string{".jar", ".war", ".ear", ".jmod", ".par", ".sar", ".jpi", ".hpi", ".lpkg", ".nar"}
)

// Config is the configuration for the Extractor.
type Config struct {
	// MaxZipDepth is the maximum number of inner zip files within an archive the extractor will unzip.
	// Once reached, no more inner zip files will be explored during extraction.
	MaxZipDepth int
	// MaxFileSizeBytes is the maximum size of a file that can be extracted.
	// If this limit is greater than zero and a file is encountered that is larger
	// than this limit, the file is ignored by returning false for `FileRequired`.
	MaxFileSizeBytes int64
	// MaxOpenedBytes is the maximum number of bytes recursively read from an archive file.
	// If this limit is reached, extraction is halted and results so far are returned.
	MaxOpenedBytes int64
	// MinZipBytes is used to ignore empty zip files during extraction.
	// Zip files smaller than MinZipBytes are ignored.
	MinZipBytes int
	// ExtractFromFilename configures if JAR files should be extracted from filenames when no "pom.properties" is present.
	ExtractFromFilename bool
	// HashJars configures if JAR files should be hashed with base64(sha1()), which can be used in deps.dev.
	HashJars bool
	// Stats is a stats collector for reporting metrics. May be nil, in which
	// case no metrics are reported.
	Stats stats.Collector
}

// Extractor extracts Java packages from archive files.
85 type Extractor struct { 86 maxZipDepth int 87 maxFileSizeBytes int64 88 maxOpenedBytes int64 89 minZipBytes int 90 extractFromFilename bool 91 hashJars bool 92 stats stats.Collector 93 } 94 95 // DefaultConfig returns the default configuration for the Java archive extractor. 96 func DefaultConfig() Config { 97 return Config{ 98 MaxZipDepth: defaultMaxZipDepth, 99 MaxFileSizeBytes: 0, 100 MaxOpenedBytes: defaultMaxZipBytes, 101 MinZipBytes: defaultMinZipBytes, 102 ExtractFromFilename: true, 103 HashJars: true, 104 Stats: nil, 105 } 106 } 107 108 // New returns a Java archive extractor. 109 // 110 // For most use cases, initialize with: 111 // ``` 112 // e := New(DefaultConfig()) 113 // ``` 114 func New(cfg Config) *Extractor { 115 return &Extractor{ 116 maxZipDepth: cfg.MaxZipDepth, 117 maxFileSizeBytes: cfg.MaxFileSizeBytes, 118 maxOpenedBytes: cfg.MaxOpenedBytes, 119 minZipBytes: cfg.MinZipBytes, 120 extractFromFilename: cfg.ExtractFromFilename, 121 hashJars: cfg.HashJars, 122 stats: cfg.Stats, 123 } 124 } 125 126 // NewDefault returns an extractor with the default config settings. 127 func NewDefault() filesystem.Extractor { return New(DefaultConfig()) } 128 129 // Name of the extractor. 130 func (e Extractor) Name() string { return Name } 131 132 // Version of the extractor. 133 func (e Extractor) Version() int { return 0 } 134 135 // Requirements of the extractor. 136 func (e Extractor) Requirements() *plugin.Capabilities { return &plugin.Capabilities{} } 137 138 // FileRequired returns true if the specified file matches java archive file patterns. 
func (e Extractor) FileRequired(api filesystem.FileAPI) bool {
	path := api.Path()
	if !IsArchive(filepath.ToSlash(path)) {
		return false
	}

	// Files we cannot stat are skipped silently (no stats reported).
	fileinfo, err := api.Stat()
	if err != nil {
		return false
	}
	// A maxFileSizeBytes of zero means "no size limit".
	if e.maxFileSizeBytes > 0 && fileinfo.Size() > e.maxFileSizeBytes {
		e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultSizeLimitExceeded)
		return false
	}

	e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultOK)
	return true
}

// reportFileRequired forwards the FileRequired decision to the configured
// stats collector. It is a no-op when no collector was configured.
func (e Extractor) reportFileRequired(path string, fileSizeBytes int64, result stats.FileRequiredResult) {
	if e.stats == nil {
		return
	}
	e.stats.AfterFileRequired(e.Name(), &stats.FileRequiredStats{
		Path:          path,
		Result:        result,
		FileSizeBytes: fileSizeBytes,
	})
}

// Extract extracts java packages from archive files passed through input.
//
// Extraction starts at depth 1 with zero opened bytes and recurses into
// nested archives via extractWithMax. If a stats collector is configured,
// the outcome (including the total uncompressed bytes read) is reported.
// Packages collected so far are returned even when err is non-nil.
func (e Extractor) Extract(ctx context.Context, input *filesystem.ScanInput) (inventory.Inventory, error) {
	pkgs, openedBytes, err := e.extractWithMax(ctx, input, 1, 0)
	if e.stats != nil {
		var fileSizeBytes int64
		if input.Info != nil {
			fileSizeBytes = input.Info.Size()
		}
		e.stats.AfterFileExtracted(e.Name(), &stats.FileExtractedStats{
			Path:              input.Path,
			Result:            filesystem.ExtractorErrorToFileExtractedResult(err),
			FileSizeBytes:     fileSizeBytes,
			UncompressedBytes: openedBytes,
		})
	}
	return inventory.Inventory{Packages: pkgs}, err
}

// extractWithMax recursively unzips and extracts packages from archive files starting at input.
//
// It returns early with an error if max depth or max opened bytes is reached.
// Extracted packages are returned even if an error has occurred.
//
// Package identity is resolved with the following priority:
//  1. pom.properties entries inside the archive,
//  2. the archive filename (optionally borrowing the group ID from a single
//     MANIFEST.MF), if ExtractFromFilename is enabled,
//  3. MANIFEST.MF alone,
//  4. a placeholder "unknown" package carrying only the SHA1, if hashing
//     succeeded and nothing else matched.
//
// openedBytes accumulates the bytes read into memory across the whole
// recursion and is threaded through every call and return.
func (e Extractor) extractWithMax(ctx context.Context, input *filesystem.ScanInput, depth int, openedBytes int64) ([]*extractor.Package, int64, error) {
	// Return early if any max/min thresholds are hit.
	if depth > e.maxZipDepth {
		return nil, openedBytes, fmt.Errorf("%s reached max zip depth %d", e.Name(), depth)
	}
	// Budget check is pessimistic: it assumes the whole file will be read.
	if oBytes := openedBytes + input.Info.Size(); oBytes > e.maxOpenedBytes {
		return nil, oBytes, fmt.Errorf(
			"%w: %s reached max opened bytes of %d at %q",
			filesystem.ErrExtractorMemoryLimitExceeded, e.Name(), oBytes, input.Path)
	}
	// Near-empty zips cannot contain packages; skip without error.
	if int(input.Info.Size()) < e.minZipBytes {
		log.Warnf("%s ignoring zip with size %d because it is smaller than min size %d at %q",
			e.Name(), input.Info.Size(), e.minZipBytes, input.Path)
		return nil, openedBytes, nil
	}

	// Create ReaderAt: zip.NewReader needs random access. If the input reader
	// cannot seek, buffer the whole file in memory (counted against the byte
	// budget).
	r, ok := input.Reader.(io.ReaderAt)
	l := input.Info.Size()
	if !ok {
		log.Debugf("Reader of %s does not implement ReaderAt. Fall back to read to memory.", input.Path)
		b, err := io.ReadAll(input.Reader)
		if err != nil {
			return nil, openedBytes, fmt.Errorf("%s failed to read file: %w", e.Name(), err)
		}
		openedBytes += int64(len(b))
		// Check size again in case input.Info.Size() was not accurate. Return early if hit max.
		if openedBytes > e.maxOpenedBytes {
			return nil, openedBytes, fmt.Errorf(
				"%w: %s reached max opened bytes of %d at %q",
				filesystem.ErrExtractorMemoryLimitExceeded, e.Name(), openedBytes, input.Path)
		}
		r = bytes.NewReader(b)
		l = int64(len(b))
	}

	// Hash Jar: base64(sha1(file)), compatible with deps.dev lookups.
	// NOTE(review): the unchecked r.(io.Reader) and r.(io.Seeker) assertions
	// panic if the original input.Reader implements io.ReaderAt but not
	// io.Reader/io.Seeker — presumably all callers pass seekable readers
	// (files or bytes.Reader); confirm against ScanInput producers.
	sha1 := ""
	if e.hashJars {
		h, err := hashJar(r.(io.Reader))
		if err != nil {
			log.Errorf("HashJar(%q) err: %v", input.Path, err)
			// continue extracting even if hashing failed
		}
		if _, err := r.(io.Seeker).Seek(0, 0); err != nil {
			log.Errorf("%q: Failed to seek to the start, after hashing: %v", input.Path, err)
		}
		sha1 = h
	}

	// Unzip Jar
	zipReader, err := zip.NewReader(r, l)
	if err != nil {
		return nil, openedBytes, fmt.Errorf("%s invalid archive: %w", e.Name(), err)
	}

	log.Debugf("extract jar archive: %s", input.Path)

	// Aggregate errors while looping through files in the zip to continue extraction of other files.
	errs := []error{}
	pkgs := []*extractor.Package{}
	packagePom := []*extractor.Package{}
	packageManifest := []*extractor.Package{}

	for _, file := range zipReader.File {
		// Return if canceled or exceeding deadline.
		if errors.Is(ctx.Err(), context.DeadlineExceeded) {
			// Ignore local findings from pom and manifest, as they are incomplete.
			return pkgs, openedBytes, fmt.Errorf("%s halted due to context deadline exceeded", e.Name())
		}
		if errors.Is(ctx.Err(), context.Canceled) {
			// Ignore local findings from pom and manifest, as they are incomplete.
			return pkgs, openedBytes, fmt.Errorf("%s halted due to context was canceled", e.Name())
		}

		path := filepath.Join(input.Path, file.Name)
		switch {
		case filepath.Base(file.Name) == "pom.properties":
			pp, err := parsePomProps(file)
			if err != nil {
				log.Errorf("%s failed to extract from pom.properties at %q: %v", e.Name(), path, err)
				errs = append(errs, err)
				continue
			}
			if pp.valid() {
				packagePom = append(packagePom, &extractor.Package{
					Name:     fmt.Sprintf("%s:%s", pp.GroupID, pp.ArtifactID),
					Version:  pp.Version,
					PURLType: purl.TypeMaven,
					Metadata: &archivemeta.Metadata{
						ArtifactID: pp.ArtifactID,
						GroupID:    pp.GroupID,
						SHA1:       sha1,
					},
					Locations: []string{input.Path, path},
				})
			}

		case isManifest(file.Name):
			mf, err := parseManifest(file)
			if err != nil {
				log.Errorf("%s failed to extract from manifest.mf at %q: %v", e.Name(), path, err)
				errs = append(errs, err)
				continue
			}
			if mf.valid() {
				packageManifest = append(packageManifest, &extractor.Package{
					Name:     fmt.Sprintf("%s:%s", mf.GroupID, mf.ArtifactID),
					Version:  mf.Version,
					PURLType: purl.TypeMaven,
					Metadata: &archivemeta.Metadata{
						ArtifactID: mf.ArtifactID,
						GroupID:    mf.GroupID,
						SHA1:       sha1,
					},
					Locations: []string{input.Path, path},
				})
			}

		case IsArchive(file.Name):
			// Nested archive: recurse one level deeper, sharing the byte budget.
			// Anonymous func needed to defer f.Close().
			func() {
				f, err := file.Open()
				if err != nil {
					log.Errorf("%s failed to open file %q: %v", e.Name(), path, err)
					errs = append(errs, err)
					return
				}
				// Do not need to handle error from f.Close() because it only happens if the file was previously closed.
				defer f.Close()
				subInput := &filesystem.ScanInput{Path: path, Info: file.FileInfo(), Reader: f}
				var subPackage []*extractor.Package
				subPackage, openedBytes, err = e.extractWithMax(ctx, subInput, depth+1, openedBytes)
				// Prepend the current input path
				for i := range subPackage {
					subPackage[i].Locations = append([]string{input.Path}, subPackage[i].Locations...)
				}
				if err != nil {
					log.Errorf("%s failed to extract %q: %v", e.Name(), path, err)
					errs = append(errs, err)
					return
				}
				pkgs = append(pkgs, subPackage...)
			}()
		}
	}

	// pom.properties findings are the most authoritative and always kept.
	pkgs = append(pkgs, packagePom...)

	// If there is no pom.properties, try combining MANIFEST.MF and filename.
	packageFilename := []*extractor.Package{}
	if len(packagePom) == 0 && e.extractFromFilename {
		p := ParseFilename(input.Path)
		if p != nil {
			log.Debugf("PropsFromFilename(%q): %+v", input.Path, p)
			// All Maven packages require a group ID as part of the package name, but
			// they are usually not part of the filename of the JAR. However, for some
			// legacy packages that were created before the reverse-domain convention
			// was established, the group ID is the same as the artifact ID (e.g.
			// junit:junit or commons-httpclient:commons-httpclient). Unless we find
			// the group ID from another source, we default to assuming that the group
			// ID is the artifact ID since that is how vulnerabilities are reported
			// for these legacy packages (e.g.
			// https://github.com/advisories/GHSA-3832-9276-x7gf).
			groupID := p.ArtifactID
			if p.GroupID != "" {
				groupID = strings.ToLower(p.GroupID)
			}
			// If manifest.mf was found, use GroupID from manifest instead, if
			// present. Then remove manifest from the Package.
			if len(packageManifest) == 1 {
				metadata := packageManifest[0].Metadata.(*archivemeta.Metadata)
				if metadata.GroupID != "" {
					groupID = metadata.GroupID
					packageManifest = nil
				}
			}
			packageFilename = append(packageFilename, &extractor.Package{
				Name:     fmt.Sprintf("%s:%s", groupID, p.ArtifactID),
				Version:  p.Version,
				PURLType: purl.TypeMaven,
				Metadata: &archivemeta.Metadata{
					ArtifactID: p.ArtifactID,
					GroupID:    groupID,
					SHA1:       sha1,
				},
				Locations: []string{input.Path},
			})
		}
	}
	pkgs = append(pkgs, packageFilename...)

	// Manifest findings are only used as a last resort.
	if len(packagePom) == 0 && len(packageFilename) == 0 {
		pkgs = append(pkgs, packageManifest...)
	}

	// If nothing worked, return the hash.
	if len(pkgs) == 0 && sha1 != "" {
		pkgs = append(pkgs, &extractor.Package{
			Name:     "unknown",
			Version:  "unknown",
			PURLType: purl.TypeMaven,
			Metadata: &archivemeta.Metadata{
				ArtifactID: "unknown",
				GroupID:    "unknown",
				SHA1:       sha1,
			},
			Locations: []string{input.Path},
		})
	}

	// Aggregate errors.
	err = multierr.Combine(errs...)
	if err != nil {
		return pkgs, openedBytes, fmt.Errorf("error(s) in extractor %s: %w", e.Name(), err)
	}

	return pkgs, openedBytes, err
}

// hashJar returns base64(sha1()) of the file. This is compatible to dev.deps.
func hashJar(r io.Reader) (string, error) {
	// SHA1
	hasher := sha1.New()
	_, err := io.Copy(hasher, r)
	if err != nil {
		return "", err
	}
	h := hasher.Sum(nil)

	// Base64
	return base64.StdEncoding.EncodeToString(h), nil
}

// IsArchive returns true if the file path ends with one of the supported archive extensions.
426 func IsArchive(path string) bool { 427 ext := filepath.Ext(path) 428 for _, archiveExt := range archiveExtensions { 429 if strings.EqualFold(ext, archiveExt) { 430 return true 431 } 432 } 433 return false 434 } 435 436 func isManifest(path string) bool { 437 return strings.ToLower(filepath.Base(path)) == "manifest.mf" 438 }