github.com/google/osv-scalibr@v0.4.1/extractor/filesystem/language/java/archive/archive.go (about)

     1  // Copyright 2025 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package archive extracts Java archive files.
    16  package archive
    17  
    18  import (
    19  	"archive/zip"
    20  	"bytes"
    21  	"context"
    22  	"crypto/sha1"
    23  	"encoding/base64"
    24  	"errors"
    25  	"fmt"
    26  	"io"
    27  	"path/filepath"
    28  	"strings"
    29  
    30  	"github.com/google/osv-scalibr/extractor"
    31  	"github.com/google/osv-scalibr/extractor/filesystem"
    32  	"github.com/google/osv-scalibr/extractor/filesystem/internal/units"
    33  	archivemeta "github.com/google/osv-scalibr/extractor/filesystem/language/java/archive/metadata"
    34  	"github.com/google/osv-scalibr/inventory"
    35  	"github.com/google/osv-scalibr/log"
    36  	"github.com/google/osv-scalibr/plugin"
    37  	"github.com/google/osv-scalibr/purl"
    38  	"github.com/google/osv-scalibr/stats"
    39  	"go.uber.org/multierr"
    40  )
    41  
const (
	// Name is the unique name of this extractor.
	Name = "java/archive"

	// defaultMaxZipDepth is the maximum number of inner zip files within an archive the default extractor will unzip.
	// Once reached, no more inner zip files will be explored during extraction.
	defaultMaxZipDepth = 16
	// defaultMaxZipBytes is the maximum number of bytes recursively read from an archive file.
	// If this limit is reached, the default extractor is halted and results so far are returned.
	defaultMaxZipBytes = 4 * units.GiB
	// defaultMinZipBytes is slightly larger than an empty zip file which is 22 bytes.
	// https://en.wikipedia.org/wiki/ZIP_(file_format)#:~:text=Viewed%20as%20an%20ASCII%20string,file%20are%20usually%20%22PK%22.
	defaultMinZipBytes = 30
)
    56  
var (
	// archiveExtensions lists the file extensions treated as Java archives.
	// Matching against these is case-insensitive (see IsArchive).
	archiveExtensions = []string{".jar", ".war", ".ear", ".jmod", ".par", ".sar", ".jpi", ".hpi", ".lpkg", ".nar"}
)
    60  
// Config is the configuration for the Extractor.
type Config struct {
	// MaxZipDepth is the maximum number of inner zip files within an archive the extractor will unzip.
	// Once reached, no more inner zip files will be explored during extraction.
	MaxZipDepth int
	// MaxFileSizeBytes is the maximum size of a file that can be extracted.
	// If this limit is greater than zero and a file is encountered that is larger
	// than this limit, the file is ignored by returning false for `FileRequired`.
	MaxFileSizeBytes int64
	// MaxOpenedBytes is the maximum number of bytes recursively read from an archive file.
	// If this limit is reached, extraction is halted and results so far are returned.
	MaxOpenedBytes int64
	// MinZipBytes is used to ignore empty zip files during extraction.
	// Zip files smaller than minZipBytes are ignored.
	MinZipBytes int
	// ExtractFromFilename configures if JAR files should be extracted from filenames when no "pom.properties" is present.
	ExtractFromFilename bool
	// HashJars configures if JAR files should be hashed with base64(sha1()), which can be used in deps.dev.
	HashJars bool
	// Stats is a stats collector for reporting metrics.
	Stats stats.Collector
}
    83  
// Extractor extracts Java packages from archive files.
//
// The fields mirror Config; see Config for detailed semantics of each limit.
type Extractor struct {
	maxZipDepth         int   // max recursion depth into nested archives
	maxFileSizeBytes    int64 // 0 means no per-file size limit
	maxOpenedBytes      int64 // cumulative uncompressed-bytes budget
	minZipBytes         int   // archives smaller than this are skipped
	extractFromFilename bool  // fall back to filename parsing when pom.properties is absent
	hashJars            bool  // compute base64(sha1(jar)) for each archive
	stats               stats.Collector
}
    94  
    95  // DefaultConfig returns the default configuration for the Java archive extractor.
    96  func DefaultConfig() Config {
    97  	return Config{
    98  		MaxZipDepth:         defaultMaxZipDepth,
    99  		MaxFileSizeBytes:    0,
   100  		MaxOpenedBytes:      defaultMaxZipBytes,
   101  		MinZipBytes:         defaultMinZipBytes,
   102  		ExtractFromFilename: true,
   103  		HashJars:            true,
   104  		Stats:               nil,
   105  	}
   106  }
   107  
   108  // New returns a Java archive extractor.
   109  //
   110  // For most use cases, initialize with:
   111  // ```
   112  // e := New(DefaultConfig())
   113  // ```
   114  func New(cfg Config) *Extractor {
   115  	return &Extractor{
   116  		maxZipDepth:         cfg.MaxZipDepth,
   117  		maxFileSizeBytes:    cfg.MaxFileSizeBytes,
   118  		maxOpenedBytes:      cfg.MaxOpenedBytes,
   119  		minZipBytes:         cfg.MinZipBytes,
   120  		extractFromFilename: cfg.ExtractFromFilename,
   121  		hashJars:            cfg.HashJars,
   122  		stats:               cfg.Stats,
   123  	}
   124  }
   125  
   126  // NewDefault returns an extractor with the default config settings.
   127  func NewDefault() filesystem.Extractor { return New(DefaultConfig()) }
   128  
   129  // Name of the extractor.
   130  func (e Extractor) Name() string { return Name }
   131  
   132  // Version of the extractor.
   133  func (e Extractor) Version() int { return 0 }
   134  
   135  // Requirements of the extractor.
   136  func (e Extractor) Requirements() *plugin.Capabilities { return &plugin.Capabilities{} }
   137  
   138  // FileRequired returns true if the specified file matches java archive file patterns.
   139  func (e Extractor) FileRequired(api filesystem.FileAPI) bool {
   140  	path := api.Path()
   141  	if !IsArchive(filepath.ToSlash(path)) {
   142  		return false
   143  	}
   144  
   145  	fileinfo, err := api.Stat()
   146  	if err != nil {
   147  		return false
   148  	}
   149  	if e.maxFileSizeBytes > 0 && fileinfo.Size() > e.maxFileSizeBytes {
   150  		e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultSizeLimitExceeded)
   151  		return false
   152  	}
   153  
   154  	e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultOK)
   155  	return true
   156  }
   157  
   158  func (e Extractor) reportFileRequired(path string, fileSizeBytes int64, result stats.FileRequiredResult) {
   159  	if e.stats == nil {
   160  		return
   161  	}
   162  	e.stats.AfterFileRequired(e.Name(), &stats.FileRequiredStats{
   163  		Path:          path,
   164  		Result:        result,
   165  		FileSizeBytes: fileSizeBytes,
   166  	})
   167  }
   168  
   169  // Extract extracts java packages from archive files passed through input.
   170  func (e Extractor) Extract(ctx context.Context, input *filesystem.ScanInput) (inventory.Inventory, error) {
   171  	pkgs, openedBytes, err := e.extractWithMax(ctx, input, 1, 0)
   172  	if e.stats != nil {
   173  		var fileSizeBytes int64
   174  		if input.Info != nil {
   175  			fileSizeBytes = input.Info.Size()
   176  		}
   177  		e.stats.AfterFileExtracted(e.Name(), &stats.FileExtractedStats{
   178  			Path:              input.Path,
   179  			Result:            filesystem.ExtractorErrorToFileExtractedResult(err),
   180  			FileSizeBytes:     fileSizeBytes,
   181  			UncompressedBytes: openedBytes,
   182  		})
   183  	}
   184  	return inventory.Inventory{Packages: pkgs}, err
   185  }
   186  
// extractWithMax recursively unzips and extracts packages from archive files starting at input.
//
// It returns early with an error if max depth or max opened bytes is reached.
// Extracted packages are returned even if an error has occurred.
//
// openedBytes is threaded through recursive calls so the budget is shared
// across the whole archive tree. Package sources are preferred in this order:
// pom.properties > filename (optionally enriched by MANIFEST.MF) >
// MANIFEST.MF alone > a hash-only "unknown" package.
func (e Extractor) extractWithMax(ctx context.Context, input *filesystem.ScanInput, depth int, openedBytes int64) ([]*extractor.Package, int64, error) {
	// Return early if any max/min thresholds are hit.
	if depth > e.maxZipDepth {
		return nil, openedBytes, fmt.Errorf("%s reached max zip depth %d", e.Name(), depth)
	}
	if oBytes := openedBytes + input.Info.Size(); oBytes > e.maxOpenedBytes {
		return nil, oBytes, fmt.Errorf(
			"%w: %s reached max opened bytes of %d at %q",
			filesystem.ErrExtractorMemoryLimitExceeded, e.Name(), oBytes, input.Path)
	}
	// Archives below the minimum size can't contain real entries; skip quietly.
	if int(input.Info.Size()) < e.minZipBytes {
		log.Warnf("%s ignoring zip with size %d because it is smaller than min size %d at %q",
			e.Name(), input.Info.Size(), e.minZipBytes, input.Path)
		return nil, openedBytes, nil
	}

	// Create ReaderAt (zip.NewReader needs random access). If the input reader
	// doesn't support it, buffer the whole file in memory and count those bytes
	// against the budget.
	r, ok := input.Reader.(io.ReaderAt)
	l := input.Info.Size()
	if !ok {
		log.Debugf("Reader of %s does not implement ReaderAt. Fall back to read to memory.", input.Path)
		b, err := io.ReadAll(input.Reader)
		if err != nil {
			return nil, openedBytes, fmt.Errorf("%s failed to read file: %w", e.Name(), err)
		}
		openedBytes += int64(len(b))
		// Check size again in case input.Info.Size() was not accurate. Return early if hit max.
		if openedBytes > e.maxOpenedBytes {
			return nil, openedBytes, fmt.Errorf(
				"%w: %s reached max opened bytes of %d at %q",
				filesystem.ErrExtractorMemoryLimitExceeded, e.Name(), openedBytes, input.Path)
		}
		r = bytes.NewReader(b)
		l = int64(len(b))
	}

	// Hash Jar. The hash is attached to every package emitted below.
	sha1 := ""
	if e.hashJars {
		// NOTE(review): r.(io.Reader) always holds (input.Reader is an io.Reader,
		// and bytes.Reader implements it), but r.(io.Seeker) assumes the ReaderAt
		// also seeks — bytes.Reader does; confirm filesystem-provided readers do too.
		h, err := hashJar(r.(io.Reader))
		if err != nil {
			log.Errorf("HashJar(%q) err: %v", input.Path, err)
			// continue extracting even if hashing failed
		}
		// Rewind so zip.NewReader sees the file from the start.
		if _, err := r.(io.Seeker).Seek(0, 0); err != nil {
			log.Errorf("%q: Failed to seek to the start, after hashing: %v", input.Path, err)
		}
		sha1 = h
	}

	// Unzip Jar
	zipReader, err := zip.NewReader(r, l)
	if err != nil {
		return nil, openedBytes, fmt.Errorf("%s invalid archive: %w", e.Name(), err)
	}

	log.Debugf("extract jar archive: %s", input.Path)

	// Aggregate errors while looping through files in the zip to continue extraction of other files.
	errs := []error{}
	pkgs := []*extractor.Package{}
	packagePom := []*extractor.Package{}
	packageManifest := []*extractor.Package{}

	for _, file := range zipReader.File {
		// Return if canceled or exceeding deadline.
		if errors.Is(ctx.Err(), context.DeadlineExceeded) {
			// Ignore local findings from pom and manifest, as they are incomplete.
			return pkgs, openedBytes, fmt.Errorf("%s halted due to context deadline exceeded", e.Name())
		}
		if errors.Is(ctx.Err(), context.Canceled) {
			// Ignore local findings from pom and manifest, as they are incomplete.
			return pkgs, openedBytes, fmt.Errorf("%s halted due to context was canceled", e.Name())
		}

		// Virtual path of the entry inside the archive, used in Locations.
		path := filepath.Join(input.Path, file.Name)
		switch {
		case filepath.Base(file.Name) == "pom.properties":
			// Maven metadata: the most authoritative source of group/artifact/version.
			pp, err := parsePomProps(file)
			if err != nil {
				log.Errorf("%s failed to extract from pom.properties at %q: %v", e.Name(), path, err)
				errs = append(errs, err)
				continue
			}
			if pp.valid() {
				packagePom = append(packagePom, &extractor.Package{
					Name:     fmt.Sprintf("%s:%s", pp.GroupID, pp.ArtifactID),
					Version:  pp.Version,
					PURLType: purl.TypeMaven,
					Metadata: &archivemeta.Metadata{
						ArtifactID: pp.ArtifactID,
						GroupID:    pp.GroupID,
						SHA1:       sha1,
					},
					Locations: []string{input.Path, path},
				})
			}

		case isManifest(file.Name):
			// MANIFEST.MF: secondary metadata source, used only when pom.properties
			// and filename parsing don't yield a result (see precedence below).
			mf, err := parseManifest(file)
			if err != nil {
				log.Errorf("%s failed to extract from manifest.mf at %q: %v", e.Name(), path, err)
				errs = append(errs, err)
				continue
			}
			if mf.valid() {
				packageManifest = append(packageManifest, &extractor.Package{
					Name:     fmt.Sprintf("%s:%s", mf.GroupID, mf.ArtifactID),
					Version:  mf.Version,
					PURLType: purl.TypeMaven,
					Metadata: &archivemeta.Metadata{
						ArtifactID: mf.ArtifactID,
						GroupID:    mf.GroupID,
						SHA1:       sha1,
					},
					Locations: []string{input.Path, path},
				})
			}

		case IsArchive(file.Name):
			// Nested archive (e.g. a JAR inside a WAR): recurse with depth+1.
			// Anonymous func needed to defer f.Close().
			func() {
				f, err := file.Open()
				if err != nil {
					log.Errorf("%s failed to open file  %q: %v", e.Name(), path, err)
					errs = append(errs, err)
					return
				}
				// Do not need to handle error from f.Close() because it only happens if the file was previously closed.
				defer f.Close()
				subInput := &filesystem.ScanInput{Path: path, Info: file.FileInfo(), Reader: f}
				var subPackage []*extractor.Package
				// Assigns to the outer openedBytes so the budget carries across entries.
				subPackage, openedBytes, err = e.extractWithMax(ctx, subInput, depth+1, openedBytes)
				// Prepend the current input path
				for i := range subPackage {
					subPackage[i].Locations = append([]string{input.Path}, subPackage[i].Locations...)
				}
				if err != nil {
					log.Errorf("%s failed to extract %q: %v", e.Name(), path, err)
					errs = append(errs, err)
					return
				}
				pkgs = append(pkgs, subPackage...)
			}()
		}
	}

	pkgs = append(pkgs, packagePom...)

	// If there is no pom.properties, try combining MANIFEST.MF and filename.
	packageFilename := []*extractor.Package{}
	if len(packagePom) == 0 && e.extractFromFilename {
		p := ParseFilename(input.Path)
		if p != nil {
			log.Debugf("PropsFromFilename(%q): %+v", input.Path, p)
			// All Maven packages require a group ID as part of the package name, but
			// they are usually not part of the filename of the JAR. However, for some
			// legacy packages that were created before the reverse-domain convention
			// was established, the group ID is the same as the artifact ID (e.g.
			// junit:junit or commons-httpclient:commons-httpclient). Unless we find
			// the group ID from another source, we default to assuming that the group
			// ID is the artifact ID since that is how vulnerabilities are reported
			// for these legacy packages (e.g.
			// https://github.com/advisories/GHSA-3832-9276-x7gf).
			groupID := p.ArtifactID
			if p.GroupID != "" {
				groupID = strings.ToLower(p.GroupID)
			}
			// If manifest.mf was found, use GroupID from manifest instead, if
			// present. Then remove manifest from the Package.
			if len(packageManifest) == 1 {
				metadata := packageManifest[0].Metadata.(*archivemeta.Metadata)
				if metadata.GroupID != "" {
					groupID = metadata.GroupID
					packageManifest = nil
				}
			}
			packageFilename = append(packageFilename, &extractor.Package{
				Name:     fmt.Sprintf("%s:%s", groupID, p.ArtifactID),
				Version:  p.Version,
				PURLType: purl.TypeMaven,
				Metadata: &archivemeta.Metadata{
					ArtifactID: p.ArtifactID,
					GroupID:    groupID,
					SHA1:       sha1,
				},
				Locations: []string{input.Path},
			})
		}
	}
	pkgs = append(pkgs, packageFilename...)

	// Fall back to manifest-only findings when neither pom nor filename produced any.
	if len(packagePom) == 0 && len(packageFilename) == 0 {
		pkgs = append(pkgs, packageManifest...)
	}

	// If nothing worked, return the hash.
	if len(pkgs) == 0 && sha1 != "" {
		pkgs = append(pkgs, &extractor.Package{
			Name:     "unknown",
			Version:  "unknown",
			PURLType: purl.TypeMaven,
			Metadata: &archivemeta.Metadata{
				ArtifactID: "unknown",
				GroupID:    "unknown",
				SHA1:       sha1,
			},
			Locations: []string{input.Path},
		})
	}

	// Aggregate errors.
	err = multierr.Combine(errs...)
	if err != nil {
		return pkgs, openedBytes, fmt.Errorf("error(s) in extractor %s: %w", e.Name(), err)
	}

	return pkgs, openedBytes, err
}
   410  
   411  // hashJar returns base64(sha1()) of the file. This is compatible to dev.deps.
   412  func hashJar(r io.Reader) (string, error) {
   413  	// SHA1
   414  	hasher := sha1.New()
   415  	_, err := io.Copy(hasher, r)
   416  	if err != nil {
   417  		return "", err
   418  	}
   419  	h := hasher.Sum(nil)
   420  
   421  	// Base64
   422  	return base64.StdEncoding.EncodeToString(h), nil
   423  }
   424  
   425  // IsArchive returns true if the file path ends with one of the supported archive extensions.
   426  func IsArchive(path string) bool {
   427  	ext := filepath.Ext(path)
   428  	for _, archiveExt := range archiveExtensions {
   429  		if strings.EqualFold(ext, archiveExt) {
   430  			return true
   431  		}
   432  	}
   433  	return false
   434  }
   435  
   436  func isManifest(path string) bool {
   437  	return strings.ToLower(filepath.Base(path)) == "manifest.mf"
   438  }