github.com/anchore/syft@v1.38.2/syft/pkg/cataloger/java/archive_parser.go (about)

     1  package java
     2  
     3  import (
     4  	"cmp"
     5  	"context"
     6  	"crypto"
     7  	"fmt"
     8  	"io"
     9  	"iter"
    10  	"os"
    11  	"path"
    12  	"slices"
    13  	"strings"
    14  
    15  	"github.com/scylladb/go-set/strset"
    16  	"golang.org/x/exp/maps"
    17  
    18  	"github.com/anchore/syft/internal"
    19  	intFile "github.com/anchore/syft/internal/file"
    20  	"github.com/anchore/syft/internal/log"
    21  	"github.com/anchore/syft/internal/unknown"
    22  	"github.com/anchore/syft/syft/artifact"
    23  	"github.com/anchore/syft/syft/file"
    24  	"github.com/anchore/syft/syft/pkg"
    25  	"github.com/anchore/syft/syft/pkg/cataloger/generic"
    26  	"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
    27  	"github.com/anchore/syft/syft/pkg/cataloger/java/internal/maven"
    28  )
    29  
    30  var archiveFormatGlobs = []string{
    31  	"**/*.jar",
    32  	"**/*.war",
    33  	"**/*.ear",
    34  	"**/*.par",
    35  	"**/*.sar",
    36  	"**/*.nar",
    37  	"**/*.jpi",
    38  	"**/*.hpi",
    39  	"**/*.kar",
    40  	"**/*.far",
    41  	"**/*.lpkg", // Zip-compressed package used to deploy applications
    42  	// (aka plugins) to Liferay Portal server. Those files contains .JAR(s) and a .PROPERTIES file, the latter
    43  	// has information about the application and installation requirements.
    44  	// NOTE(jonasagx): If you would like to test it with lpkg file,
    45  	// use: https://web.liferay.com/marketplace/-/mp/download/25019275/7403
    46  	// LifeRay makes it pretty cumbersome to make a such plugins; their docs are
    47  	// out of date, and they charge for their IDE. If you find an example
    48  	// project that we can build in CI feel free to include it
    49  	"**/*.rar", // Java Resource Adapter Archive
    50  }
    51  
    52  // javaArchiveHashes are all the current hash algorithms used to calculate archive digests
    53  var javaArchiveHashes = []crypto.Hash{
    54  	crypto.SHA1,
    55  }
    56  
// archiveParser holds everything needed to catalog a single java archive: where it was found, where its
// working copy lives on disk, and the configuration used while parsing it.
type archiveParser struct {
	// fileManifest is the zip file manifest built from archivePath
	fileManifest intFile.ZipFileManifest
	// location is the location of the archive as reported by the reader it was loaded from
	location     file.Location
	// archivePath is the path returned by saveArchiveToTmp that is read as a zip file
	// (presumably a temp copy of the archive; confirm against saveArchiveToTmp)
	archivePath  string
	// contentPath is the path returned by saveArchiveToTmp that nested archives are extracted against
	// (presumably a temp directory; confirm against saveArchiveToTmp)
	contentPath  string
	// fileInfo is derived from the last element of the archive's virtual path
	fileInfo     archiveFilename
	// detectNested controls whether archives nested within this archive are parsed as well
	detectNested bool
	// cfg is the cataloger configuration, also handed to the parsers of nested archives
	cfg          ArchiveCatalogerConfig
	// maven resolves maven IDs and licenses for discovered pom information
	maven        *maven.Resolver
}
    67  
// genericArchiveParserAdapter carries the archive cataloger configuration so that its methods can create
// configured archive parsers on demand.
type genericArchiveParserAdapter struct {
	cfg ArchiveCatalogerConfig
}
    71  
// newGenericArchiveParserAdapter returns an adapter that parses java archives using the given configuration.
func newGenericArchiveParserAdapter(cfg ArchiveCatalogerConfig) genericArchiveParserAdapter {
	return genericArchiveParserAdapter{cfg: cfg}
}
    75  
    76  // parseJavaArchive is a parser function for java archive contents, returning all Java libraries and nested archives
    77  func (gap genericArchiveParserAdapter) parseJavaArchive(ctx context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
    78  	return gap.processJavaArchive(ctx, reader, nil)
    79  }
    80  
    81  // processJavaArchive processes an archive for java contents, returning all Java libraries and nested archives
    82  func (gap genericArchiveParserAdapter) processJavaArchive(ctx context.Context, reader file.LocationReadCloser, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) {
    83  	parser, cleanupFn, err := newJavaArchiveParser(ctx, reader, true, gap.cfg)
    84  	// note: even on error, we should always run cleanup functions
    85  	defer cleanupFn()
    86  	if err != nil {
    87  		return nil, nil, err
    88  	}
    89  	return parser.parse(ctx, parentPkg)
    90  }
    91  
    92  // uniquePkgKey creates a unique string to identify the given package.
    93  func uniquePkgKey(groupID string, p *pkg.Package) string {
    94  	if p == nil {
    95  		return ""
    96  	}
    97  	return fmt.Sprintf("%s|%s|%s", groupID, p.Name, p.Version)
    98  }
    99  
   100  // newJavaArchiveParser returns a new java archive parser object for the given archive. Can be configured to discover
   101  // and parse nested archives or ignore them.
   102  func newJavaArchiveParser(ctx context.Context, reader file.LocationReadCloser, detectNested bool, cfg ArchiveCatalogerConfig) (*archiveParser, func(), error) {
   103  	// fetch the last element of the virtual path
   104  	virtualElements := strings.Split(reader.Path(), ":")
   105  	currentFilepath := virtualElements[len(virtualElements)-1]
   106  
   107  	contentPath, archivePath, cleanupFn, err := saveArchiveToTmp(currentFilepath, reader)
   108  	if err != nil {
   109  		return nil, cleanupFn, fmt.Errorf("unable to process java archive: %w", err)
   110  	}
   111  
   112  	fileManifest, err := intFile.NewZipFileManifest(ctx, archivePath)
   113  	if err != nil {
   114  		return nil, cleanupFn, fmt.Errorf("unable to read files from java archive: %w", err)
   115  	}
   116  
   117  	return &archiveParser{
   118  		fileManifest: fileManifest,
   119  		location:     reader.Location,
   120  		archivePath:  archivePath,
   121  		contentPath:  contentPath,
   122  		fileInfo:     newJavaArchiveFilename(currentFilepath),
   123  		detectNested: detectNested,
   124  		cfg:          cfg,
   125  		maven:        maven.NewResolver(nil, cfg.mavenConfig()),
   126  	}, cleanupFn, nil
   127  }
   128  
   129  // parse the loaded archive and return all packages found.
   130  func (j *archiveParser) parse(ctx context.Context, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) {
   131  	var pkgs []pkg.Package
   132  	var relationships []artifact.Relationship
   133  
   134  	// find the parent package from the java manifest
   135  	mainPkg, err := j.discoverMainPackage(ctx)
   136  	if err != nil {
   137  		return nil, nil, fmt.Errorf("could not generate package from %s: %w", j.location, err)
   138  	}
   139  
   140  	// find aux packages from pom.properties/pom.xml and potentially modify the existing parentPkg
   141  	// NOTE: we cannot generate sha1 digests from packages discovered via pom.properties/pom.xml
   142  	// IMPORTANT!: discoverPkgsFromAllMavenFiles may change mainPkg information, so needs to be called before SetID and before copying for relationships, etc.
   143  	auxPkgs, err := j.discoverPkgsFromAllMavenFiles(ctx, mainPkg)
   144  	if err != nil {
   145  		return nil, nil, err
   146  	}
   147  
   148  	if mainPkg != nil {
   149  		finalizePackage(mainPkg)
   150  		pkgs = append(pkgs, *mainPkg)
   151  
   152  		if parentPkg != nil {
   153  			relationships = append(relationships, artifact.Relationship{
   154  				From: *mainPkg,
   155  				To:   *parentPkg,
   156  				Type: artifact.DependencyOfRelationship,
   157  			})
   158  		}
   159  	}
   160  
   161  	for i := range auxPkgs {
   162  		auxPkg := &auxPkgs[i]
   163  
   164  		finalizePackage(auxPkg)
   165  		pkgs = append(pkgs, *auxPkg)
   166  
   167  		if mainPkg != nil {
   168  			relationships = append(relationships, artifact.Relationship{
   169  				From: *auxPkg,
   170  				To:   *mainPkg,
   171  				Type: artifact.DependencyOfRelationship,
   172  			})
   173  		}
   174  	}
   175  
   176  	var errs error
   177  	if j.detectNested {
   178  		// find nested java archive packages
   179  		nestedPkgs, nestedRelationships, err := j.discoverPkgsFromNestedArchives(ctx, mainPkg)
   180  		if err != nil {
   181  			errs = unknown.Append(errs, j.location, err)
   182  		}
   183  		pkgs = append(pkgs, nestedPkgs...)
   184  		relationships = append(relationships, nestedRelationships...)
   185  	} else {
   186  		// .jar and .war files are present in archives, are others? or generally just consider them top-level?
   187  		nestedArchives := j.fileManifest.GlobMatch(true, "**/*.jar", "**/*.war")
   188  		if len(nestedArchives) > 0 {
   189  			slices.Sort(nestedArchives)
   190  			errs = unknown.Appendf(errs, j.location, "nested archives not cataloged: %v", strings.Join(nestedArchives, ", "))
   191  		}
   192  	}
   193  
   194  	if len(pkgs) == 0 {
   195  		errs = unknown.Appendf(errs, j.location, "no package identified in archive")
   196  	}
   197  
   198  	return pkgs, relationships, errs
   199  }
   200  
   201  // finalizePackage potentially updates some package information such as classifying the package as a Jenkins plugin,
   202  // sets the PURL, and calls p.SetID()
   203  func finalizePackage(p *pkg.Package) {
   204  	if m, ok := p.Metadata.(pkg.JavaArchive); ok {
   205  		p.PURL = packageURL(p.Name, p.Version, m)
   206  
   207  		if strings.Contains(p.PURL, "io.jenkins.plugins") || strings.Contains(p.PURL, "org.jenkins-ci.plugins") {
   208  			p.Type = pkg.JenkinsPluginPkg
   209  		}
   210  	} else {
   211  		log.WithFields("package", p.String()).Debug("unable to extract java metadata to generate purl")
   212  	}
   213  
   214  	p.SetID()
   215  }
   216  
   217  // discoverMainPackage parses the root Java manifest used as the parent package to all discovered nested packages.
   218  func (j *archiveParser) discoverMainPackage(ctx context.Context) (*pkg.Package, error) {
   219  	// search and parse java manifest files
   220  	manifestMatches := j.fileManifest.GlobMatch(false, manifestGlob)
   221  	if len(manifestMatches) > 1 {
   222  		return nil, fmt.Errorf("found multiple manifests in the jar: %+v", manifestMatches)
   223  	} else if len(manifestMatches) == 0 {
   224  		// we did not find any manifests, but that may not be a problem (there may be other information to generate packages for)
   225  		return nil, nil
   226  	}
   227  
   228  	// fetch the manifest file
   229  	contents, err := intFile.ContentsFromZip(ctx, j.archivePath, manifestMatches...)
   230  	if err != nil {
   231  		return nil, fmt.Errorf("unable to extract java manifests (%s): %w", j.location, err)
   232  	}
   233  
   234  	// parse the manifest file into a rich object
   235  	manifestContents := contents[manifestMatches[0]]
   236  	manifest, err := parseJavaManifest(j.archivePath, strings.NewReader(manifestContents))
   237  	if err != nil {
   238  		log.Debugf("failed to parse java manifest (%s): %+v", j.location, err)
   239  		return nil, nil
   240  	}
   241  
   242  	// check for existence of Weave-Classes manifest key in order to exclude jars getting misrepresented as
   243  	// their targeted counterparts, e.g. newrelic spring and tomcat instrumentation
   244  	if _, ok := manifest.Main.Get("Weave-Classes"); ok {
   245  		log.Debugf("excluding archive due to Weave-Classes manifest entry: %s", j.location)
   246  		return nil, nil
   247  	}
   248  
   249  	// grab and assign digest for the entire archive
   250  	digests, err := getDigestsFromArchive(ctx, j.archivePath)
   251  	if err != nil {
   252  		return nil, err
   253  	}
   254  
   255  	name, version, lics, parsedPom := j.discoverNameVersionLicense(ctx, manifest)
   256  	var pkgPomProject *pkg.JavaPomProject
   257  	if parsedPom != nil {
   258  		pkgPomProject = newPomProject(ctx, j.maven, parsedPom.path, parsedPom.project)
   259  	}
   260  
   261  	return &pkg.Package{
   262  		// TODO: maybe select name should just have a pom properties in it?
   263  		Name:     name,
   264  		Version:  version,
   265  		Language: pkg.Java,
   266  		Licenses: pkg.NewLicenseSet(lics...),
   267  		Locations: file.NewLocationSet(
   268  			j.location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
   269  		),
   270  		Type: j.fileInfo.pkgType(),
   271  		Metadata: pkg.JavaArchive{
   272  			VirtualPath:    j.location.Path(),
   273  			Manifest:       manifest,
   274  			PomProject:     pkgPomProject,
   275  			ArchiveDigests: digests,
   276  		},
   277  	}, nil
   278  }
   279  
   280  func (j *archiveParser) discoverNameVersionLicense(ctx context.Context, manifest *pkg.JavaManifest) (string, string, []pkg.License, *parsedPomProject) {
   281  	// we use j.location because we want to associate the license declaration with where we discovered the contents in the manifest
   282  	// TODO: when we support locations of paths within archives we should start passing the specific manifest location object instead of the top jar
   283  	lics := pkg.NewLicensesFromLocationWithContext(ctx, j.location, selectLicenses(manifest)...)
   284  	/*
   285  		We should name and version from, in this order:
   286  		1. pom.properties if we find exactly 1
   287  		2. pom.xml if we find exactly 1
   288  		3. manifest
   289  		4. filename
   290  	*/
   291  	groupID, artifactID, version, parsedPom := j.discoverMainPackageFromPomInfo(ctx)
   292  	if artifactID == "" {
   293  		artifactID = selectName(manifest, j.fileInfo)
   294  	}
   295  	if version == "" {
   296  		version = selectVersion(manifest, j.fileInfo)
   297  	}
   298  
   299  	if len(lics) == 0 {
   300  		fileLicenses := j.getLicenseFromFileInArchive(ctx)
   301  		if fileLicenses != nil {
   302  			lics = append(lics, fileLicenses...)
   303  		}
   304  	}
   305  
   306  	// If we didn't find any licenses in the archive so far, we'll try again in Maven Central using groupIDFromJavaMetadata
   307  	if len(lics) == 0 {
   308  		// Today we don't have a way to distinguish between licenses from the manifest and licenses from the pom.xml
   309  		// until the file.Location object can support sub-paths (i.e. paths within archives, recursively; issue https://github.com/anchore/syft/issues/2211).
   310  		// Until then it's less confusing to use the licenses from the pom.xml only if the manifest did not list any.
   311  		lics = j.findLicenseFromJavaMetadata(ctx, groupID, artifactID, version, parsedPom, manifest)
   312  	}
   313  
   314  	return artifactID, version, lics, parsedPom
   315  }
   316  
   317  // findLicenseFromJavaMetadata attempts to find license information from all available maven metadata properties and pom info
   318  func (j *archiveParser) findLicenseFromJavaMetadata(ctx context.Context, groupID, artifactID, version string, parsedPom *parsedPomProject, manifest *pkg.JavaManifest) []pkg.License {
   319  	if groupID == "" {
   320  		if gID := groupIDFromJavaMetadata(artifactID, pkg.JavaArchive{Manifest: manifest}); gID != "" {
   321  			groupID = gID
   322  		}
   323  	}
   324  
   325  	var err error
   326  	var pomLicenses []maven.License
   327  	if parsedPom != nil {
   328  		pomLicenses, err = j.maven.ResolveLicenses(ctx, parsedPom.project)
   329  		if err != nil {
   330  			log.WithFields("error", err, "mavenID", j.maven.ResolveID(ctx, parsedPom.project)).Trace("error attempting to resolve pom licenses")
   331  		}
   332  	}
   333  
   334  	if err == nil && len(pomLicenses) == 0 {
   335  		pomLicenses, err = j.maven.FindLicenses(ctx, groupID, artifactID, version)
   336  		if err != nil {
   337  			log.WithFields("error", err, "mavenID", maven.NewID(groupID, artifactID, version)).Trace("error attempting to find licenses")
   338  		}
   339  	}
   340  
   341  	if len(pomLicenses) == 0 {
   342  		// Try removing the last part of the groupId, as sometimes it duplicates the artifactId
   343  		packages := strings.Split(groupID, ".")
   344  		groupID = strings.Join(packages[:len(packages)-1], ".")
   345  		pomLicenses, err = j.maven.FindLicenses(ctx, groupID, artifactID, version)
   346  		if err != nil {
   347  			log.WithFields("error", err, "mavenID", maven.NewID(groupID, artifactID, version)).Trace("error attempting to find sub-group licenses")
   348  		}
   349  	}
   350  
   351  	return toPkgLicenses(ctx, &j.location, pomLicenses)
   352  }
   353  
   354  func toPkgLicenses(ctx context.Context, location *file.Location, licenses []maven.License) []pkg.License {
   355  	var out []pkg.License
   356  	for _, license := range licenses {
   357  		name := ""
   358  		if license.Name != nil {
   359  			name = *license.Name
   360  		}
   361  		url := ""
   362  		if license.URL != nil {
   363  			url = *license.URL
   364  		}
   365  		// note: it is possible to:
   366  		// - have a license without a URL
   367  		// - have license and a URL
   368  		// - have a URL without a license (this is weird, but can happen)
   369  		if name == "" && url == "" {
   370  			continue
   371  		}
   372  		out = append(out, pkg.NewLicenseFromFieldsWithContext(ctx, name, url, location))
   373  	}
   374  	return out
   375  }
   376  
// parsedPomProject pairs a parsed pom.xml with the path it was read from within the archive.
type parsedPomProject struct {
	// path is the path of the pom.xml file within the archive
	path    string
	// project is the parsed pom.xml
	project *maven.Project
}
   381  
   382  // discoverMainPackageFromPomInfo attempts to resolve maven groupId, artifactId, version and other info from found pom information
   383  func (j *archiveParser) discoverMainPackageFromPomInfo(ctx context.Context) (group, name, version string, parsedPom *parsedPomProject) {
   384  	// Find the pom.properties/pom.xml if the names seem like a plausible match
   385  	properties, _ := pomPropertiesByParentPath(ctx, j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomPropertiesGlob))
   386  	projects, _ := pomProjectByParentPath(ctx, j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomXMLGlob))
   387  
   388  	artifactsMap := j.buildArtifactsMap(properties)
   389  	pomProperties, parsedPom := j.findBestPomMatch(properties, projects, artifactsMap)
   390  
   391  	parsedPom = j.handleSinglePomXML(properties, projects, parsedPom)
   392  
   393  	return j.resolveIdentity(ctx, pomProperties, parsedPom)
   394  }
   395  
   396  func (j *archiveParser) buildArtifactsMap(properties map[string]pkg.JavaPomProperties) *strset.Set {
   397  	artifactsMap := strset.New()
   398  	for _, propertiesObj := range properties {
   399  		artifactsMap.Add(propertiesObj.ArtifactID)
   400  	}
   401  	return artifactsMap
   402  }
   403  
   404  func (j *archiveParser) findBestPomMatch(properties map[string]pkg.JavaPomProperties,
   405  	projects map[string]*parsedPomProject, artifactsMap *strset.Set) (pkg.JavaPomProperties, *parsedPomProject) {
   406  	var pomProperties pkg.JavaPomProperties
   407  	var parsedPom *parsedPomProject
   408  
   409  	for parentPath, propertiesObj := range sortedIter(properties) {
   410  		if !artifactIDMatchesFilename(propertiesObj.ArtifactID, j.fileInfo.name, artifactsMap) {
   411  			continue
   412  		}
   413  
   414  		pomProperties, parsedPom = j.updateMatchIfBetter(pomProperties, parsedPom, propertiesObj, parentPath, projects)
   415  
   416  		if j.isExactMatch(propertiesObj, parsedPom) {
   417  			break
   418  		}
   419  	}
   420  
   421  	return pomProperties, parsedPom
   422  }
   423  
   424  func (j *archiveParser) updateMatchIfBetter(currentProps pkg.JavaPomProperties, currentPom *parsedPomProject,
   425  	newProps pkg.JavaPomProperties, parentPath string, projects map[string]*parsedPomProject) (pkg.JavaPomProperties, *parsedPomProject) {
   426  	// Keep the first match
   427  	if currentProps.ArtifactID == "" {
   428  		proj, hasProject := projects[parentPath]
   429  		if hasProject {
   430  			return newProps, proj
   431  		}
   432  		return newProps, currentPom
   433  	}
   434  
   435  	proj, hasProject := projects[parentPath]
   436  	if !hasProject {
   437  		return currentProps, currentPom
   438  	}
   439  
   440  	// Keep the first matching artifact with a pom.xml
   441  	if currentPom == nil {
   442  		return newProps, proj
   443  	}
   444  
   445  	// Prefer exact matches
   446  	if j.isExactMatch(newProps, proj) {
   447  		return newProps, proj
   448  	}
   449  
   450  	return currentProps, currentPom
   451  }
   452  
   453  func (j *archiveParser) isExactMatch(props pkg.JavaPomProperties, pom *parsedPomProject) bool {
   454  	if pom == nil {
   455  		return false
   456  	}
   457  	return strings.Contains(j.fileInfo.name, props.GroupID) || j.fileInfo.name == props.ArtifactID
   458  }
   459  
   460  func (j *archiveParser) handleSinglePomXML(properties map[string]pkg.JavaPomProperties,
   461  	projects map[string]*parsedPomProject, currentPom *parsedPomProject) *parsedPomProject {
   462  	if len(properties) == 0 && len(projects) == 1 {
   463  		for _, projectsObj := range projects {
   464  			return projectsObj
   465  		}
   466  	}
   467  	return currentPom
   468  }
   469  
   470  func (j *archiveParser) resolveIdentity(ctx context.Context, pomProperties pkg.JavaPomProperties,
   471  	parsedPom *parsedPomProject) (group, name, version string, pom *parsedPomProject) {
   472  	group = pomProperties.GroupID
   473  	name = pomProperties.ArtifactID
   474  	version = pomProperties.Version
   475  
   476  	if parsedPom != nil && parsedPom.project != nil {
   477  		id := j.maven.ResolveID(ctx, parsedPom.project)
   478  		if group == "" {
   479  			group = id.GroupID
   480  		}
   481  		if name == "" {
   482  			name = id.ArtifactID
   483  		}
   484  		if version == "" {
   485  			version = id.Version
   486  		}
   487  	}
   488  
   489  	return group, name, version, parsedPom
   490  }
   491  
   492  // artifactIDMatchesFilename returns true if one starts with the other
   493  func artifactIDMatchesFilename(artifactID, fileName string, artifactsMap *strset.Set) bool {
   494  	if artifactID == "" || fileName == "" {
   495  		return false
   496  	}
   497  	// Ensure true is returned when filename matches the artifact ID, prevent random retrieval by checking prefix and suffix
   498  	if artifactsMap.Has(fileName) {
   499  		return artifactID == fileName
   500  	}
   501  	// Use fallback check with suffix and prefix if no POM properties file matches the exact artifact name
   502  	return strings.HasPrefix(artifactID, fileName) || strings.HasSuffix(fileName, artifactID)
   503  }
   504  
   505  // discoverPkgsFromAllMavenFiles parses Maven POM properties/xml for a given
   506  // parent package, returning all listed Java packages found for each pom
   507  // properties discovered and potentially updating the given parentPkg with new
   508  // data.
   509  func (j *archiveParser) discoverPkgsFromAllMavenFiles(ctx context.Context, parentPkg *pkg.Package) ([]pkg.Package, error) {
   510  	if parentPkg == nil {
   511  		return nil, nil
   512  	}
   513  
   514  	var pkgs []pkg.Package
   515  
   516  	// pom.properties
   517  	properties, err := pomPropertiesByParentPath(ctx, j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomPropertiesGlob))
   518  	if err != nil {
   519  		return nil, err
   520  	}
   521  
   522  	// pom.xml
   523  	projects, err := pomProjectByParentPath(ctx, j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomXMLGlob))
   524  	if err != nil {
   525  		return nil, err
   526  	}
   527  
   528  	for parentPath, propertiesObj := range sortedIter(properties) {
   529  		var parsedPom *parsedPomProject
   530  		if proj, exists := projects[parentPath]; exists {
   531  			parsedPom = proj
   532  		}
   533  
   534  		pkgFromPom := newPackageFromMavenData(ctx, j.maven, propertiesObj, parsedPom, parentPkg, j.location)
   535  		if pkgFromPom != nil {
   536  			pkgs = append(pkgs, *pkgFromPom)
   537  		}
   538  	}
   539  
   540  	return pkgs, nil
   541  }
   542  
   543  func getDigestsFromArchive(ctx context.Context, archivePath string) ([]file.Digest, error) {
   544  	archiveCloser, err := os.Open(archivePath)
   545  	if err != nil {
   546  		return nil, fmt.Errorf("unable to open archive path (%s): %w", archivePath, err)
   547  	}
   548  	defer internal.CloseAndLogError(archiveCloser, archivePath)
   549  
   550  	// grab and assign digest for the entire archive
   551  	digests, err := intFile.NewDigestsFromFile(ctx, archiveCloser, javaArchiveHashes)
   552  	if err != nil {
   553  		log.Debugf("failed to create digest for file=%q: %+v", archivePath, err)
   554  	}
   555  
   556  	return digests, nil
   557  }
   558  
   559  func (j *archiveParser) getLicenseFromFileInArchive(ctx context.Context) []pkg.License {
   560  	// prefer identified licenses, fall back to unknown
   561  	var identified []pkg.License
   562  	var unidentified []pkg.License
   563  
   564  	for _, glob := range []string{"/META-INF/*", "/*"} {
   565  		var licenseMatches []string
   566  		for _, f := range j.fileManifest.GlobMatch(true, glob) {
   567  			if licenses.IsLicenseFile(path.Base(f)) {
   568  				licenseMatches = append(licenseMatches, f)
   569  			}
   570  		}
   571  
   572  		if len(licenseMatches) > 0 {
   573  			contents, err := intFile.ContentsFromZip(ctx, j.archivePath, licenseMatches...)
   574  			if err != nil {
   575  				log.Debugf("unable to extract java license (%s): %w", j.location, err)
   576  				continue
   577  			}
   578  
   579  			for _, licenseMatch := range licenseMatches {
   580  				licenseContents := contents[licenseMatch]
   581  				r := strings.NewReader(licenseContents)
   582  				foundLicenses := pkg.NewLicensesFromReadCloserWithContext(ctx, file.NewLocationReadCloser(j.location, io.NopCloser(r)))
   583  				for _, l := range foundLicenses {
   584  					if l.SPDXExpression != "" {
   585  						identified = append(identified, l)
   586  					} else {
   587  						unidentified = append(unidentified, l)
   588  					}
   589  				}
   590  			}
   591  
   592  			// prefer licenses found in /META-INF
   593  			if len(identified) > 0 {
   594  				break
   595  			}
   596  		}
   597  	}
   598  
   599  	if len(identified) == 0 {
   600  		return unidentified
   601  	}
   602  
   603  	return identified
   604  }
   605  
// discoverPkgsFromNestedArchives finds the Java archives nested within this archive and returns the packages
// and relationships discovered in them, passing parentPkg along as the parent of those packages.
func (j *archiveParser) discoverPkgsFromNestedArchives(ctx context.Context, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) {
	// we know that all java archives are zip formatted files, so we can use the shared zip helper
	return discoverPkgsFromZip(ctx, j.location, j.archivePath, j.contentPath, j.fileManifest, parentPkg, j.cfg)
}
   610  
   611  // discoverPkgsFromZip finds Java archives within Java archives, returning all listed Java packages found and
   612  // associating each discovered package to the given parent package.
   613  func discoverPkgsFromZip(ctx context.Context, location file.Location, archivePath, contentPath string, fileManifest intFile.ZipFileManifest, parentPkg *pkg.Package, cfg ArchiveCatalogerConfig) ([]pkg.Package, []artifact.Relationship, error) {
   614  	// search and parse pom.properties files & fetch the contents
   615  	openers, err := intFile.ExtractFromZipToUniqueTempFile(ctx, archivePath, contentPath, fileManifest.GlobMatch(false, archiveFormatGlobs...)...)
   616  	if err != nil {
   617  		return nil, nil, fmt.Errorf("unable to extract files from zip: %w", err)
   618  	}
   619  
   620  	return discoverPkgsFromOpeners(ctx, location, openers, parentPkg, cfg)
   621  }
   622  
   623  // discoverPkgsFromOpeners finds Java archives within the given files and associates them with the given parent package.
   624  func discoverPkgsFromOpeners(ctx context.Context, location file.Location, openers map[string]intFile.Opener, parentPkg *pkg.Package, cfg ArchiveCatalogerConfig) ([]pkg.Package, []artifact.Relationship, error) {
   625  	var pkgs []pkg.Package
   626  	var relationships []artifact.Relationship
   627  
   628  	for pathWithinArchive, archiveOpener := range sortedIter(openers) {
   629  		nestedPkgs, nestedRelationships, err := discoverPkgsFromOpener(ctx, location, pathWithinArchive, archiveOpener, cfg, parentPkg)
   630  		if err != nil {
   631  			log.WithFields("location", location.Path(), "error", err).Debug("unable to discover java packages from opener")
   632  			continue
   633  		}
   634  
   635  		// attach the parent package to all discovered packages that are not already associated with a java archive
   636  		for _, p := range nestedPkgs {
   637  			if metadata, ok := p.Metadata.(pkg.JavaArchive); ok {
   638  				if metadata.Parent == nil {
   639  					metadata.Parent = parentPkg
   640  				}
   641  				p.Metadata = metadata
   642  			}
   643  			pkgs = append(pkgs, p)
   644  		}
   645  
   646  		relationships = append(relationships, nestedRelationships...)
   647  	}
   648  
   649  	return pkgs, relationships, nil
   650  }
   651  
   652  // discoverPkgsFromOpener finds Java archives within the given file.
   653  func discoverPkgsFromOpener(ctx context.Context, location file.Location, pathWithinArchive string, archiveOpener intFile.Opener, cfg ArchiveCatalogerConfig, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) {
   654  	archiveReadCloser, err := archiveOpener.Open()
   655  	if err != nil {
   656  		return nil, nil, fmt.Errorf("unable to open archived file from tempdir: %w", err)
   657  	}
   658  	defer func() {
   659  		if closeErr := archiveReadCloser.Close(); closeErr != nil {
   660  			log.Debugf("unable to close archived file from tempdir: %+v", closeErr)
   661  		}
   662  	}()
   663  
   664  	nestedPath := fmt.Sprintf("%s:%s", location.Path(), pathWithinArchive)
   665  	nestedLocation := file.NewLocationFromCoordinates(location.Coordinates)
   666  	nestedLocation.AccessPath = nestedPath
   667  	gap := newGenericArchiveParserAdapter(cfg)
   668  	nestedPkgs, nestedRelationships, err := gap.processJavaArchive(ctx, file.LocationReadCloser{
   669  		Location:   nestedLocation,
   670  		ReadCloser: archiveReadCloser,
   671  	}, parentPkg)
   672  	if err != nil {
   673  		return nil, nil, fmt.Errorf("unable to process nested java archive (%s): %w", pathWithinArchive, err)
   674  	}
   675  
   676  	return nestedPkgs, nestedRelationships, nil
   677  }
   678  
   679  func pomPropertiesByParentPath(ctx context.Context, archivePath string, location file.Location, extractPaths []string) (map[string]pkg.JavaPomProperties, error) {
   680  	contentsOfMavenPropertiesFiles, err := intFile.ContentsFromZip(ctx, archivePath, extractPaths...)
   681  	if err != nil {
   682  		return nil, fmt.Errorf("unable to extract maven files: %w", err)
   683  	}
   684  
   685  	propertiesByParentPath := make(map[string]pkg.JavaPomProperties)
   686  	for filePath, fileContents := range sortedIter(contentsOfMavenPropertiesFiles) {
   687  		pomProperties, err := parsePomProperties(filePath, strings.NewReader(fileContents))
   688  		if err != nil {
   689  			log.WithFields("contents-path", filePath, "location", location.Path(), "error", err).Debug("failed to parse pom.properties")
   690  			continue
   691  		}
   692  
   693  		if pomProperties == nil {
   694  			continue
   695  		}
   696  
   697  		if pomProperties.Version == "" || pomProperties.ArtifactID == "" {
   698  			// TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. We should discover the right parent and attach the correct info accordingly to each discovered package
   699  			continue
   700  		}
   701  
   702  		propertiesByParentPath[path.Dir(filePath)] = *pomProperties
   703  	}
   704  
   705  	return propertiesByParentPath, nil
   706  }
   707  
   708  func pomProjectByParentPath(ctx context.Context, archivePath string, location file.Location, extractPaths []string) (map[string]*parsedPomProject, error) {
   709  	contentsOfMavenProjectFiles, err := intFile.ContentsFromZip(ctx, archivePath, extractPaths...)
   710  	if err != nil {
   711  		return nil, fmt.Errorf("unable to extract maven files: %w", err)
   712  	}
   713  
   714  	projectByParentPath := make(map[string]*parsedPomProject)
   715  	for filePath, fileContents := range sortedIter(contentsOfMavenProjectFiles) {
   716  		// TODO: when we support locations of paths within archives we should start passing the specific pom.xml location object instead of the top jar
   717  		pom, err := maven.ParsePomXML(strings.NewReader(fileContents))
   718  		if err != nil {
   719  			log.WithFields("contents-path", filePath, "location", location.Path(), "error", err).Debug("failed to parse pom.xml")
   720  			continue
   721  		}
   722  		if pom == nil {
   723  			continue
   724  		}
   725  
   726  		projectByParentPath[path.Dir(filePath)] = &parsedPomProject{
   727  			path:    filePath,
   728  			project: pom,
   729  		}
   730  	}
   731  	return projectByParentPath, nil
   732  }
   733  
   734  // newPackageFromMavenData processes a single Maven POM properties for a given parent package, returning all listed Java packages found and
   735  // associating each discovered package to the given parent package. Note the pom.xml is optional, the pom.properties is not.
   736  func newPackageFromMavenData(ctx context.Context, r *maven.Resolver, pomProperties pkg.JavaPomProperties, parsedPom *parsedPomProject, parentPkg *pkg.Package, location file.Location) *pkg.Package {
   737  	// keep the artifact name within the virtual path if this package does not match the parent package
   738  	vPathSuffix := ""
   739  	groupID := ""
   740  	if parentMetadata, ok := parentPkg.Metadata.(pkg.JavaArchive); ok {
   741  		groupID = groupIDFromJavaMetadata(parentPkg.Name, parentMetadata)
   742  	}
   743  
   744  	parentKey := fmt.Sprintf("%s:%s:%s", groupID, parentPkg.Name, parentPkg.Version)
   745  	// Since we don't have a package yet, it's important to use the same `field: value` association that we used when creating the parent package
   746  	// See below where Name => pomProperties.ArtifactID and Version => pomProperties.Version. We want to check for potentially nested identical
   747  	// packages and create equal virtual paths so they are de duped in the future
   748  	pomProjectKey := fmt.Sprintf("%s:%s:%s", pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version)
   749  	if parentKey != pomProjectKey {
   750  		// build a new virtual path suffix for the package that is different from the parent package
   751  		// we want to use the GroupID and ArtifactID here to preserve uniqueness
   752  		// Some packages have the same name but different group IDs (e.g. "org.glassfish.jaxb/jaxb-core", "com.sun.xml.bind/jaxb-core")
   753  		// https://github.com/anchore/syft/issues/1944
   754  		vPathSuffix += ":" + pomProperties.GroupID + ":" + pomProperties.ArtifactID
   755  	}
   756  	virtualPath := location.Path() + vPathSuffix
   757  
   758  	var pkgPomProject *pkg.JavaPomProject
   759  
   760  	var err error
   761  	var pomLicenses []maven.License
   762  	if parsedPom == nil {
   763  		// If we have no pom.xml, check maven central using pom.properties
   764  		pomLicenses, err = r.FindLicenses(ctx, pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version)
   765  	} else {
   766  		pkgPomProject = newPomProject(ctx, r, parsedPom.path, parsedPom.project)
   767  		pomLicenses, err = r.ResolveLicenses(ctx, parsedPom.project)
   768  	}
   769  
   770  	if err != nil {
   771  		log.WithFields("error", err, "mavenID", maven.NewID(pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version)).Trace("error attempting to resolve licenses")
   772  	}
   773  
   774  	licenseSet := pkg.NewLicenseSet(toPkgLicenses(ctx, &location, pomLicenses)...)
   775  
   776  	p := pkg.Package{
   777  		Name:    pomProperties.ArtifactID,
   778  		Version: pomProperties.Version,
   779  		Locations: file.NewLocationSet(
   780  			location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
   781  		),
   782  		Licenses: licenseSet,
   783  		Language: pkg.Java,
   784  		Type:     pomProperties.PkgTypeIndicated(),
   785  		Metadata: pkg.JavaArchive{
   786  			VirtualPath:   virtualPath,
   787  			PomProperties: &pomProperties,
   788  			PomProject:    pkgPomProject,
   789  			Parent:        parentPkg,
   790  		},
   791  	}
   792  
   793  	if packageIdentitiesMatch(p, parentPkg) {
   794  		updateParentPackage(p, parentPkg)
   795  		return nil
   796  	}
   797  
   798  	return &p
   799  }
   800  
   801  func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool {
   802  	metadata, ok := p.Metadata.(pkg.JavaArchive)
   803  	parentMetadata, parentOk := parentPkg.Metadata.(pkg.JavaArchive)
   804  	if !ok || !parentOk {
   805  		switch {
   806  		case !ok:
   807  			log.WithFields("package", p.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", p.Name)
   808  		default: // !parentOk
   809  			log.WithFields("package", parentPkg.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", parentPkg.Name)
   810  		}
   811  		// if we can't extract metadata, we can check for matching identities via the package name
   812  		// this is not ideal, but it's better than nothing - this should not be used if we have Metadata
   813  
   814  		return uniquePkgKey("", &p) == uniquePkgKey("", parentPkg)
   815  	}
   816  
   817  	// try to determine identity with the metadata
   818  	groupID := groupIDFromJavaMetadata(p.Name, metadata)
   819  	parentGroupID := groupIDFromJavaMetadata(parentPkg.Name, parentMetadata)
   820  	if uniquePkgKey(groupID, &p) == uniquePkgKey(parentGroupID, parentPkg) {
   821  		return true
   822  	}
   823  
   824  	// the virtual path matches...
   825  	if parentMetadata.VirtualPath == metadata.VirtualPath {
   826  		return true
   827  	}
   828  
   829  	// the pom artifactId is the parent name
   830  	// note: you CANNOT use name-is-subset-of-artifact-id or vice versa --this is too generic. Shaded jars are a good
   831  	// example of this: where the package name is "cloudbees-analytics-segment-driver" and a child is "analytics", but
   832  	// they do not indicate the same package.
   833  	// NOTE: artifactId might not be a good indicator of uniqueness since archives can contain forks with the same name
   834  	// from different groups (e.g. "org.glassfish.jaxb.jaxb-core" and "com.sun.xml.bind.jaxb-core")
   835  	// we will use this check as a last resort
   836  	if metadata.PomProperties != nil {
   837  		if metadata.PomProperties.ArtifactID != "" && parentPkg.Name == metadata.PomProperties.ArtifactID {
   838  			return true
   839  		}
   840  	}
   841  	return false
   842  }
   843  
   844  func updateParentPackage(p pkg.Package, parentPkg *pkg.Package) {
   845  	// we've run across more information about our parent package, add this info to the parent package metadata
   846  	// the pom properties is typically a better source of information for name and version than the manifest
   847  	parentPkg.Name = p.Name
   848  	parentPkg.Version = p.Version
   849  
   850  	// we may have learned more about the type via data in the pom properties
   851  	parentPkg.Type = p.Type
   852  
   853  	metadata, ok := p.Metadata.(pkg.JavaArchive)
   854  	if !ok {
   855  		return
   856  	}
   857  	pomPropertiesCopy := *metadata.PomProperties
   858  
   859  	// keep the pom properties, but don't overwrite existing pom properties
   860  	parentMetadata, ok := parentPkg.Metadata.(pkg.JavaArchive)
   861  	if ok && parentMetadata.PomProperties == nil {
   862  		parentMetadata.PomProperties = &pomPropertiesCopy
   863  		parentPkg.Metadata = parentMetadata
   864  	}
   865  }
   866  
   867  func sortedIter[K cmp.Ordered, V any](values map[K]V) iter.Seq2[K, V] {
   868  	return func(yield func(K, V) bool) {
   869  		keys := maps.Keys(values)
   870  		slices.Sort(keys)
   871  		for _, key := range keys {
   872  			if !yield(key, values[key]) {
   873  				return
   874  			}
   875  		}
   876  	}
   877  }