github.com/anchore/syft@v1.4.2-0.20240516191711-1bec1fc5d397/syft/pkg/cataloger/java/archive_parser.go (about)

     1  package java
     2  
import (
	"context"
	"crypto"
	"fmt"
	"os"
	"path"
	"sort"
	"strings"

	intFile "github.com/anchore/syft/internal/file"
	"github.com/anchore/syft/internal/licenses"
	"github.com/anchore/syft/internal/log"
	"github.com/anchore/syft/syft/artifact"
	"github.com/anchore/syft/syft/file"
	"github.com/anchore/syft/syft/pkg"
	"github.com/anchore/syft/syft/pkg/cataloger/generic"
)
    19  
// archiveFormatGlobs holds the glob patterns for the file extensions handled as Java archives.
// Within this file the patterns select the nested archives to extract from an enclosing archive.
var archiveFormatGlobs = []string{
	"**/*.jar",
	"**/*.war",
	"**/*.ear",
	"**/*.par",
	"**/*.sar",
	"**/*.nar",
	"**/*.jpi",
	"**/*.hpi",
	"**/*.lpkg", // Zip-compressed package used to deploy applications
	// (aka plugins) to a Liferay Portal server. These files contain .JAR(s) and a .PROPERTIES file; the latter
	// has information about the application and installation requirements.
	// NOTE(jonasagx): If you would like to test it with an lpkg file,
	// use: https://web.liferay.com/marketplace/-/mp/download/25019275/7403
	// Liferay makes it pretty cumbersome to make such plugins; their docs are
	// out of date, and they charge for their IDE. If you find an example
	// project that we can build in CI, feel free to include it.
}
    38  
// javaArchiveHashes lists the hash algorithms used to calculate digests over a whole archive file
// (currently only SHA-1).
var javaArchiveHashes = []crypto.Hash{
	crypto.SHA1,
}
    43  
// archiveParser carries the state needed to catalog a single Java archive.
type archiveParser struct {
	// fileManifest is the zip file manifest built from archivePath.
	fileManifest intFile.ZipFileManifest
	// location is the location of the archive as handed in by the caller's reader.
	location file.Location
	// archivePath is the archive path returned by saveArchiveToTmp; it is the path opened for all zip reads.
	archivePath string
	// contentPath is the content path returned by saveArchiveToTmp; nested archives are extracted relative to it.
	// NOTE(review): presumably a temp directory next to the saved archive — confirm in saveArchiveToTmp.
	contentPath string
	// fileInfo is derived from the last element of the (":"-separated) virtual path of the archive.
	fileInfo archiveFilename
	// detectNested controls whether nested archives are discovered and parsed as well.
	detectNested bool
	// cfg is the cataloger configuration (e.g. whether network lookups are allowed).
	cfg ArchiveCatalogerConfig
}
    53  
// genericArchiveParserAdapter binds an ArchiveCatalogerConfig to the parseJavaArchive parser function.
type genericArchiveParserAdapter struct {
	cfg ArchiveCatalogerConfig
}
    57  
    58  func newGenericArchiveParserAdapter(cfg ArchiveCatalogerConfig) genericArchiveParserAdapter {
    59  	return genericArchiveParserAdapter{cfg: cfg}
    60  }
    61  
    62  // parseJavaArchive is a parser function for java archive contents, returning all Java libraries and nested archives.
    63  func (gap genericArchiveParserAdapter) parseJavaArchive(ctx context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
    64  	parser, cleanupFn, err := newJavaArchiveParser(reader, true, gap.cfg)
    65  	// note: even on error, we should always run cleanup functions
    66  	defer cleanupFn()
    67  	if err != nil {
    68  		return nil, nil, err
    69  	}
    70  	return parser.parse(ctx)
    71  }
    72  
    73  // uniquePkgKey creates a unique string to identify the given package.
    74  func uniquePkgKey(groupID string, p *pkg.Package) string {
    75  	if p == nil {
    76  		return ""
    77  	}
    78  	return fmt.Sprintf("%s|%s|%s", groupID, p.Name, p.Version)
    79  }
    80  
    81  // newJavaArchiveParser returns a new java archive parser object for the given archive. Can be configured to discover
    82  // and parse nested archives or ignore them.
    83  func newJavaArchiveParser(reader file.LocationReadCloser, detectNested bool, cfg ArchiveCatalogerConfig) (*archiveParser, func(), error) {
    84  	// fetch the last element of the virtual path
    85  	virtualElements := strings.Split(reader.Path(), ":")
    86  	currentFilepath := virtualElements[len(virtualElements)-1]
    87  
    88  	contentPath, archivePath, cleanupFn, err := saveArchiveToTmp(currentFilepath, reader)
    89  	if err != nil {
    90  		return nil, cleanupFn, fmt.Errorf("unable to process java archive: %w", err)
    91  	}
    92  
    93  	fileManifest, err := intFile.NewZipFileManifest(archivePath)
    94  	if err != nil {
    95  		return nil, cleanupFn, fmt.Errorf("unable to read files from java archive: %w", err)
    96  	}
    97  
    98  	return &archiveParser{
    99  		fileManifest: fileManifest,
   100  		location:     reader.Location,
   101  		archivePath:  archivePath,
   102  		contentPath:  contentPath,
   103  		fileInfo:     newJavaArchiveFilename(currentFilepath),
   104  		detectNested: detectNested,
   105  		cfg:          cfg,
   106  	}, cleanupFn, nil
   107  }
   108  
   109  // parse the loaded archive and return all packages found.
   110  func (j *archiveParser) parse(ctx context.Context) ([]pkg.Package, []artifact.Relationship, error) {
   111  	var pkgs []pkg.Package
   112  	var relationships []artifact.Relationship
   113  
   114  	// find the parent package from the java manifest
   115  	parentPkg, err := j.discoverMainPackage(ctx)
   116  	if err != nil {
   117  		return nil, nil, fmt.Errorf("could not generate package from %s: %w", j.location, err)
   118  	}
   119  
   120  	// find aux packages from pom.properties/pom.xml and potentially modify the existing parentPkg
   121  	// NOTE: we cannot generate sha1 digests from packages discovered via pom.properties/pom.xml
   122  	auxPkgs, err := j.discoverPkgsFromAllMavenFiles(ctx, parentPkg)
   123  	if err != nil {
   124  		return nil, nil, err
   125  	}
   126  	pkgs = append(pkgs, auxPkgs...)
   127  
   128  	if j.detectNested {
   129  		// find nested java archive packages
   130  		nestedPkgs, nestedRelationships, err := j.discoverPkgsFromNestedArchives(ctx, parentPkg)
   131  		if err != nil {
   132  			return nil, nil, err
   133  		}
   134  		pkgs = append(pkgs, nestedPkgs...)
   135  		relationships = append(relationships, nestedRelationships...)
   136  	}
   137  
   138  	// lastly, add the parent package to the list (assuming the parent exists)
   139  	if parentPkg != nil {
   140  		pkgs = append([]pkg.Package{*parentPkg}, pkgs...)
   141  	}
   142  
   143  	// add pURLs to all packages found
   144  	// note: since package information may change after initial creation when parsing multiple locations within the
   145  	// jar, we wait until the conclusion of the parsing process before synthesizing pURLs.
   146  	for i := range pkgs {
   147  		p := &pkgs[i]
   148  		if m, ok := p.Metadata.(pkg.JavaArchive); ok {
   149  			p.PURL = packageURL(p.Name, p.Version, m)
   150  		} else {
   151  			log.WithFields("package", p.String()).Warn("unable to extract java metadata to generate purl")
   152  		}
   153  		p.SetID()
   154  	}
   155  
   156  	return pkgs, relationships, nil
   157  }
   158  
   159  // discoverMainPackage parses the root Java manifest used as the parent package to all discovered nested packages.
   160  func (j *archiveParser) discoverMainPackage(ctx context.Context) (*pkg.Package, error) {
   161  	// search and parse java manifest files
   162  	manifestMatches := j.fileManifest.GlobMatch(false, manifestGlob)
   163  	if len(manifestMatches) > 1 {
   164  		return nil, fmt.Errorf("found multiple manifests in the jar: %+v", manifestMatches)
   165  	} else if len(manifestMatches) == 0 {
   166  		// we did not find any manifests, but that may not be a problem (there may be other information to generate packages for)
   167  		return nil, nil
   168  	}
   169  
   170  	// fetch the manifest file
   171  	contents, err := intFile.ContentsFromZip(j.archivePath, manifestMatches...)
   172  	if err != nil {
   173  		return nil, fmt.Errorf("unable to extract java manifests (%s): %w", j.location, err)
   174  	}
   175  
   176  	// parse the manifest file into a rich object
   177  	manifestContents := contents[manifestMatches[0]]
   178  	manifest, err := parseJavaManifest(j.archivePath, strings.NewReader(manifestContents))
   179  	if err != nil {
   180  		log.Warnf("failed to parse java manifest (%s): %+v", j.location, err)
   181  		return nil, nil
   182  	}
   183  
   184  	// check for existence of Weave-Classes manifest key in order to exclude jars getting misrepresented as
   185  	// their targeted counterparts, e.g. newrelic spring and tomcat instrumentation
   186  	if _, ok := manifest.Main.Get("Weave-Classes"); ok {
   187  		log.Debugf("excluding archive due to Weave-Classes manifest entry: %s", j.location)
   188  		return nil, nil
   189  	}
   190  
   191  	// grab and assign digest for the entire archive
   192  	digests, err := getDigestsFromArchive(j.archivePath)
   193  	if err != nil {
   194  		return nil, err
   195  	}
   196  
   197  	licenses, name, version, err := j.parseLicenses(ctx, manifest)
   198  	if err != nil {
   199  		return nil, err
   200  	}
   201  
   202  	return &pkg.Package{
   203  		// TODO: maybe select name should just have a pom properties in it?
   204  		Name:     name,
   205  		Version:  version,
   206  		Language: pkg.Java,
   207  		Licenses: pkg.NewLicenseSet(licenses...),
   208  		Locations: file.NewLocationSet(
   209  			j.location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
   210  		),
   211  		Type: j.fileInfo.pkgType(),
   212  		Metadata: pkg.JavaArchive{
   213  			VirtualPath:    j.location.Path(),
   214  			Manifest:       manifest,
   215  			ArchiveDigests: digests,
   216  		},
   217  	}, nil
   218  }
   219  
   220  func (j *archiveParser) parseLicenses(ctx context.Context, manifest *pkg.JavaManifest) ([]pkg.License, string, string, error) {
   221  	// we use j.location because we want to associate the license declaration with where we discovered the contents in the manifest
   222  	// TODO: when we support locations of paths within archives we should start passing the specific manifest location object instead of the top jar
   223  	licenses := pkg.NewLicensesFromLocation(j.location, selectLicenses(manifest)...)
   224  	/*
   225  		We should name and version from, in this order:
   226  		1. pom.properties if we find exactly 1
   227  		2. pom.xml if we find exactly 1
   228  		3. manifest
   229  		4. filename
   230  	*/
   231  	name, version, pomLicenses := j.guessMainPackageNameAndVersionFromPomInfo(ctx)
   232  	if name == "" {
   233  		name = selectName(manifest, j.fileInfo)
   234  	}
   235  	if version == "" {
   236  		version = selectVersion(manifest, j.fileInfo)
   237  	}
   238  	if len(licenses) == 0 {
   239  		// Today we don't have a way to distinguish between licenses from the manifest and licenses from the pom.xml
   240  		// until the file.Location object can support sub-paths (i.e. paths within archives, recursively; issue https://github.com/anchore/syft/issues/2211).
   241  		// Until then it's less confusing to use the licenses from the pom.xml only if the manifest did not list any.
   242  		licenses = append(licenses, pomLicenses...)
   243  	}
   244  
   245  	if len(licenses) == 0 {
   246  		fileLicenses, err := j.getLicenseFromFileInArchive()
   247  		if err != nil {
   248  			return nil, "", "", err
   249  		}
   250  		if fileLicenses != nil {
   251  			licenses = append(licenses, fileLicenses...)
   252  		}
   253  	}
   254  
   255  	// If we didn't find any licenses in the archive so far, we'll try again in Maven Central using groupIDFromJavaMetadata
   256  	if len(licenses) == 0 && j.cfg.UseNetwork {
   257  		licenses = findLicenseFromJavaMetadata(ctx, name, manifest, version, j, licenses)
   258  	}
   259  
   260  	return licenses, name, version, nil
   261  }
   262  
   263  func findLicenseFromJavaMetadata(ctx context.Context, name string, manifest *pkg.JavaManifest, version string, j *archiveParser, licenses []pkg.License) []pkg.License {
   264  	var groupID = name
   265  	if gID := groupIDFromJavaMetadata(name, pkg.JavaArchive{Manifest: manifest}); gID != "" {
   266  		groupID = gID
   267  	}
   268  	pomLicenses := recursivelyFindLicensesFromParentPom(ctx, groupID, name, version, j.cfg)
   269  
   270  	if len(pomLicenses) == 0 {
   271  		// Try removing the last part of the groupId, as sometimes it duplicates the artifactId
   272  		packages := strings.Split(groupID, ".")
   273  		groupID = strings.Join(packages[:len(packages)-1], ".")
   274  		pomLicenses = recursivelyFindLicensesFromParentPom(ctx, groupID, name, version, j.cfg)
   275  	}
   276  
   277  	if len(pomLicenses) > 0 {
   278  		pkgLicenses := pkg.NewLicensesFromLocation(j.location, pomLicenses...)
   279  		if pkgLicenses != nil {
   280  			licenses = append(licenses, pkgLicenses...)
   281  		}
   282  	}
   283  	return licenses
   284  }
   285  
// parsedPomProject pairs the project information parsed from a pom.xml with the licenses associated with it.
type parsedPomProject struct {
	*pkg.JavaPomProject
	// Licenses holds the licenses for the project; findPomLicenses may append licenses found for the parent pom.
	Licenses []pkg.License
}
   290  
   291  func (j *archiveParser) guessMainPackageNameAndVersionFromPomInfo(ctx context.Context) (name, version string, licenses []pkg.License) {
   292  	pomPropertyMatches := j.fileManifest.GlobMatch(false, pomPropertiesGlob)
   293  	pomMatches := j.fileManifest.GlobMatch(false, pomXMLGlob)
   294  	var pomPropertiesObject pkg.JavaPomProperties
   295  	var pomProjectObject *parsedPomProject
   296  
   297  	// Find the pom.properties/pom.xml if the names seem like a plausible match
   298  	properties, _ := pomPropertiesByParentPath(j.archivePath, j.location, pomPropertyMatches)
   299  	projects, _ := pomProjectByParentPath(j.archivePath, j.location, pomMatches)
   300  
   301  	for parentPath, propertiesObj := range properties {
   302  		if artifactIDMatchesFilename(propertiesObj.ArtifactID, j.fileInfo.name) {
   303  			pomPropertiesObject = propertiesObj
   304  			if proj, exists := projects[parentPath]; exists {
   305  				pomProjectObject = proj
   306  				break
   307  			}
   308  		}
   309  	}
   310  
   311  	name = pomPropertiesObject.ArtifactID
   312  	if name == "" && pomProjectObject != nil {
   313  		name = pomProjectObject.ArtifactID
   314  	}
   315  	version = pomPropertiesObject.Version
   316  	if version == "" && pomProjectObject != nil {
   317  		version = pomProjectObject.Version
   318  	}
   319  	if j.cfg.UseNetwork {
   320  		if pomProjectObject == nil {
   321  			// If we have no pom.xml, check maven central using pom.properties
   322  			parentLicenses := recursivelyFindLicensesFromParentPom(ctx, pomPropertiesObject.GroupID, pomPropertiesObject.ArtifactID, pomPropertiesObject.Version, j.cfg)
   323  			if len(parentLicenses) > 0 {
   324  				for _, licenseName := range parentLicenses {
   325  					licenses = append(licenses, pkg.NewLicenseFromFields(licenseName, "", nil))
   326  				}
   327  			}
   328  		} else {
   329  			findPomLicenses(ctx, pomProjectObject, j.cfg)
   330  		}
   331  	}
   332  
   333  	if pomProjectObject != nil {
   334  		licenses = pomProjectObject.Licenses
   335  	}
   336  
   337  	return name, version, licenses
   338  }
   339  
   340  func artifactIDMatchesFilename(artifactID, fileName string) bool {
   341  	if artifactID == "" || fileName == "" {
   342  		return false
   343  	}
   344  	return strings.HasPrefix(artifactID, fileName) || strings.HasSuffix(fileName, artifactID)
   345  }
   346  
   347  func findPomLicenses(ctx context.Context, pomProjectObject *parsedPomProject, cfg ArchiveCatalogerConfig) {
   348  	// If we don't have any licenses until now, and if we have a parent Pom, then we'll check the parent pom in maven central for licenses.
   349  	if pomProjectObject != nil && pomProjectObject.Parent != nil && len(pomProjectObject.Licenses) == 0 {
   350  		parentLicenses := recursivelyFindLicensesFromParentPom(
   351  			ctx,
   352  			pomProjectObject.Parent.GroupID,
   353  			pomProjectObject.Parent.ArtifactID,
   354  			pomProjectObject.Parent.Version,
   355  			cfg)
   356  
   357  		if len(parentLicenses) > 0 {
   358  			for _, licenseName := range parentLicenses {
   359  				pomProjectObject.Licenses = append(pomProjectObject.Licenses, pkg.NewLicenseFromFields(licenseName, "", nil))
   360  			}
   361  		}
   362  	}
   363  }
   364  
   365  // discoverPkgsFromAllMavenFiles parses Maven POM properties/xml for a given
   366  // parent package, returning all listed Java packages found for each pom
   367  // properties discovered and potentially updating the given parentPkg with new
   368  // data.
   369  func (j *archiveParser) discoverPkgsFromAllMavenFiles(ctx context.Context, parentPkg *pkg.Package) ([]pkg.Package, error) {
   370  	if parentPkg == nil {
   371  		return nil, nil
   372  	}
   373  
   374  	var pkgs []pkg.Package
   375  
   376  	// pom.properties
   377  	properties, err := pomPropertiesByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomPropertiesGlob))
   378  	if err != nil {
   379  		return nil, err
   380  	}
   381  
   382  	// pom.xml
   383  	projects, err := pomProjectByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomXMLGlob))
   384  	if err != nil {
   385  		return nil, err
   386  	}
   387  
   388  	for parentPath, propertiesObj := range properties {
   389  		var pomProject *parsedPomProject
   390  		if proj, exists := projects[parentPath]; exists {
   391  			pomProject = proj
   392  		}
   393  
   394  		pkgFromPom := newPackageFromMavenData(ctx, propertiesObj, pomProject, parentPkg, j.location, j.cfg)
   395  		if pkgFromPom != nil {
   396  			pkgs = append(pkgs, *pkgFromPom)
   397  		}
   398  	}
   399  
   400  	return pkgs, nil
   401  }
   402  
   403  func getDigestsFromArchive(archivePath string) ([]file.Digest, error) {
   404  	archiveCloser, err := os.Open(archivePath)
   405  	if err != nil {
   406  		return nil, fmt.Errorf("unable to open archive path (%s): %w", archivePath, err)
   407  	}
   408  	defer archiveCloser.Close()
   409  
   410  	// grab and assign digest for the entire archive
   411  	digests, err := intFile.NewDigestsFromFile(archiveCloser, javaArchiveHashes)
   412  	if err != nil {
   413  		log.Warnf("failed to create digest for file=%q: %+v", archivePath, err)
   414  	}
   415  
   416  	return digests, nil
   417  }
   418  
   419  func (j *archiveParser) getLicenseFromFileInArchive() ([]pkg.License, error) {
   420  	var fileLicenses []pkg.License
   421  	for _, filename := range licenses.FileNames() {
   422  		licenseMatches := j.fileManifest.GlobMatch(true, "/META-INF/"+filename)
   423  		if len(licenseMatches) == 0 {
   424  			// Try the root directory if it's not in META-INF
   425  			licenseMatches = j.fileManifest.GlobMatch(true, "/"+filename)
   426  		}
   427  
   428  		if len(licenseMatches) > 0 {
   429  			contents, err := intFile.ContentsFromZip(j.archivePath, licenseMatches...)
   430  			if err != nil {
   431  				return nil, fmt.Errorf("unable to extract java license (%s): %w", j.location, err)
   432  			}
   433  
   434  			for _, licenseMatch := range licenseMatches {
   435  				licenseContents := contents[licenseMatch]
   436  				parsed, err := licenses.Parse(strings.NewReader(licenseContents), j.location)
   437  				if err != nil {
   438  					return nil, err
   439  				}
   440  
   441  				if len(parsed) > 0 {
   442  					fileLicenses = append(fileLicenses, parsed...)
   443  				}
   444  			}
   445  		}
   446  	}
   447  
   448  	return fileLicenses, nil
   449  }
   450  
   451  func (j *archiveParser) discoverPkgsFromNestedArchives(ctx context.Context, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) {
   452  	// we know that all java archives are zip formatted files, so we can use the shared zip helper
   453  	return discoverPkgsFromZip(ctx, j.location, j.archivePath, j.contentPath, j.fileManifest, parentPkg, j.cfg)
   454  }
   455  
   456  // discoverPkgsFromZip finds Java archives within Java archives, returning all listed Java packages found and
   457  // associating each discovered package to the given parent package.
   458  func discoverPkgsFromZip(ctx context.Context, location file.Location, archivePath, contentPath string, fileManifest intFile.ZipFileManifest, parentPkg *pkg.Package, cfg ArchiveCatalogerConfig) ([]pkg.Package, []artifact.Relationship, error) {
   459  	// search and parse pom.properties files & fetch the contents
   460  	openers, err := intFile.ExtractFromZipToUniqueTempFile(archivePath, contentPath, fileManifest.GlobMatch(false, archiveFormatGlobs...)...)
   461  	if err != nil {
   462  		return nil, nil, fmt.Errorf("unable to extract files from zip: %w", err)
   463  	}
   464  
   465  	return discoverPkgsFromOpeners(ctx, location, openers, parentPkg, cfg)
   466  }
   467  
   468  // discoverPkgsFromOpeners finds Java archives within the given files and associates them with the given parent package.
   469  func discoverPkgsFromOpeners(ctx context.Context, location file.Location, openers map[string]intFile.Opener, parentPkg *pkg.Package, cfg ArchiveCatalogerConfig) ([]pkg.Package, []artifact.Relationship, error) {
   470  	var pkgs []pkg.Package
   471  	var relationships []artifact.Relationship
   472  
   473  	for pathWithinArchive, archiveOpener := range openers {
   474  		nestedPkgs, nestedRelationships, err := discoverPkgsFromOpener(ctx, location, pathWithinArchive, archiveOpener, cfg)
   475  		if err != nil {
   476  			log.WithFields("location", location.Path()).Warnf("unable to discover java packages from opener: %+v", err)
   477  			continue
   478  		}
   479  
   480  		// attach the parent package to all discovered packages that are not already associated with a java archive
   481  		for _, p := range nestedPkgs {
   482  			if metadata, ok := p.Metadata.(pkg.JavaArchive); ok {
   483  				if metadata.Parent == nil {
   484  					metadata.Parent = parentPkg
   485  				}
   486  				p.Metadata = metadata
   487  			}
   488  			pkgs = append(pkgs, p)
   489  		}
   490  
   491  		relationships = append(relationships, nestedRelationships...)
   492  	}
   493  
   494  	return pkgs, relationships, nil
   495  }
   496  
   497  // discoverPkgsFromOpener finds Java archives within the given file.
   498  func discoverPkgsFromOpener(ctx context.Context, location file.Location, pathWithinArchive string, archiveOpener intFile.Opener, cfg ArchiveCatalogerConfig) ([]pkg.Package, []artifact.Relationship, error) {
   499  	archiveReadCloser, err := archiveOpener.Open()
   500  	if err != nil {
   501  		return nil, nil, fmt.Errorf("unable to open archived file from tempdir: %w", err)
   502  	}
   503  	defer func() {
   504  		if closeErr := archiveReadCloser.Close(); closeErr != nil {
   505  			log.Warnf("unable to close archived file from tempdir: %+v", closeErr)
   506  		}
   507  	}()
   508  
   509  	nestedPath := fmt.Sprintf("%s:%s", location.Path(), pathWithinArchive)
   510  	nestedLocation := file.NewLocationFromCoordinates(location.Coordinates)
   511  	nestedLocation.AccessPath = nestedPath
   512  	gap := newGenericArchiveParserAdapter(cfg)
   513  	nestedPkgs, nestedRelationships, err := gap.parseJavaArchive(ctx, nil, nil, file.LocationReadCloser{
   514  		Location:   nestedLocation,
   515  		ReadCloser: archiveReadCloser,
   516  	})
   517  	if err != nil {
   518  		return nil, nil, fmt.Errorf("unable to process nested java archive (%s): %w", pathWithinArchive, err)
   519  	}
   520  
   521  	return nestedPkgs, nestedRelationships, nil
   522  }
   523  
   524  func pomPropertiesByParentPath(archivePath string, location file.Location, extractPaths []string) (map[string]pkg.JavaPomProperties, error) {
   525  	contentsOfMavenPropertiesFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...)
   526  	if err != nil {
   527  		return nil, fmt.Errorf("unable to extract maven files: %w", err)
   528  	}
   529  
   530  	propertiesByParentPath := make(map[string]pkg.JavaPomProperties)
   531  	for filePath, fileContents := range contentsOfMavenPropertiesFiles {
   532  		pomProperties, err := parsePomProperties(filePath, strings.NewReader(fileContents))
   533  		if err != nil {
   534  			log.WithFields("contents-path", filePath, "location", location.Path()).Warnf("failed to parse pom.properties: %+v", err)
   535  			continue
   536  		}
   537  
   538  		if pomProperties == nil {
   539  			continue
   540  		}
   541  
   542  		if pomProperties.Version == "" || pomProperties.ArtifactID == "" {
   543  			// TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. We should discover the right parent and attach the correct info accordingly to each discovered package
   544  			continue
   545  		}
   546  
   547  		propertiesByParentPath[path.Dir(filePath)] = *pomProperties
   548  	}
   549  
   550  	return propertiesByParentPath, nil
   551  }
   552  
   553  func pomProjectByParentPath(archivePath string, location file.Location, extractPaths []string) (map[string]*parsedPomProject, error) {
   554  	contentsOfMavenProjectFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...)
   555  	if err != nil {
   556  		return nil, fmt.Errorf("unable to extract maven files: %w", err)
   557  	}
   558  
   559  	projectByParentPath := make(map[string]*parsedPomProject)
   560  	for filePath, fileContents := range contentsOfMavenProjectFiles {
   561  		// TODO: when we support locations of paths within archives we should start passing the specific pom.xml location object instead of the top jar
   562  		pomProject, err := parsePomXMLProject(filePath, strings.NewReader(fileContents), location)
   563  		if err != nil {
   564  			log.WithFields("contents-path", filePath, "location", location.Path()).Warnf("failed to parse pom.xml: %+v", err)
   565  			continue
   566  		}
   567  
   568  		if pomProject == nil {
   569  			continue
   570  		}
   571  
   572  		// If we don't have a version, then maybe the parent pom has it...
   573  		if (pomProject.Parent == nil && pomProject.Version == "") || pomProject.ArtifactID == "" {
   574  			// TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. We should discover the right parent and attach the correct info accordingly to each discovered package
   575  			continue
   576  		}
   577  
   578  		projectByParentPath[path.Dir(filePath)] = pomProject
   579  	}
   580  	return projectByParentPath, nil
   581  }
   582  
   583  // newPackageFromMavenData processes a single Maven POM properties for a given parent package, returning all listed Java packages found and
   584  // associating each discovered package to the given parent package. Note the pom.xml is optional, the pom.properties is not.
   585  func newPackageFromMavenData(ctx context.Context, pomProperties pkg.JavaPomProperties, parsedPomProject *parsedPomProject, parentPkg *pkg.Package, location file.Location, cfg ArchiveCatalogerConfig) *pkg.Package {
   586  	// keep the artifact name within the virtual path if this package does not match the parent package
   587  	vPathSuffix := ""
   588  	groupID := ""
   589  	if parentMetadata, ok := parentPkg.Metadata.(pkg.JavaArchive); ok {
   590  		groupID = groupIDFromJavaMetadata(parentPkg.Name, parentMetadata)
   591  	}
   592  
   593  	parentKey := fmt.Sprintf("%s:%s:%s", groupID, parentPkg.Name, parentPkg.Version)
   594  	// Since we don't have a package yet, it's important to use the same `field: value` association that we used when creating the parent package
   595  	// See below where Name => pomProperties.ArtifactID and Version => pomProperties.Version. We want to check for potentially nested identical
   596  	// packages and create equal virtual paths so they are de duped in the future
   597  	pomProjectKey := fmt.Sprintf("%s:%s:%s", pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version)
   598  	if parentKey != pomProjectKey {
   599  		// build a new virtual path suffix for the package that is different from the parent package
   600  		// we want to use the GroupID and ArtifactID here to preserve uniqueness
   601  		// Some packages have the same name but different group IDs (e.g. "org.glassfish.jaxb/jaxb-core", "com.sun.xml.bind/jaxb-core")
   602  		// https://github.com/anchore/syft/issues/1944
   603  		vPathSuffix += ":" + pomProperties.GroupID + ":" + pomProperties.ArtifactID
   604  	}
   605  	virtualPath := location.Path() + vPathSuffix
   606  
   607  	var pkgPomProject *pkg.JavaPomProject
   608  	licenses := make([]pkg.License, 0)
   609  
   610  	if cfg.UseNetwork {
   611  		if parsedPomProject == nil {
   612  			// If we have no pom.xml, check maven central using pom.properties
   613  			parentLicenses := recursivelyFindLicensesFromParentPom(ctx, pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version, cfg)
   614  			if len(parentLicenses) > 0 {
   615  				for _, licenseName := range parentLicenses {
   616  					licenses = append(licenses, pkg.NewLicenseFromFields(licenseName, "", nil))
   617  				}
   618  			}
   619  		} else {
   620  			findPomLicenses(ctx, parsedPomProject, cfg)
   621  		}
   622  	}
   623  
   624  	if parsedPomProject != nil {
   625  		pkgPomProject = parsedPomProject.JavaPomProject
   626  		licenses = append(licenses, parsedPomProject.Licenses...)
   627  	}
   628  
   629  	p := pkg.Package{
   630  		Name:    pomProperties.ArtifactID,
   631  		Version: pomProperties.Version,
   632  		Locations: file.NewLocationSet(
   633  			location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
   634  		),
   635  		Licenses: pkg.NewLicenseSet(licenses...),
   636  		Language: pkg.Java,
   637  		Type:     pomProperties.PkgTypeIndicated(),
   638  		Metadata: pkg.JavaArchive{
   639  			VirtualPath:   virtualPath,
   640  			PomProperties: &pomProperties,
   641  			PomProject:    pkgPomProject,
   642  			Parent:        parentPkg,
   643  		},
   644  	}
   645  
   646  	if packageIdentitiesMatch(p, parentPkg) {
   647  		updateParentPackage(p, parentPkg)
   648  		return nil
   649  	}
   650  
   651  	return &p
   652  }
   653  
   654  func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool {
   655  	metadata, ok := p.Metadata.(pkg.JavaArchive)
   656  	parentMetadata, parentOk := parentPkg.Metadata.(pkg.JavaArchive)
   657  	if !ok || !parentOk {
   658  		switch {
   659  		case !ok:
   660  			log.WithFields("package", p.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", p.Name)
   661  		case !parentOk:
   662  			log.WithFields("package", parentPkg.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", parentPkg.Name)
   663  		}
   664  		// if we can't extract metadata, we can check for matching identities via the package name
   665  		// this is not ideal, but it's better than nothing - this should not be used if we have Metadata
   666  
   667  		return uniquePkgKey("", &p) == uniquePkgKey("", parentPkg)
   668  	}
   669  
   670  	// try to determine identity with the metadata
   671  	groupID := groupIDFromJavaMetadata(p.Name, metadata)
   672  	parentGroupID := groupIDFromJavaMetadata(parentPkg.Name, parentMetadata)
   673  	if uniquePkgKey(groupID, &p) == uniquePkgKey(parentGroupID, parentPkg) {
   674  		return true
   675  	}
   676  
   677  	// the virtual path matches...
   678  	if parentMetadata.VirtualPath == metadata.VirtualPath {
   679  		return true
   680  	}
   681  
   682  	// the pom artifactId is the parent name
   683  	// note: you CANNOT use name-is-subset-of-artifact-id or vice versa --this is too generic. Shaded jars are a good
   684  	// example of this: where the package name is "cloudbees-analytics-segment-driver" and a child is "analytics", but
   685  	// they do not indicate the same package.
   686  	// NOTE: artifactId might not be a good indicator of uniqueness since archives can contain forks with the same name
   687  	// from different groups (e.g. "org.glassfish.jaxb.jaxb-core" and "com.sun.xml.bind.jaxb-core")
   688  	// we will use this check as a last resort
   689  	if metadata.PomProperties != nil {
   690  		if metadata.PomProperties.ArtifactID != "" && parentPkg.Name == metadata.PomProperties.ArtifactID {
   691  			return true
   692  		}
   693  	}
   694  	return false
   695  }
   696  
   697  func updateParentPackage(p pkg.Package, parentPkg *pkg.Package) {
   698  	// we've run across more information about our parent package, add this info to the parent package metadata
   699  	// the pom properties is typically a better source of information for name and version than the manifest
   700  	parentPkg.Name = p.Name
   701  	parentPkg.Version = p.Version
   702  
   703  	// we may have learned more about the type via data in the pom properties
   704  	parentPkg.Type = p.Type
   705  
   706  	metadata, ok := p.Metadata.(pkg.JavaArchive)
   707  	if !ok {
   708  		return
   709  	}
   710  	pomPropertiesCopy := *metadata.PomProperties
   711  
   712  	// keep the pom properties, but don't overwrite existing pom properties
   713  	parentMetadata, ok := parentPkg.Metadata.(pkg.JavaArchive)
   714  	if ok && parentMetadata.PomProperties == nil {
   715  		parentMetadata.PomProperties = &pomPropertiesCopy
   716  		parentPkg.Metadata = parentMetadata
   717  	}
   718  }