github.com/lineaje-labs/syft@v0.98.1-0.20231227153149-9e393f60ff1b/syft/pkg/cataloger/java/archive_parser.go (about)

     1  package java
     2  
     3  import (
     4  	"crypto"
     5  	"fmt"
     6  	"io"
     7  	"net/http"
     8  	"net/url"
     9  	"os"
    10  	"path"
    11  	"strings"
    12  	"time"
    13  
    14  	"github.com/vifraa/gopom"
    15  
    16  	"github.com/anchore/syft/syft/artifact"
    17  	"github.com/anchore/syft/syft/file"
    18  	"github.com/anchore/syft/syft/pkg"
    19  	"github.com/anchore/syft/syft/pkg/cataloger/generic"
    20  	intFile "github.com/lineaje-labs/syft/internal/file"
    21  	"github.com/lineaje-labs/syft/internal/licenses"
    22  	"github.com/lineaje-labs/syft/internal/log"
    23  )
    24  
    25  var archiveFormatGlobs = []string{
    26  	"**/*.jar",
    27  	"**/*.war",
    28  	"**/*.ear",
    29  	"**/*.par",
    30  	"**/*.sar",
    31  	"**/*.nar",
    32  	"**/*.jpi",
    33  	"**/*.hpi",
    34  	"**/*.lpkg", // Zip-compressed package used to deploy applications
    35  	// (aka plugins) to Liferay Portal server. Those files contains .JAR(s) and a .PROPERTIES file, the latter
    36  	// has information about the application and installation requirements.
    37  	// NOTE(jonasagx): If you would like to test it with lpkg file,
    38  	// use: https://web.liferay.com/marketplace/-/mp/download/25019275/7403
    39  	// LifeRay makes it pretty cumbersome to make a such plugins; their docs are
    40  	// out of date, and they charge for their IDE. If you find an example
    41  	// project that we can build in CI feel free to include it
    42  }
    43  
    44  // javaArchiveHashes are all the current hash algorithms used to calculate archive digests
    45  var javaArchiveHashes = []crypto.Hash{
    46  	crypto.MD5,
    47  	crypto.SHA1,
    48  	crypto.SHA256,
    49  }
    50  
// archiveParser carries the state used while cataloging a single Java archive. Instances are created by
// newJavaArchiveParser, which saves the archive via saveArchiveToTmp before indexing it.
type archiveParser struct {
	// fileManifest is the index of entries within the archive, built from archivePath.
	fileManifest intFile.ZipFileManifest
	// location is the location of the archive as supplied by the reader.
	location     file.Location
	// archivePath is the archive path returned by saveArchiveToTmp; all zip reads go through it.
	archivePath  string
	// contentPath is the content path returned by saveArchiveToTmp; it is handed to the zip extraction helper
	// when nested archives are unpacked.
	contentPath  string
	// fileInfo is derived from the last element of the archive's virtual path (see newJavaArchiveFilename).
	fileInfo     archiveFilename
	// detectNested controls whether parse() also catalogs archives nested within this one.
	detectNested bool
	// cfg is the cataloger configuration (e.g. UseNetwork, MavenBaseURL, MaxParentRecursiveDepth).
	cfg          ArchiveCatalogerConfig
}
    60  
// genericArchiveParserAdapter binds an ArchiveCatalogerConfig to the parseJavaArchive method so that the
// configuration is available when an archive is parsed.
type genericArchiveParserAdapter struct {
	// cfg is forwarded to every archive parser created by this adapter.
	cfg ArchiveCatalogerConfig
}
    64  
    65  func newGenericArchiveParserAdapter(cfg ArchiveCatalogerConfig) genericArchiveParserAdapter {
    66  	return genericArchiveParserAdapter{cfg: cfg}
    67  }
    68  
    69  // parseJavaArchive is a parser function for java archive contents, returning all Java libraries and nested archives.
    70  func (gap genericArchiveParserAdapter) parseJavaArchive(
    71  	_ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser,
    72  ) ([]pkg.Package, []artifact.Relationship, error) {
    73  	parser, cleanupFn, err := newJavaArchiveParser(reader, true, gap.cfg)
    74  	// note: even on error, we should always run cleanup functions
    75  	defer cleanupFn()
    76  	if err != nil {
    77  		return nil, nil, err
    78  	}
    79  	return parser.parse()
    80  }
    81  
    82  // uniquePkgKey creates a unique string to identify the given package.
    83  func uniquePkgKey(groupID string, p *pkg.Package) string {
    84  	if p == nil {
    85  		return ""
    86  	}
    87  	return fmt.Sprintf("%s|%s|%s", groupID, p.Name, p.Version)
    88  }
    89  
    90  // newJavaArchiveParser returns a new java archive parser object for the given archive. Can be configured to discover
    91  // and parse nested archives or ignore them.
    92  func newJavaArchiveParser(
    93  	reader file.LocationReadCloser, detectNested bool, cfg ArchiveCatalogerConfig,
    94  ) (*archiveParser, func(), error) {
    95  	// fetch the last element of the virtual path
    96  	virtualElements := strings.Split(reader.Path(), ":")
    97  	currentFilepath := virtualElements[len(virtualElements)-1]
    98  
    99  	contentPath, archivePath, cleanupFn, err := saveArchiveToTmp(currentFilepath, reader)
   100  	if err != nil {
   101  		return nil, cleanupFn, fmt.Errorf("unable to process java archive: %w", err)
   102  	}
   103  
   104  	fileManifest, err := intFile.NewZipFileManifest(archivePath)
   105  	if err != nil {
   106  		return nil, cleanupFn, fmt.Errorf("unable to read files from java archive: %w", err)
   107  	}
   108  
   109  	return &archiveParser{
   110  		fileManifest: fileManifest,
   111  		location:     reader.Location,
   112  		archivePath:  archivePath,
   113  		contentPath:  contentPath,
   114  		fileInfo:     newJavaArchiveFilename(currentFilepath),
   115  		detectNested: detectNested,
   116  		cfg:          cfg,
   117  	}, cleanupFn, nil
   118  }
   119  
   120  // parse the loaded archive and return all packages found.
   121  func (j *archiveParser) parse() ([]pkg.Package, []artifact.Relationship, error) {
   122  	var pkgs []pkg.Package
   123  	var relationships []artifact.Relationship
   124  
   125  	// find the parent package from the java manifest
   126  	parentPkg, err := j.discoverMainPackage()
   127  	if err != nil {
   128  		return nil, nil, fmt.Errorf("could not generate package from %s: %w", j.location, err)
   129  	}
   130  
   131  	// find aux packages from pom.properties/pom.xml and potentially modify the existing parentPkg
   132  	// NOTE: we cannot generate sha1 digests from packages discovered via pom.properties/pom.xml
   133  	auxPkgs, err := j.discoverPkgsFromAllMavenFiles(parentPkg)
   134  	if err != nil {
   135  		return nil, nil, err
   136  	}
   137  	pkgs = append(pkgs, auxPkgs...)
   138  
   139  	if j.detectNested {
   140  		// find nested java archive packages
   141  		nestedPkgs, nestedRelationships, err := j.discoverPkgsFromNestedArchives(parentPkg)
   142  		if err != nil {
   143  			return nil, nil, err
   144  		}
   145  		pkgs = append(pkgs, nestedPkgs...)
   146  		relationships = append(relationships, nestedRelationships...)
   147  	}
   148  
   149  	// lastly, add the parent package to the list (assuming the parent exists)
   150  	if parentPkg != nil {
   151  		pkgs = append([]pkg.Package{*parentPkg}, pkgs...)
   152  	}
   153  
   154  	// add pURLs to all packages found
   155  	// note: since package information may change after initial creation when parsing multiple locations within the
   156  	// jar, we wait until the conclusion of the parsing process before synthesizing pURLs.
   157  	for i := range pkgs {
   158  		p := &pkgs[i]
   159  		if m, ok := p.Metadata.(pkg.JavaArchive); ok {
   160  			p.PURL = packageURL(p.Name, p.Version, m)
   161  		} else {
   162  			log.WithFields("package", p.String()).Warn("unable to extract java metadata to generate purl")
   163  		}
   164  		p.SetID()
   165  	}
   166  
   167  	return pkgs, relationships, nil
   168  }
   169  
   170  // discoverMainPackage parses the root Java manifest used as the parent package to all discovered nested packages.
   171  func (j *archiveParser) discoverMainPackage() (*pkg.Package, error) {
   172  	// search and parse java manifest files
   173  	manifestMatches := j.fileManifest.GlobMatch(false, manifestGlob)
   174  	if len(manifestMatches) > 1 {
   175  		return nil, fmt.Errorf("found multiple manifests in the jar: %+v", manifestMatches)
   176  	} else if len(manifestMatches) == 0 {
   177  		// we did not find any manifests, but that may not be a problem (there may be other information to generate packages for)
   178  		return nil, nil
   179  	}
   180  
   181  	// fetch the manifest file
   182  	contents, err := intFile.ContentsFromZip(j.archivePath, manifestMatches...)
   183  	if err != nil {
   184  		return nil, fmt.Errorf("unable to extract java manifests (%s): %w", j.location, err)
   185  	}
   186  
   187  	// parse the manifest file into a rich object
   188  	manifestContents := contents[manifestMatches[0]]
   189  	manifest, err := parseJavaManifest(j.archivePath, strings.NewReader(manifestContents))
   190  	if err != nil {
   191  		log.Warnf("failed to parse java manifest (%s): %+v", j.location, err)
   192  		return nil, nil
   193  	}
   194  
   195  	// grab and assign digest for the entire archive
   196  	digests, err := getDigestsFromArchive(j.archivePath)
   197  	if err != nil {
   198  		return nil, err
   199  	}
   200  
   201  	licenses, name, version, err := j.parseLicenses(manifest)
   202  	if err != nil {
   203  		return nil, err
   204  	}
   205  
   206  	return &pkg.Package{
   207  		// TODO: maybe select name should just have a pom properties in it?
   208  		Name:     name,
   209  		Version:  version,
   210  		Language: pkg.Java,
   211  		Licenses: pkg.NewLicenseSet(licenses...),
   212  		Locations: file.NewLocationSet(
   213  			j.location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
   214  		),
   215  		Type: j.fileInfo.pkgType(),
   216  		Metadata: pkg.JavaArchive{
   217  			VirtualPath:    j.location.Path(),
   218  			Manifest:       manifest,
   219  			ArchiveDigests: digests,
   220  		},
   221  	}, nil
   222  }
   223  
   224  func (j *archiveParser) parseLicenses(manifest *pkg.JavaManifest) ([]pkg.License, string, string, error) {
   225  	// we use j.location because we want to associate the license declaration with where we discovered the contents in the manifest
   226  	// TODO: when we support locations of paths within archives we should start passing the specific manifest location object instead of the top jar
   227  	licenses := pkg.NewLicensesFromLocation(j.location, selectLicenses(manifest)...)
   228  	/*
   229  		We should name and version from, in this order:
   230  		1. pom.properties if we find exactly 1
   231  		2. pom.xml if we find exactly 1
   232  		3. manifest
   233  		4. filename
   234  	*/
   235  	name, version, pomLicenses := j.guessMainPackageNameAndVersionFromPomInfo()
   236  	if name == "" {
   237  		name = selectName(manifest, j.fileInfo)
   238  	}
   239  	if version == "" {
   240  		version = selectVersion(manifest, j.fileInfo)
   241  	}
   242  	if len(licenses) == 0 {
   243  		// Today we don't have a way to distinguish between licenses from the manifest and licenses from the pom.xml
   244  		// until the file.Location object can support sub-paths (i.e. paths within archives, recursively; issue https://github.com/anchore/syft/issues/2211).
   245  		// Until then it's less confusing to use the licenses from the pom.xml only if the manifest did not list any.
   246  		licenses = append(licenses, pomLicenses...)
   247  	}
   248  
   249  	if len(licenses) == 0 {
   250  		fileLicenses, err := j.getLicenseFromFileInArchive()
   251  		if err != nil {
   252  			return nil, "", "", err
   253  		}
   254  		if fileLicenses != nil {
   255  			licenses = append(licenses, fileLicenses...)
   256  		}
   257  	}
   258  
   259  	// If we didn't find any licenses in the archive so far, we'll try again in Maven Central using groupIDFromJavaMetadata
   260  	if len(licenses) == 0 && j.cfg.UseNetwork {
   261  		licenses = findLicenseFromJavaMetadata(name, manifest, version, j, licenses)
   262  	}
   263  
   264  	return licenses, name, version, nil
   265  }
   266  
   267  func findLicenseFromJavaMetadata(
   268  	name string, manifest *pkg.JavaManifest, version string, j *archiveParser, licenses []pkg.License,
   269  ) []pkg.License {
   270  	var groupID = name
   271  	if gID := groupIDFromJavaMetadata(name, pkg.JavaArchive{Manifest: manifest}); gID != "" {
   272  		groupID = gID
   273  	}
   274  	pomLicenses, err := recursivelyFindLicensesFromParentPom(groupID, name, version, j.cfg)
   275  	if err != nil {
   276  		log.Tracef("unable to get parent pom from Maven central: %v", err)
   277  	}
   278  
   279  	if len(pomLicenses) == 0 {
   280  		// Try removing the last part of the groupId, as sometimes it duplicates the artifactId
   281  		packages := strings.Split(groupID, ".")
   282  		groupID = strings.Join(packages[:len(packages)-1], ".")
   283  		pomLicenses, err = recursivelyFindLicensesFromParentPom(groupID, name, version, j.cfg)
   284  		if err != nil {
   285  			log.Tracef("unable to get parent pom from Maven central: %v", err)
   286  		}
   287  	}
   288  
   289  	if len(pomLicenses) > 0 {
   290  		pkgLicenses := pkg.NewLicensesFromLocation(j.location, pomLicenses...)
   291  		if pkgLicenses != nil {
   292  			licenses = append(licenses, pkgLicenses...)
   293  		}
   294  	}
   295  	return licenses
   296  }
   297  
// parsedPomProject pairs the pom project metadata parsed from a pom.xml with the licenses associated with it.
type parsedPomProject struct {
	// JavaPomProject is embedded, so its fields (e.g. ArtifactID, Version, Parent) are accessible directly.
	*pkg.JavaPomProject
	// Licenses holds the project's licenses; findPomLicenses may append licenses found via parent poms.
	Licenses []pkg.License
}
   302  
   303  func (j *archiveParser) guessMainPackageNameAndVersionFromPomInfo() (name, version string, licenses []pkg.License) {
   304  	pomPropertyMatches := j.fileManifest.GlobMatch(false, pomPropertiesGlob)
   305  	pomMatches := j.fileManifest.GlobMatch(false, pomXMLGlob)
   306  	var pomPropertiesObject pkg.JavaPomProperties
   307  	var pomProjectObject *parsedPomProject
   308  
   309  	// Find the pom.properties/pom.xml if the names seem like a plausible match
   310  	properties, _ := pomPropertiesByParentPath(j.archivePath, j.location, pomPropertyMatches)
   311  	projects, _ := pomProjectByParentPath(j.archivePath, j.location, pomMatches)
   312  
   313  	for parentPath, propertiesObj := range properties {
   314  		if artifactIDMatchesFilename(propertiesObj.ArtifactID, j.fileInfo.name) {
   315  			pomPropertiesObject = propertiesObj
   316  			if proj, exists := projects[parentPath]; exists {
   317  				pomProjectObject = proj
   318  				break
   319  			}
   320  		}
   321  	}
   322  
   323  	name = pomPropertiesObject.ArtifactID
   324  	if name == "" && pomProjectObject != nil {
   325  		name = pomProjectObject.ArtifactID
   326  	}
   327  	version = pomPropertiesObject.Version
   328  	if version == "" && pomProjectObject != nil {
   329  		version = pomProjectObject.Version
   330  	}
   331  	if pomProjectObject != nil && j.cfg.UseNetwork {
   332  		findPomLicenses(pomProjectObject, j.cfg)
   333  	}
   334  
   335  	if pomProjectObject != nil {
   336  		licenses = pomProjectObject.Licenses
   337  	}
   338  
   339  	return name, version, licenses
   340  }
   341  
   342  func artifactIDMatchesFilename(artifactID, fileName string) bool {
   343  	if artifactID == "" || fileName == "" {
   344  		return false
   345  	}
   346  	return strings.HasPrefix(artifactID, fileName) || strings.HasSuffix(fileName, artifactID)
   347  }
   348  
   349  func findPomLicenses(pomProjectObject *parsedPomProject, cfg ArchiveCatalogerConfig) {
   350  	// If we don't have any licenses until now, and if we have a parent Pom, then we'll check the parent pom in maven central for licenses.
   351  	if pomProjectObject != nil && pomProjectObject.Parent != nil && len(pomProjectObject.Licenses) == 0 {
   352  		parentLicenses, err := recursivelyFindLicensesFromParentPom(
   353  			pomProjectObject.Parent.GroupID,
   354  			pomProjectObject.Parent.ArtifactID,
   355  			pomProjectObject.Parent.Version,
   356  			cfg)
   357  		if err != nil {
   358  			// We don't want to abort here as the parent pom might not exist in Maven Central, we'll just log the error
   359  			log.Tracef("unable to get parent pom from Maven central: %v", err)
   360  			return
   361  		}
   362  		if len(parentLicenses) > 0 {
   363  			for _, licenseName := range parentLicenses {
   364  				pomProjectObject.Licenses = append(pomProjectObject.Licenses, pkg.NewLicenseFromFields(licenseName, "", nil))
   365  			}
   366  		}
   367  	}
   368  }
   369  
   370  func formatMavenPomURL(groupID, artifactID, version, mavenBaseURL string) (requestURL string, err error) {
   371  	// groupID needs to go from maven.org -> maven/org
   372  	urlPath := strings.Split(groupID, ".")
   373  	artifactPom := fmt.Sprintf("%s-%s.pom", artifactID, version)
   374  	urlPath = append(urlPath, artifactID, version, artifactPom)
   375  
   376  	// ex:"https://repo1.maven.org/maven2/groupID/artifactID/artifactPom
   377  	requestURL, err = url.JoinPath(mavenBaseURL, urlPath...)
   378  	if err != nil {
   379  		return requestURL, fmt.Errorf("could not construct maven url: %w", err)
   380  	}
   381  	return requestURL, err
   382  }
   383  
   384  func recursivelyFindLicensesFromParentPom(
   385  	groupID, artifactID, version string, cfg ArchiveCatalogerConfig,
   386  ) ([]string, error) {
   387  	var licenses []string
   388  	// As there can be nested parent poms, we'll recursively check for licenses until we reach the max depth
   389  	for i := 0; i < cfg.MaxParentRecursiveDepth; i++ {
   390  		parentPom, err := getPomFromMavenRepo(groupID, artifactID, version, cfg.MavenBaseURL)
   391  		if err != nil {
   392  			return nil, err
   393  		}
   394  		parentLicenses := parseLicensesFromPom(parentPom)
   395  		if len(parentLicenses) > 0 || parentPom == nil || parentPom.Parent == nil {
   396  			licenses = parentLicenses
   397  			break
   398  		}
   399  
   400  		groupID = *parentPom.Parent.GroupID
   401  		artifactID = *parentPom.Parent.ArtifactID
   402  		version = *parentPom.Parent.Version
   403  	}
   404  
   405  	return licenses, nil
   406  }
   407  
   408  func getPomFromMavenRepo(groupID, artifactID, version, mavenBaseURL string) (*gopom.Project, error) {
   409  	requestURL, err := formatMavenPomURL(groupID, artifactID, version, mavenBaseURL)
   410  	if err != nil {
   411  		return nil, err
   412  	}
   413  	log.Tracef("trying to fetch parent pom from Maven central %s", requestURL)
   414  
   415  	mavenRequest, err := http.NewRequest(http.MethodGet, requestURL, nil)
   416  	if err != nil {
   417  		return nil, fmt.Errorf("unable to format request for Maven central: %w", err)
   418  	}
   419  
   420  	httpClient := &http.Client{
   421  		Timeout: time.Second * 10,
   422  	}
   423  
   424  	resp, err := httpClient.Do(mavenRequest)
   425  	if err != nil {
   426  		return nil, fmt.Errorf("unable to get pom from Maven central: %w", err)
   427  	}
   428  	defer func() {
   429  		if err := resp.Body.Close(); err != nil {
   430  			log.Errorf("unable to close body: %+v", err)
   431  		}
   432  	}()
   433  
   434  	bytes, err := io.ReadAll(resp.Body)
   435  	if err != nil {
   436  		return nil, fmt.Errorf("unable to parse pom from Maven central: %w", err)
   437  	}
   438  
   439  	pom, err := decodePomXML(strings.NewReader(string(bytes)))
   440  	if err != nil {
   441  		return nil, fmt.Errorf("unable to parse pom from Maven central: %w", err)
   442  	}
   443  
   444  	return &pom, nil
   445  }
   446  
   447  func parseLicensesFromPom(pom *gopom.Project) []string {
   448  	var licenses []string
   449  	if pom != nil && pom.Licenses != nil {
   450  		for _, license := range *pom.Licenses {
   451  			if license.Name != nil {
   452  				licenses = append(licenses, *license.Name)
   453  			} else if license.URL != nil {
   454  				licenses = append(licenses, *license.URL)
   455  			}
   456  		}
   457  	}
   458  
   459  	return licenses
   460  }
   461  
   462  // discoverPkgsFromAllMavenFiles parses Maven POM properties/xml for a given
   463  // parent package, returning all listed Java packages found for each pom
   464  // properties discovered and potentially updating the given parentPkg with new
   465  // data.
   466  func (j *archiveParser) discoverPkgsFromAllMavenFiles(parentPkg *pkg.Package) ([]pkg.Package, error) {
   467  	if parentPkg == nil {
   468  		return nil, nil
   469  	}
   470  
   471  	var pkgs []pkg.Package
   472  
   473  	// pom.properties
   474  	properties, err := pomPropertiesByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomPropertiesGlob))
   475  	if err != nil {
   476  		return nil, err
   477  	}
   478  
   479  	// pom.xml
   480  	projects, err := pomProjectByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomXMLGlob))
   481  	if err != nil {
   482  		return nil, err
   483  	}
   484  
   485  	for parentPath, propertiesObj := range properties {
   486  		var pomProject *parsedPomProject
   487  		if proj, exists := projects[parentPath]; exists {
   488  			pomProject = proj
   489  		}
   490  
   491  		pkgFromPom := newPackageFromMavenData(propertiesObj, pomProject, parentPkg, j.location, j.cfg)
   492  		if pkgFromPom != nil {
   493  			pkgs = append(pkgs, *pkgFromPom)
   494  		}
   495  	}
   496  
   497  	return pkgs, nil
   498  }
   499  
   500  func getDigestsFromArchive(archivePath string) ([]file.Digest, error) {
   501  	archiveCloser, err := os.Open(archivePath)
   502  	if err != nil {
   503  		return nil, fmt.Errorf("unable to open archive path (%s): %w", archivePath, err)
   504  	}
   505  	defer archiveCloser.Close()
   506  
   507  	// grab and assign digest for the entire archive
   508  	digests, err := intFile.NewDigestsFromFile(archiveCloser, javaArchiveHashes)
   509  	if err != nil {
   510  		log.Warnf("failed to create digest for file=%q: %+v", archivePath, err)
   511  	}
   512  
   513  	return digests, nil
   514  }
   515  
   516  func (j *archiveParser) getLicenseFromFileInArchive() ([]pkg.License, error) {
   517  	var fileLicenses []pkg.License
   518  	for _, filename := range licenses.FileNames() {
   519  		licenseMatches := j.fileManifest.GlobMatch(true, "/META-INF/"+filename)
   520  		if len(licenseMatches) == 0 {
   521  			// Try the root directory if it's not in META-INF
   522  			licenseMatches = j.fileManifest.GlobMatch(true, "/"+filename)
   523  		}
   524  
   525  		if len(licenseMatches) > 0 {
   526  			contents, err := intFile.ContentsFromZip(j.archivePath, licenseMatches...)
   527  			if err != nil {
   528  				return nil, fmt.Errorf("unable to extract java license (%s): %w", j.location, err)
   529  			}
   530  
   531  			for _, licenseMatch := range licenseMatches {
   532  				licenseContents := contents[licenseMatch]
   533  				parsed, err := licenses.Parse(strings.NewReader(licenseContents), j.location)
   534  				if err != nil {
   535  					return nil, err
   536  				}
   537  
   538  				if len(parsed) > 0 {
   539  					fileLicenses = append(fileLicenses, parsed...)
   540  				}
   541  			}
   542  		}
   543  	}
   544  
   545  	return fileLicenses, nil
   546  }
   547  
   548  func (j *archiveParser) discoverPkgsFromNestedArchives(parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) {
   549  	// we know that all java archives are zip formatted files, so we can use the shared zip helper
   550  	return discoverPkgsFromZip(j.location, j.archivePath, j.contentPath, j.fileManifest, parentPkg, j.cfg)
   551  }
   552  
   553  // discoverPkgsFromZip finds Java archives within Java archives, returning all listed Java packages found and
   554  // associating each discovered package to the given parent package.
   555  func discoverPkgsFromZip(
   556  	location file.Location, archivePath, contentPath string, fileManifest intFile.ZipFileManifest,
   557  	parentPkg *pkg.Package, cfg ArchiveCatalogerConfig,
   558  ) ([]pkg.Package, []artifact.Relationship, error) {
   559  	// search and parse pom.properties files & fetch the contents
   560  	openers, err := intFile.ExtractFromZipToUniqueTempFile(archivePath, contentPath, fileManifest.GlobMatch(false, archiveFormatGlobs...)...)
   561  	if err != nil {
   562  		return nil, nil, fmt.Errorf("unable to extract files from zip: %w", err)
   563  	}
   564  
   565  	return discoverPkgsFromOpeners(location, openers, parentPkg, cfg)
   566  }
   567  
   568  // discoverPkgsFromOpeners finds Java archives within the given files and associates them with the given parent package.
   569  func discoverPkgsFromOpeners(
   570  	location file.Location, openers map[string]intFile.Opener, parentPkg *pkg.Package, cfg ArchiveCatalogerConfig,
   571  ) ([]pkg.Package, []artifact.Relationship, error) {
   572  	var pkgs []pkg.Package
   573  	var relationships []artifact.Relationship
   574  
   575  	for pathWithinArchive, archiveOpener := range openers {
   576  		nestedPkgs, nestedRelationships, err := discoverPkgsFromOpener(location, pathWithinArchive, archiveOpener, cfg)
   577  		if err != nil {
   578  			log.WithFields("location", location.Path()).Warnf("unable to discover java packages from opener: %+v", err)
   579  			continue
   580  		}
   581  
   582  		// attach the parent package to all discovered packages that are not already associated with a java archive
   583  		for _, p := range nestedPkgs {
   584  			if metadata, ok := p.Metadata.(pkg.JavaArchive); ok {
   585  				if metadata.Parent == nil {
   586  					metadata.Parent = parentPkg
   587  				}
   588  				p.Metadata = metadata
   589  			}
   590  			pkgs = append(pkgs, p)
   591  		}
   592  
   593  		relationships = append(relationships, nestedRelationships...)
   594  	}
   595  
   596  	return pkgs, relationships, nil
   597  }
   598  
   599  // discoverPkgsFromOpener finds Java archives within the given file.
   600  func discoverPkgsFromOpener(
   601  	location file.Location, pathWithinArchive string, archiveOpener intFile.Opener, cfg ArchiveCatalogerConfig,
   602  ) ([]pkg.Package, []artifact.Relationship, error) {
   603  	archiveReadCloser, err := archiveOpener.Open()
   604  	if err != nil {
   605  		return nil, nil, fmt.Errorf("unable to open archived file from tempdir: %w", err)
   606  	}
   607  	defer func() {
   608  		if closeErr := archiveReadCloser.Close(); closeErr != nil {
   609  			log.Warnf("unable to close archived file from tempdir: %+v", closeErr)
   610  		}
   611  	}()
   612  
   613  	nestedPath := fmt.Sprintf("%s:%s", location.Path(), pathWithinArchive)
   614  	nestedLocation := file.NewLocationFromCoordinates(location.Coordinates)
   615  	nestedLocation.AccessPath = nestedPath
   616  	gap := newGenericArchiveParserAdapter(cfg)
   617  	nestedPkgs, nestedRelationships, err := gap.parseJavaArchive(nil, nil, file.LocationReadCloser{
   618  		Location:   nestedLocation,
   619  		ReadCloser: archiveReadCloser,
   620  	})
   621  	if err != nil {
   622  		return nil, nil, fmt.Errorf("unable to process nested java archive (%s): %w", pathWithinArchive, err)
   623  	}
   624  
   625  	return nestedPkgs, nestedRelationships, nil
   626  }
   627  
   628  func pomPropertiesByParentPath(
   629  	archivePath string, location file.Location, extractPaths []string,
   630  ) (map[string]pkg.JavaPomProperties, error) {
   631  	contentsOfMavenPropertiesFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...)
   632  	if err != nil {
   633  		return nil, fmt.Errorf("unable to extract maven files: %w", err)
   634  	}
   635  
   636  	propertiesByParentPath := make(map[string]pkg.JavaPomProperties)
   637  	for filePath, fileContents := range contentsOfMavenPropertiesFiles {
   638  		pomProperties, err := parsePomProperties(filePath, strings.NewReader(fileContents))
   639  		if err != nil {
   640  			log.WithFields("contents-path", filePath, "location", location.Path()).Warnf("failed to parse pom.properties: %+v", err)
   641  			continue
   642  		}
   643  
   644  		if pomProperties == nil {
   645  			continue
   646  		}
   647  
   648  		if pomProperties.Version == "" || pomProperties.ArtifactID == "" {
   649  			// TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. We should discover the right parent and attach the correct info accordingly to each discovered package
   650  			continue
   651  		}
   652  
   653  		propertiesByParentPath[path.Dir(filePath)] = *pomProperties
   654  	}
   655  
   656  	return propertiesByParentPath, nil
   657  }
   658  
   659  func pomProjectByParentPath(
   660  	archivePath string, location file.Location, extractPaths []string,
   661  ) (map[string]*parsedPomProject, error) {
   662  	contentsOfMavenProjectFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...)
   663  	if err != nil {
   664  		return nil, fmt.Errorf("unable to extract maven files: %w", err)
   665  	}
   666  
   667  	projectByParentPath := make(map[string]*parsedPomProject)
   668  	for filePath, fileContents := range contentsOfMavenProjectFiles {
   669  		// TODO: when we support locations of paths within archives we should start passing the specific pom.xml location object instead of the top jar
   670  		pomProject, err := parsePomXMLProject(filePath, strings.NewReader(fileContents), location)
   671  		if err != nil {
   672  			log.WithFields("contents-path", filePath, "location", location.Path()).Warnf("failed to parse pom.xml: %+v", err)
   673  			continue
   674  		}
   675  
   676  		if pomProject == nil {
   677  			continue
   678  		}
   679  
   680  		// If we don't have a version, then maybe the parent pom has it...
   681  		if (pomProject.Parent == nil && pomProject.Version == "") || pomProject.ArtifactID == "" {
   682  			// TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. We should discover the right parent and attach the correct info accordingly to each discovered package
   683  			continue
   684  		}
   685  
   686  		projectByParentPath[path.Dir(filePath)] = pomProject
   687  	}
   688  	return projectByParentPath, nil
   689  }
   690  
   691  // newPackageFromMavenData processes a single Maven POM properties for a given parent package, returning all listed Java packages found and
   692  // associating each discovered package to the given parent package. Note the pom.xml is optional, the pom.properties is not.
   693  func newPackageFromMavenData(
   694  	pomProperties pkg.JavaPomProperties, parsedPomProject *parsedPomProject, parentPkg *pkg.Package,
   695  	location file.Location, cfg ArchiveCatalogerConfig,
   696  ) *pkg.Package {
   697  	// keep the artifact name within the virtual path if this package does not match the parent package
   698  	vPathSuffix := ""
   699  	groupID := ""
   700  	if parentMetadata, ok := parentPkg.Metadata.(pkg.JavaArchive); ok {
   701  		groupID = groupIDFromJavaMetadata(parentPkg.Name, parentMetadata)
   702  	}
   703  
   704  	parentKey := fmt.Sprintf("%s:%s:%s", groupID, parentPkg.Name, parentPkg.Version)
   705  	// Since we don't have a package yet, it's important to use the same `field: value` association that we used when creating the parent package
   706  	// See below where Name => pomProperties.ArtifactID and Version => pomProperties.Version. We want to check for potentially nested identical
   707  	// packages and create equal virtual paths so they are de duped in the future
   708  	pomProjectKey := fmt.Sprintf("%s:%s:%s", pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version)
   709  	if parentKey != pomProjectKey {
   710  		// build a new virtual path suffix for the package that is different from the parent package
   711  		// we want to use the GroupID and ArtifactID here to preserve uniqueness
   712  		// Some packages have the same name but different group IDs (e.g. "org.glassfish.jaxb/jaxb-core", "com.sun.xml.bind/jaxb-core")
   713  		// https://github.com/anchore/syft/issues/1944
   714  		vPathSuffix += ":" + pomProperties.GroupID + ":" + pomProperties.ArtifactID
   715  	}
   716  	virtualPath := location.Path() + vPathSuffix
   717  
   718  	var pkgPomProject *pkg.JavaPomProject
   719  	licenses := make([]pkg.License, 0)
   720  	if parsedPomProject != nil {
   721  		if cfg.UseNetwork {
   722  			findPomLicenses(parsedPomProject, cfg)
   723  		}
   724  		pkgPomProject = parsedPomProject.JavaPomProject
   725  		licenses = append(licenses, parsedPomProject.Licenses...)
   726  	}
   727  
   728  	p := pkg.Package{
   729  		Name:    pomProperties.ArtifactID,
   730  		Version: pomProperties.Version,
   731  		Locations: file.NewLocationSet(
   732  			location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
   733  		),
   734  		Licenses: pkg.NewLicenseSet(licenses...),
   735  		Language: pkg.Java,
   736  		Type:     pomProperties.PkgTypeIndicated(),
   737  		Metadata: pkg.JavaArchive{
   738  			VirtualPath:   virtualPath,
   739  			PomProperties: &pomProperties,
   740  			PomProject:    pkgPomProject,
   741  			Parent:        parentPkg,
   742  		},
   743  	}
   744  
   745  	if packageIdentitiesMatch(p, parentPkg) {
   746  		updateParentPackage(p, parentPkg)
   747  		return nil
   748  	}
   749  
   750  	return &p
   751  }
   752  
   753  func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool {
   754  	metadata, ok := p.Metadata.(pkg.JavaArchive)
   755  	parentMetadata, parentOk := parentPkg.Metadata.(pkg.JavaArchive)
   756  	if !ok || !parentOk {
   757  		switch {
   758  		case !ok:
   759  			log.WithFields("package", p.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", p.Name)
   760  		case !parentOk:
   761  			log.WithFields("package", parentPkg.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", parentPkg.Name)
   762  		}
   763  		// if we can't extract metadata, we can check for matching identities via the package name
   764  		// this is not ideal, but it's better than nothing - this should not be used if we have Metadata
   765  
   766  		return uniquePkgKey("", &p) == uniquePkgKey("", parentPkg)
   767  	}
   768  
   769  	// try to determine identity with the metadata
   770  	groupID := groupIDFromJavaMetadata(p.Name, metadata)
   771  	parentGroupID := groupIDFromJavaMetadata(parentPkg.Name, parentMetadata)
   772  	if uniquePkgKey(groupID, &p) == uniquePkgKey(parentGroupID, parentPkg) {
   773  		return true
   774  	}
   775  
   776  	// the virtual path matches...
   777  	if parentMetadata.VirtualPath == metadata.VirtualPath {
   778  		return true
   779  	}
   780  
   781  	// the pom artifactId is the parent name
   782  	// note: you CANNOT use name-is-subset-of-artifact-id or vice versa --this is too generic. Shaded jars are a good
   783  	// example of this: where the package name is "cloudbees-analytics-segment-driver" and a child is "analytics", but
   784  	// they do not indicate the same package.
   785  	// NOTE: artifactId might not be a good indicator of uniqueness since archives can contain forks with the same name
   786  	// from different groups (e.g. "org.glassfish.jaxb.jaxb-core" and "com.sun.xml.bind.jaxb-core")
   787  	// we will use this check as a last resort
   788  	if metadata.PomProperties != nil {
   789  		if metadata.PomProperties.ArtifactID != "" && parentPkg.Name == metadata.PomProperties.ArtifactID {
   790  			return true
   791  		}
   792  	}
   793  	return false
   794  }
   795  
   796  func updateParentPackage(p pkg.Package, parentPkg *pkg.Package) {
   797  	// we've run across more information about our parent package, add this info to the parent package metadata
   798  	// the pom properties is typically a better source of information for name and version than the manifest
   799  	parentPkg.Name = p.Name
   800  	parentPkg.Version = p.Version
   801  
   802  	// we may have learned more about the type via data in the pom properties
   803  	parentPkg.Type = p.Type
   804  
   805  	metadata, ok := p.Metadata.(pkg.JavaArchive)
   806  	if !ok {
   807  		return
   808  	}
   809  	pomPropertiesCopy := *metadata.PomProperties
   810  
   811  	// keep the pom properties, but don't overwrite existing pom properties
   812  	parentMetadata, ok := parentPkg.Metadata.(pkg.JavaArchive)
   813  	if ok && parentMetadata.PomProperties == nil {
   814  		parentMetadata.PomProperties = &pomPropertiesCopy
   815  		parentPkg.Metadata = parentMetadata
   816  	}
   817  }