github.com/nextlinux/gosbom@v0.81.1-0.20230627115839-1ff50c281391/gosbom/pkg/cataloger/java/archive_parser.go (about)

     1  package java
     2  
     3  import (
     4  	"crypto"
     5  	"fmt"
     6  	"os"
     7  	"path"
     8  	"strings"
     9  
    10  	"github.com/nextlinux/gosbom/gosbom/artifact"
    11  	"github.com/nextlinux/gosbom/gosbom/file"
    12  	"github.com/nextlinux/gosbom/gosbom/pkg"
    13  	"github.com/nextlinux/gosbom/gosbom/pkg/cataloger/generic"
    14  	intFile "github.com/nextlinux/gosbom/internal/file"
    15  	"github.com/nextlinux/gosbom/internal/log"
    16  )
    17  
    18  var _ generic.Parser = parseJavaArchive
    19  
    20  var archiveFormatGlobs = []string{
    21  	"**/*.jar",
    22  	"**/*.war",
    23  	"**/*.ear",
    24  	"**/*.par",
    25  	"**/*.sar",
    26  	"**/*.nar",
    27  	"**/*.jpi",
    28  	"**/*.hpi",
    29  	"**/*.lpkg", // Zip-compressed package used to deploy applications
    30  	// (aka plugins) to Liferay Portal server. Those files contains .JAR(s) and a .PROPERTIES file, the latter
    31  	// has information about the application and installation requirements.
    32  	// NOTE(jonasagx): If you would like to test it with lpkg file,
    33  	// use: https://web.liferay.com/marketplace/-/mp/download/25019275/7403
    34  	// LifeRay makes it pretty cumbersome to make a such plugins; their docs are
    35  	// out of date, and they charge for their IDE. If you find an example
    36  	// project that we can build in CI feel free to include it
    37  }
    38  
    39  // javaArchiveHashes are all the current hash algorithms used to calculate archive digests
    40  var javaArchiveHashes = []crypto.Hash{
    41  	crypto.SHA1,
    42  }
    43  
    44  type archiveParser struct {
    45  	fileManifest intFile.ZipFileManifest
    46  	location     file.Location
    47  	archivePath  string
    48  	contentPath  string
    49  	fileInfo     archiveFilename
    50  	detectNested bool
    51  }
    52  
    53  // parseJavaArchive is a parser function for java archive contents, returning all Java libraries and nested archives.
    54  func parseJavaArchive(_ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
    55  	parser, cleanupFn, err := newJavaArchiveParser(reader, true)
    56  	// note: even on error, we should always run cleanup functions
    57  	defer cleanupFn()
    58  	if err != nil {
    59  		return nil, nil, err
    60  	}
    61  	return parser.parse()
    62  }
    63  
    64  // uniquePkgKey creates a unique string to identify the given package.
    65  func uniquePkgKey(p *pkg.Package) string {
    66  	if p == nil {
    67  		return ""
    68  	}
    69  	return fmt.Sprintf("%s|%s", p.Name, p.Version)
    70  }
    71  
    72  // newJavaArchiveParser returns a new java archive parser object for the given archive. Can be configured to discover
    73  // and parse nested archives or ignore them.
    74  func newJavaArchiveParser(reader file.LocationReadCloser, detectNested bool) (*archiveParser, func(), error) {
    75  	// fetch the last element of the virtual path
    76  	virtualElements := strings.Split(reader.AccessPath(), ":")
    77  	currentFilepath := virtualElements[len(virtualElements)-1]
    78  
    79  	contentPath, archivePath, cleanupFn, err := saveArchiveToTmp(currentFilepath, reader)
    80  	if err != nil {
    81  		return nil, cleanupFn, fmt.Errorf("unable to process java archive: %w", err)
    82  	}
    83  
    84  	fileManifest, err := intFile.NewZipFileManifest(archivePath)
    85  	if err != nil {
    86  		return nil, cleanupFn, fmt.Errorf("unable to read files from java archive: %w", err)
    87  	}
    88  
    89  	return &archiveParser{
    90  		fileManifest: fileManifest,
    91  		location:     reader.Location,
    92  		archivePath:  archivePath,
    93  		contentPath:  contentPath,
    94  		fileInfo:     newJavaArchiveFilename(currentFilepath),
    95  		detectNested: detectNested,
    96  	}, cleanupFn, nil
    97  }
    98  
    99  // parse the loaded archive and return all packages found.
   100  func (j *archiveParser) parse() ([]pkg.Package, []artifact.Relationship, error) {
   101  	var pkgs []pkg.Package
   102  	var relationships []artifact.Relationship
   103  
   104  	// find the parent package from the java manifest
   105  	parentPkg, err := j.discoverMainPackage()
   106  	if err != nil {
   107  		return nil, nil, fmt.Errorf("could not generate package from %s: %w", j.location, err)
   108  	}
   109  
   110  	// find aux packages from pom.properties/pom.xml and potentially modify the existing parentPkg
   111  	// NOTE: we cannot generate sha1 digests from packages discovered via pom.properties/pom.xml
   112  	auxPkgs, err := j.discoverPkgsFromAllMavenFiles(parentPkg)
   113  	if err != nil {
   114  		return nil, nil, err
   115  	}
   116  	pkgs = append(pkgs, auxPkgs...)
   117  
   118  	if j.detectNested {
   119  		// find nested java archive packages
   120  		nestedPkgs, nestedRelationships, err := j.discoverPkgsFromNestedArchives(parentPkg)
   121  		if err != nil {
   122  			return nil, nil, err
   123  		}
   124  		pkgs = append(pkgs, nestedPkgs...)
   125  		relationships = append(relationships, nestedRelationships...)
   126  	}
   127  
   128  	// lastly, add the parent package to the list (assuming the parent exists)
   129  	if parentPkg != nil {
   130  		pkgs = append([]pkg.Package{*parentPkg}, pkgs...)
   131  	}
   132  
   133  	// add pURLs to all packages found
   134  	// note: since package information may change after initial creation when parsing multiple locations within the
   135  	// jar, we wait until the conclusion of the parsing process before synthesizing pURLs.
   136  	for i := range pkgs {
   137  		p := &pkgs[i]
   138  		if m, ok := p.Metadata.(pkg.JavaMetadata); ok {
   139  			p.PURL = packageURL(p.Name, p.Version, m)
   140  		} else {
   141  			log.WithFields("package", p.String()).Warn("unable to extract java metadata to generate purl")
   142  		}
   143  		p.SetID()
   144  	}
   145  
   146  	return pkgs, relationships, nil
   147  }
   148  
   149  // discoverMainPackage parses the root Java manifest used as the parent package to all discovered nested packages.
   150  func (j *archiveParser) discoverMainPackage() (*pkg.Package, error) {
   151  	// search and parse java manifest files
   152  	// TODO: do we want to prefer or check for pom files over manifest here?
   153  	manifestMatches := j.fileManifest.GlobMatch(manifestGlob)
   154  	if len(manifestMatches) > 1 {
   155  		return nil, fmt.Errorf("found multiple manifests in the jar: %+v", manifestMatches)
   156  	} else if len(manifestMatches) == 0 {
   157  		// we did not find any manifests, but that may not be a problem (there may be other information to generate packages for)
   158  		return nil, nil
   159  	}
   160  
   161  	// fetch the manifest file
   162  	contents, err := intFile.ContentsFromZip(j.archivePath, manifestMatches...)
   163  	if err != nil {
   164  		return nil, fmt.Errorf("unable to extract java manifests (%s): %w", j.location, err)
   165  	}
   166  
   167  	// parse the manifest file into a rich object
   168  	manifestContents := contents[manifestMatches[0]]
   169  	manifest, err := parseJavaManifest(j.archivePath, strings.NewReader(manifestContents))
   170  	if err != nil {
   171  		log.Warnf("failed to parse java manifest (%s): %+v", j.location, err)
   172  		return nil, nil
   173  	}
   174  
   175  	archiveCloser, err := os.Open(j.archivePath)
   176  	if err != nil {
   177  		return nil, fmt.Errorf("unable to open archive path (%s): %w", j.archivePath, err)
   178  	}
   179  	defer archiveCloser.Close()
   180  
   181  	// grab and assign digest for the entire archive
   182  	digests, err := file.NewDigestsFromFile(archiveCloser, javaArchiveHashes)
   183  	if err != nil {
   184  		log.Warnf("failed to create digest for file=%q: %+v", j.archivePath, err)
   185  	}
   186  
   187  	// we use j.location because we want to associate the license declaration with where we discovered the contents in the manifest
   188  	licenses := pkg.NewLicensesFromLocation(j.location, selectLicenses(manifest)...)
   189  	return &pkg.Package{
   190  		Name:     selectName(manifest, j.fileInfo),
   191  		Version:  selectVersion(manifest, j.fileInfo),
   192  		Language: pkg.Java,
   193  		Licenses: pkg.NewLicenseSet(licenses...),
   194  		Locations: file.NewLocationSet(
   195  			j.location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
   196  		),
   197  		Type:         j.fileInfo.pkgType(),
   198  		MetadataType: pkg.JavaMetadataType,
   199  		Metadata: pkg.JavaMetadata{
   200  			VirtualPath:    j.location.AccessPath(),
   201  			Manifest:       manifest,
   202  			ArchiveDigests: digests,
   203  		},
   204  	}, nil
   205  }
   206  
   207  // discoverPkgsFromAllMavenFiles parses Maven POM properties/xml for a given
   208  // parent package, returning all listed Java packages found for each pom
   209  // properties discovered and potentially updating the given parentPkg with new
   210  // data.
   211  func (j *archiveParser) discoverPkgsFromAllMavenFiles(parentPkg *pkg.Package) ([]pkg.Package, error) {
   212  	if parentPkg == nil {
   213  		return nil, nil
   214  	}
   215  
   216  	var pkgs []pkg.Package
   217  
   218  	// pom.properties
   219  	properties, err := pomPropertiesByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(pomPropertiesGlob))
   220  	if err != nil {
   221  		return nil, err
   222  	}
   223  
   224  	// pom.xml
   225  	projects, err := pomProjectByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(pomXMLGlob))
   226  	if err != nil {
   227  		return nil, err
   228  	}
   229  
   230  	for parentPath, propertiesObj := range properties {
   231  		var pomProject *pkg.PomProject
   232  		if proj, exists := projects[parentPath]; exists {
   233  			pomProject = &proj
   234  		}
   235  
   236  		pkgFromPom := newPackageFromMavenData(propertiesObj, pomProject, parentPkg, j.location)
   237  		if pkgFromPom != nil {
   238  			pkgs = append(pkgs, *pkgFromPom)
   239  		}
   240  	}
   241  
   242  	return pkgs, nil
   243  }
   244  
   245  func (j *archiveParser) discoverPkgsFromNestedArchives(parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) {
   246  	// we know that all java archives are zip formatted files, so we can use the shared zip helper
   247  	return discoverPkgsFromZip(j.location, j.archivePath, j.contentPath, j.fileManifest, parentPkg)
   248  }
   249  
   250  // discoverPkgsFromZip finds Java archives within Java archives, returning all listed Java packages found and
   251  // associating each discovered package to the given parent package.
   252  func discoverPkgsFromZip(location file.Location, archivePath, contentPath string, fileManifest intFile.ZipFileManifest, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) {
   253  	// search and parse pom.properties files & fetch the contents
   254  	openers, err := intFile.ExtractFromZipToUniqueTempFile(archivePath, contentPath, fileManifest.GlobMatch(archiveFormatGlobs...)...)
   255  	if err != nil {
   256  		return nil, nil, fmt.Errorf("unable to extract files from zip: %w", err)
   257  	}
   258  
   259  	return discoverPkgsFromOpeners(location, openers, parentPkg)
   260  }
   261  
   262  // discoverPkgsFromOpeners finds Java archives within the given files and associates them with the given parent package.
   263  func discoverPkgsFromOpeners(location file.Location, openers map[string]intFile.Opener, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) {
   264  	var pkgs []pkg.Package
   265  	var relationships []artifact.Relationship
   266  
   267  	for pathWithinArchive, archiveOpener := range openers {
   268  		nestedPkgs, nestedRelationships, err := discoverPkgsFromOpener(location, pathWithinArchive, archiveOpener)
   269  		if err != nil {
   270  			log.WithFields("location", location.AccessPath()).Warnf("unable to discover java packages from opener: %+v", err)
   271  			continue
   272  		}
   273  
   274  		// attach the parent package to all discovered packages that are not already associated with a java archive
   275  		for _, p := range nestedPkgs {
   276  			if metadata, ok := p.Metadata.(pkg.JavaMetadata); ok {
   277  				if metadata.Parent == nil {
   278  					metadata.Parent = parentPkg
   279  				}
   280  				p.Metadata = metadata
   281  			}
   282  			pkgs = append(pkgs, p)
   283  		}
   284  
   285  		relationships = append(relationships, nestedRelationships...)
   286  	}
   287  
   288  	return pkgs, relationships, nil
   289  }
   290  
   291  // discoverPkgsFromOpener finds Java archives within the given file.
   292  func discoverPkgsFromOpener(location file.Location, pathWithinArchive string, archiveOpener intFile.Opener) ([]pkg.Package, []artifact.Relationship, error) {
   293  	archiveReadCloser, err := archiveOpener.Open()
   294  	if err != nil {
   295  		return nil, nil, fmt.Errorf("unable to open archived file from tempdir: %w", err)
   296  	}
   297  	defer func() {
   298  		if closeErr := archiveReadCloser.Close(); closeErr != nil {
   299  			log.Warnf("unable to close archived file from tempdir: %+v", closeErr)
   300  		}
   301  	}()
   302  
   303  	nestedPath := fmt.Sprintf("%s:%s", location.AccessPath(), pathWithinArchive)
   304  	nestedLocation := file.NewLocationFromCoordinates(location.Coordinates)
   305  	nestedLocation.VirtualPath = nestedPath
   306  	nestedPkgs, nestedRelationships, err := parseJavaArchive(nil, nil, file.LocationReadCloser{
   307  		Location:   nestedLocation,
   308  		ReadCloser: archiveReadCloser,
   309  	})
   310  	if err != nil {
   311  		return nil, nil, fmt.Errorf("unable to process nested java archive (%s): %w", pathWithinArchive, err)
   312  	}
   313  
   314  	return nestedPkgs, nestedRelationships, nil
   315  }
   316  
   317  func pomPropertiesByParentPath(archivePath string, location file.Location, extractPaths []string) (map[string]pkg.PomProperties, error) {
   318  	contentsOfMavenPropertiesFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...)
   319  	if err != nil {
   320  		return nil, fmt.Errorf("unable to extract maven files: %w", err)
   321  	}
   322  
   323  	propertiesByParentPath := make(map[string]pkg.PomProperties)
   324  	for filePath, fileContents := range contentsOfMavenPropertiesFiles {
   325  		pomProperties, err := parsePomProperties(filePath, strings.NewReader(fileContents))
   326  		if err != nil {
   327  			log.WithFields("contents-path", filePath, "location", location.AccessPath()).Warnf("failed to parse pom.properties: %+v", err)
   328  			continue
   329  		}
   330  
   331  		if pomProperties == nil {
   332  			continue
   333  		}
   334  
   335  		if pomProperties.Version == "" || pomProperties.ArtifactID == "" {
   336  			// TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. We should discover the right parent and attach the correct info accordingly to each discovered package
   337  			continue
   338  		}
   339  
   340  		propertiesByParentPath[path.Dir(filePath)] = *pomProperties
   341  	}
   342  
   343  	return propertiesByParentPath, nil
   344  }
   345  
   346  func pomProjectByParentPath(archivePath string, location file.Location, extractPaths []string) (map[string]pkg.PomProject, error) {
   347  	contentsOfMavenProjectFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...)
   348  	if err != nil {
   349  		return nil, fmt.Errorf("unable to extract maven files: %w", err)
   350  	}
   351  
   352  	projectByParentPath := make(map[string]pkg.PomProject)
   353  	for filePath, fileContents := range contentsOfMavenProjectFiles {
   354  		pomProject, err := parsePomXMLProject(filePath, strings.NewReader(fileContents))
   355  		if err != nil {
   356  			log.WithFields("contents-path", filePath, "location", location.AccessPath()).Warnf("failed to parse pom.xml: %+v", err)
   357  			continue
   358  		}
   359  
   360  		if pomProject == nil {
   361  			continue
   362  		}
   363  
   364  		if pomProject.Version == "" || pomProject.ArtifactID == "" {
   365  			// TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. We should discover the right parent and attach the correct info accordingly to each discovered package
   366  			continue
   367  		}
   368  
   369  		projectByParentPath[path.Dir(filePath)] = *pomProject
   370  	}
   371  	return projectByParentPath, nil
   372  }
   373  
   374  // packagesFromPomProperties processes a single Maven POM properties for a given parent package, returning all listed Java packages found and
   375  // associating each discovered package to the given parent package. Note the pom.xml is optional, the pom.properties is not.
   376  func newPackageFromMavenData(pomProperties pkg.PomProperties, pomProject *pkg.PomProject, parentPkg *pkg.Package, location file.Location) *pkg.Package {
   377  	// keep the artifact name within the virtual path if this package does not match the parent package
   378  	vPathSuffix := ""
   379  	if !strings.HasPrefix(pomProperties.ArtifactID, parentPkg.Name) {
   380  		vPathSuffix += ":" + pomProperties.ArtifactID
   381  	}
   382  	virtualPath := location.AccessPath() + vPathSuffix
   383  
   384  	// discovered props = new package
   385  	p := pkg.Package{
   386  		Name:    pomProperties.ArtifactID,
   387  		Version: pomProperties.Version,
   388  		Locations: file.NewLocationSet(
   389  			location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
   390  		),
   391  		Language:     pkg.Java,
   392  		Type:         pomProperties.PkgTypeIndicated(),
   393  		MetadataType: pkg.JavaMetadataType,
   394  		Metadata: pkg.JavaMetadata{
   395  			VirtualPath:   virtualPath,
   396  			PomProperties: &pomProperties,
   397  			PomProject:    pomProject,
   398  			Parent:        parentPkg,
   399  		},
   400  	}
   401  
   402  	if packageIdentitiesMatch(p, parentPkg) {
   403  		updateParentPackage(p, parentPkg)
   404  		return nil
   405  	}
   406  
   407  	return &p
   408  }
   409  
   410  func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool {
   411  	// the name/version pair matches...
   412  	if uniquePkgKey(&p) == uniquePkgKey(parentPkg) {
   413  		return true
   414  	}
   415  
   416  	metadata, ok := p.Metadata.(pkg.JavaMetadata)
   417  	if !ok {
   418  		log.WithFields("package", p.String()).Warn("unable to extract java metadata to check for matching package identity")
   419  		return false
   420  	}
   421  
   422  	parentMetadata, ok := parentPkg.Metadata.(pkg.JavaMetadata)
   423  	if !ok {
   424  		log.WithFields("package", p.String()).Warn("unable to extract java metadata from parent for verifying virtual path")
   425  		return false
   426  	}
   427  
   428  	// the virtual path matches...
   429  	if parentMetadata.VirtualPath == metadata.VirtualPath {
   430  		return true
   431  	}
   432  
   433  	// the pom artifactId is the parent name
   434  	// note: you CANNOT use name-is-subset-of-artifact-id or vice versa --this is too generic. Shaded jars are a good
   435  	// example of this: where the package name is "cloudbees-analytics-segment-driver" and a child is "analytics", but
   436  	// they do not indicate the same package.
   437  	if metadata.PomProperties.ArtifactID != "" && parentPkg.Name == metadata.PomProperties.ArtifactID {
   438  		return true
   439  	}
   440  
   441  	return false
   442  }
   443  
   444  func updateParentPackage(p pkg.Package, parentPkg *pkg.Package) {
   445  	// we've run across more information about our parent package, add this info to the parent package metadata
   446  	// the pom properties is typically a better source of information for name and version than the manifest
   447  	parentPkg.Name = p.Name
   448  	parentPkg.Version = p.Version
   449  
   450  	// we may have learned more about the type via data in the pom properties
   451  	parentPkg.Type = p.Type
   452  
   453  	metadata, ok := p.Metadata.(pkg.JavaMetadata)
   454  	if !ok {
   455  		return
   456  	}
   457  	pomPropertiesCopy := *metadata.PomProperties
   458  
   459  	// keep the pom properties, but don't overwrite existing pom properties
   460  	parentMetadata, ok := parentPkg.Metadata.(pkg.JavaMetadata)
   461  	if ok && parentMetadata.PomProperties == nil {
   462  		parentMetadata.PomProperties = &pomPropertiesCopy
   463  		parentPkg.Metadata = parentMetadata
   464  	}
   465  }