// github.com/nextlinux/gosbom@v0.81.1-0.20230627115839-1ff50c281391/gosbom/pkg/cataloger/java/archive_parser.go (about)

package java

import (
	"crypto"
	"fmt"
	"os"
	"path"
	"strings"

	"github.com/nextlinux/gosbom/gosbom/artifact"
	"github.com/nextlinux/gosbom/gosbom/file"
	"github.com/nextlinux/gosbom/gosbom/pkg"
	"github.com/nextlinux/gosbom/gosbom/pkg/cataloger/generic"
	intFile "github.com/nextlinux/gosbom/internal/file"
	"github.com/nextlinux/gosbom/internal/log"
)

// compile-time check that parseJavaArchive can be used as a generic.Parser
var _ generic.Parser = parseJavaArchive

// archiveFormatGlobs lists the glob patterns of files that are treated as java archives. It is also the set of
// patterns used to find nested archives inside an archive that is being parsed.
var archiveFormatGlobs = []string{
	"**/*.jar",
	"**/*.war",
	"**/*.ear",
	"**/*.par",
	"**/*.sar",
	"**/*.nar",
	"**/*.jpi",
	"**/*.hpi",
	"**/*.lpkg", // Zip-compressed package used to deploy applications
	// (aka plugins) to Liferay Portal server. Those files contain .JAR(s) and a .PROPERTIES file, the latter
	// has information about the application and installation requirements.
	// NOTE(jonasagx): If you would like to test it with lpkg file,
	// use: https://web.liferay.com/marketplace/-/mp/download/25019275/7403
	// LifeRay makes it pretty cumbersome to make such plugins; their docs are
	// out of date, and they charge for their IDE. If you find an example
	// project that we can build in CI feel free to include it
}

// javaArchiveHashes are all the current hash algorithms used to calculate archive digests
var javaArchiveHashes = []crypto.Hash{
	crypto.SHA1,
}

// archiveParser holds the state needed to catalog a single java archive.
type archiveParser struct {
	// fileManifest is the zip file manifest built from archivePath; it is glob-matched to find the java manifest,
	// maven files, and nested archives.
	fileManifest intFile.ZipFileManifest
	// location is the location of the reader the archive was loaded from.
	location file.Location
	// archivePath is the path the zip manifest and archive contents are read from.
	// NOTE(review): presumably the temp copy of the archive written by saveArchiveToTmp -- confirm against that helper.
	archivePath string
	// contentPath is handed to the zip extraction helper when unpacking nested archives.
	// NOTE(review): presumably a temp directory created by saveArchiveToTmp -- confirm against that helper.
	contentPath string
	// fileInfo is derived from the last element of the archive's virtual path.
	fileInfo archiveFilename
	// detectNested enables discovery of packages from archives nested within this archive.
	detectNested bool
}

// parseJavaArchive is a parser function for java archive contents, returning all Java libraries and nested archives.
54 func parseJavaArchive(_ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { 55 parser, cleanupFn, err := newJavaArchiveParser(reader, true) 56 // note: even on error, we should always run cleanup functions 57 defer cleanupFn() 58 if err != nil { 59 return nil, nil, err 60 } 61 return parser.parse() 62 } 63 64 // uniquePkgKey creates a unique string to identify the given package. 65 func uniquePkgKey(p *pkg.Package) string { 66 if p == nil { 67 return "" 68 } 69 return fmt.Sprintf("%s|%s", p.Name, p.Version) 70 } 71 72 // newJavaArchiveParser returns a new java archive parser object for the given archive. Can be configured to discover 73 // and parse nested archives or ignore them. 74 func newJavaArchiveParser(reader file.LocationReadCloser, detectNested bool) (*archiveParser, func(), error) { 75 // fetch the last element of the virtual path 76 virtualElements := strings.Split(reader.AccessPath(), ":") 77 currentFilepath := virtualElements[len(virtualElements)-1] 78 79 contentPath, archivePath, cleanupFn, err := saveArchiveToTmp(currentFilepath, reader) 80 if err != nil { 81 return nil, cleanupFn, fmt.Errorf("unable to process java archive: %w", err) 82 } 83 84 fileManifest, err := intFile.NewZipFileManifest(archivePath) 85 if err != nil { 86 return nil, cleanupFn, fmt.Errorf("unable to read files from java archive: %w", err) 87 } 88 89 return &archiveParser{ 90 fileManifest: fileManifest, 91 location: reader.Location, 92 archivePath: archivePath, 93 contentPath: contentPath, 94 fileInfo: newJavaArchiveFilename(currentFilepath), 95 detectNested: detectNested, 96 }, cleanupFn, nil 97 } 98 99 // parse the loaded archive and return all packages found. 
100 func (j *archiveParser) parse() ([]pkg.Package, []artifact.Relationship, error) { 101 var pkgs []pkg.Package 102 var relationships []artifact.Relationship 103 104 // find the parent package from the java manifest 105 parentPkg, err := j.discoverMainPackage() 106 if err != nil { 107 return nil, nil, fmt.Errorf("could not generate package from %s: %w", j.location, err) 108 } 109 110 // find aux packages from pom.properties/pom.xml and potentially modify the existing parentPkg 111 // NOTE: we cannot generate sha1 digests from packages discovered via pom.properties/pom.xml 112 auxPkgs, err := j.discoverPkgsFromAllMavenFiles(parentPkg) 113 if err != nil { 114 return nil, nil, err 115 } 116 pkgs = append(pkgs, auxPkgs...) 117 118 if j.detectNested { 119 // find nested java archive packages 120 nestedPkgs, nestedRelationships, err := j.discoverPkgsFromNestedArchives(parentPkg) 121 if err != nil { 122 return nil, nil, err 123 } 124 pkgs = append(pkgs, nestedPkgs...) 125 relationships = append(relationships, nestedRelationships...) 126 } 127 128 // lastly, add the parent package to the list (assuming the parent exists) 129 if parentPkg != nil { 130 pkgs = append([]pkg.Package{*parentPkg}, pkgs...) 131 } 132 133 // add pURLs to all packages found 134 // note: since package information may change after initial creation when parsing multiple locations within the 135 // jar, we wait until the conclusion of the parsing process before synthesizing pURLs. 136 for i := range pkgs { 137 p := &pkgs[i] 138 if m, ok := p.Metadata.(pkg.JavaMetadata); ok { 139 p.PURL = packageURL(p.Name, p.Version, m) 140 } else { 141 log.WithFields("package", p.String()).Warn("unable to extract java metadata to generate purl") 142 } 143 p.SetID() 144 } 145 146 return pkgs, relationships, nil 147 } 148 149 // discoverMainPackage parses the root Java manifest used as the parent package to all discovered nested packages. 
150 func (j *archiveParser) discoverMainPackage() (*pkg.Package, error) { 151 // search and parse java manifest files 152 // TODO: do we want to prefer or check for pom files over manifest here? 153 manifestMatches := j.fileManifest.GlobMatch(manifestGlob) 154 if len(manifestMatches) > 1 { 155 return nil, fmt.Errorf("found multiple manifests in the jar: %+v", manifestMatches) 156 } else if len(manifestMatches) == 0 { 157 // we did not find any manifests, but that may not be a problem (there may be other information to generate packages for) 158 return nil, nil 159 } 160 161 // fetch the manifest file 162 contents, err := intFile.ContentsFromZip(j.archivePath, manifestMatches...) 163 if err != nil { 164 return nil, fmt.Errorf("unable to extract java manifests (%s): %w", j.location, err) 165 } 166 167 // parse the manifest file into a rich object 168 manifestContents := contents[manifestMatches[0]] 169 manifest, err := parseJavaManifest(j.archivePath, strings.NewReader(manifestContents)) 170 if err != nil { 171 log.Warnf("failed to parse java manifest (%s): %+v", j.location, err) 172 return nil, nil 173 } 174 175 archiveCloser, err := os.Open(j.archivePath) 176 if err != nil { 177 return nil, fmt.Errorf("unable to open archive path (%s): %w", j.archivePath, err) 178 } 179 defer archiveCloser.Close() 180 181 // grab and assign digest for the entire archive 182 digests, err := file.NewDigestsFromFile(archiveCloser, javaArchiveHashes) 183 if err != nil { 184 log.Warnf("failed to create digest for file=%q: %+v", j.archivePath, err) 185 } 186 187 // we use j.location because we want to associate the license declaration with where we discovered the contents in the manifest 188 licenses := pkg.NewLicensesFromLocation(j.location, selectLicenses(manifest)...) 
189 return &pkg.Package{ 190 Name: selectName(manifest, j.fileInfo), 191 Version: selectVersion(manifest, j.fileInfo), 192 Language: pkg.Java, 193 Licenses: pkg.NewLicenseSet(licenses...), 194 Locations: file.NewLocationSet( 195 j.location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), 196 ), 197 Type: j.fileInfo.pkgType(), 198 MetadataType: pkg.JavaMetadataType, 199 Metadata: pkg.JavaMetadata{ 200 VirtualPath: j.location.AccessPath(), 201 Manifest: manifest, 202 ArchiveDigests: digests, 203 }, 204 }, nil 205 } 206 207 // discoverPkgsFromAllMavenFiles parses Maven POM properties/xml for a given 208 // parent package, returning all listed Java packages found for each pom 209 // properties discovered and potentially updating the given parentPkg with new 210 // data. 211 func (j *archiveParser) discoverPkgsFromAllMavenFiles(parentPkg *pkg.Package) ([]pkg.Package, error) { 212 if parentPkg == nil { 213 return nil, nil 214 } 215 216 var pkgs []pkg.Package 217 218 // pom.properties 219 properties, err := pomPropertiesByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(pomPropertiesGlob)) 220 if err != nil { 221 return nil, err 222 } 223 224 // pom.xml 225 projects, err := pomProjectByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(pomXMLGlob)) 226 if err != nil { 227 return nil, err 228 } 229 230 for parentPath, propertiesObj := range properties { 231 var pomProject *pkg.PomProject 232 if proj, exists := projects[parentPath]; exists { 233 pomProject = &proj 234 } 235 236 pkgFromPom := newPackageFromMavenData(propertiesObj, pomProject, parentPkg, j.location) 237 if pkgFromPom != nil { 238 pkgs = append(pkgs, *pkgFromPom) 239 } 240 } 241 242 return pkgs, nil 243 } 244 245 func (j *archiveParser) discoverPkgsFromNestedArchives(parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) { 246 // we know that all java archives are zip formatted files, so we can use the shared zip helper 247 return 
discoverPkgsFromZip(j.location, j.archivePath, j.contentPath, j.fileManifest, parentPkg) 248 } 249 250 // discoverPkgsFromZip finds Java archives within Java archives, returning all listed Java packages found and 251 // associating each discovered package to the given parent package. 252 func discoverPkgsFromZip(location file.Location, archivePath, contentPath string, fileManifest intFile.ZipFileManifest, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) { 253 // search and parse pom.properties files & fetch the contents 254 openers, err := intFile.ExtractFromZipToUniqueTempFile(archivePath, contentPath, fileManifest.GlobMatch(archiveFormatGlobs...)...) 255 if err != nil { 256 return nil, nil, fmt.Errorf("unable to extract files from zip: %w", err) 257 } 258 259 return discoverPkgsFromOpeners(location, openers, parentPkg) 260 } 261 262 // discoverPkgsFromOpeners finds Java archives within the given files and associates them with the given parent package. 263 func discoverPkgsFromOpeners(location file.Location, openers map[string]intFile.Opener, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) { 264 var pkgs []pkg.Package 265 var relationships []artifact.Relationship 266 267 for pathWithinArchive, archiveOpener := range openers { 268 nestedPkgs, nestedRelationships, err := discoverPkgsFromOpener(location, pathWithinArchive, archiveOpener) 269 if err != nil { 270 log.WithFields("location", location.AccessPath()).Warnf("unable to discover java packages from opener: %+v", err) 271 continue 272 } 273 274 // attach the parent package to all discovered packages that are not already associated with a java archive 275 for _, p := range nestedPkgs { 276 if metadata, ok := p.Metadata.(pkg.JavaMetadata); ok { 277 if metadata.Parent == nil { 278 metadata.Parent = parentPkg 279 } 280 p.Metadata = metadata 281 } 282 pkgs = append(pkgs, p) 283 } 284 285 relationships = append(relationships, nestedRelationships...) 
286 } 287 288 return pkgs, relationships, nil 289 } 290 291 // discoverPkgsFromOpener finds Java archives within the given file. 292 func discoverPkgsFromOpener(location file.Location, pathWithinArchive string, archiveOpener intFile.Opener) ([]pkg.Package, []artifact.Relationship, error) { 293 archiveReadCloser, err := archiveOpener.Open() 294 if err != nil { 295 return nil, nil, fmt.Errorf("unable to open archived file from tempdir: %w", err) 296 } 297 defer func() { 298 if closeErr := archiveReadCloser.Close(); closeErr != nil { 299 log.Warnf("unable to close archived file from tempdir: %+v", closeErr) 300 } 301 }() 302 303 nestedPath := fmt.Sprintf("%s:%s", location.AccessPath(), pathWithinArchive) 304 nestedLocation := file.NewLocationFromCoordinates(location.Coordinates) 305 nestedLocation.VirtualPath = nestedPath 306 nestedPkgs, nestedRelationships, err := parseJavaArchive(nil, nil, file.LocationReadCloser{ 307 Location: nestedLocation, 308 ReadCloser: archiveReadCloser, 309 }) 310 if err != nil { 311 return nil, nil, fmt.Errorf("unable to process nested java archive (%s): %w", pathWithinArchive, err) 312 } 313 314 return nestedPkgs, nestedRelationships, nil 315 } 316 317 func pomPropertiesByParentPath(archivePath string, location file.Location, extractPaths []string) (map[string]pkg.PomProperties, error) { 318 contentsOfMavenPropertiesFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...) 
319 if err != nil { 320 return nil, fmt.Errorf("unable to extract maven files: %w", err) 321 } 322 323 propertiesByParentPath := make(map[string]pkg.PomProperties) 324 for filePath, fileContents := range contentsOfMavenPropertiesFiles { 325 pomProperties, err := parsePomProperties(filePath, strings.NewReader(fileContents)) 326 if err != nil { 327 log.WithFields("contents-path", filePath, "location", location.AccessPath()).Warnf("failed to parse pom.properties: %+v", err) 328 continue 329 } 330 331 if pomProperties == nil { 332 continue 333 } 334 335 if pomProperties.Version == "" || pomProperties.ArtifactID == "" { 336 // TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. We should discover the right parent and attach the correct info accordingly to each discovered package 337 continue 338 } 339 340 propertiesByParentPath[path.Dir(filePath)] = *pomProperties 341 } 342 343 return propertiesByParentPath, nil 344 } 345 346 func pomProjectByParentPath(archivePath string, location file.Location, extractPaths []string) (map[string]pkg.PomProject, error) { 347 contentsOfMavenProjectFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...) 348 if err != nil { 349 return nil, fmt.Errorf("unable to extract maven files: %w", err) 350 } 351 352 projectByParentPath := make(map[string]pkg.PomProject) 353 for filePath, fileContents := range contentsOfMavenProjectFiles { 354 pomProject, err := parsePomXMLProject(filePath, strings.NewReader(fileContents)) 355 if err != nil { 356 log.WithFields("contents-path", filePath, "location", location.AccessPath()).Warnf("failed to parse pom.xml: %+v", err) 357 continue 358 } 359 360 if pomProject == nil { 361 continue 362 } 363 364 if pomProject.Version == "" || pomProject.ArtifactID == "" { 365 // TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. 
We should discover the right parent and attach the correct info accordingly to each discovered package 366 continue 367 } 368 369 projectByParentPath[path.Dir(filePath)] = *pomProject 370 } 371 return projectByParentPath, nil 372 } 373 374 // packagesFromPomProperties processes a single Maven POM properties for a given parent package, returning all listed Java packages found and 375 // associating each discovered package to the given parent package. Note the pom.xml is optional, the pom.properties is not. 376 func newPackageFromMavenData(pomProperties pkg.PomProperties, pomProject *pkg.PomProject, parentPkg *pkg.Package, location file.Location) *pkg.Package { 377 // keep the artifact name within the virtual path if this package does not match the parent package 378 vPathSuffix := "" 379 if !strings.HasPrefix(pomProperties.ArtifactID, parentPkg.Name) { 380 vPathSuffix += ":" + pomProperties.ArtifactID 381 } 382 virtualPath := location.AccessPath() + vPathSuffix 383 384 // discovered props = new package 385 p := pkg.Package{ 386 Name: pomProperties.ArtifactID, 387 Version: pomProperties.Version, 388 Locations: file.NewLocationSet( 389 location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), 390 ), 391 Language: pkg.Java, 392 Type: pomProperties.PkgTypeIndicated(), 393 MetadataType: pkg.JavaMetadataType, 394 Metadata: pkg.JavaMetadata{ 395 VirtualPath: virtualPath, 396 PomProperties: &pomProperties, 397 PomProject: pomProject, 398 Parent: parentPkg, 399 }, 400 } 401 402 if packageIdentitiesMatch(p, parentPkg) { 403 updateParentPackage(p, parentPkg) 404 return nil 405 } 406 407 return &p 408 } 409 410 func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool { 411 // the name/version pair matches... 
412 if uniquePkgKey(&p) == uniquePkgKey(parentPkg) { 413 return true 414 } 415 416 metadata, ok := p.Metadata.(pkg.JavaMetadata) 417 if !ok { 418 log.WithFields("package", p.String()).Warn("unable to extract java metadata to check for matching package identity") 419 return false 420 } 421 422 parentMetadata, ok := parentPkg.Metadata.(pkg.JavaMetadata) 423 if !ok { 424 log.WithFields("package", p.String()).Warn("unable to extract java metadata from parent for verifying virtual path") 425 return false 426 } 427 428 // the virtual path matches... 429 if parentMetadata.VirtualPath == metadata.VirtualPath { 430 return true 431 } 432 433 // the pom artifactId is the parent name 434 // note: you CANNOT use name-is-subset-of-artifact-id or vice versa --this is too generic. Shaded jars are a good 435 // example of this: where the package name is "cloudbees-analytics-segment-driver" and a child is "analytics", but 436 // they do not indicate the same package. 437 if metadata.PomProperties.ArtifactID != "" && parentPkg.Name == metadata.PomProperties.ArtifactID { 438 return true 439 } 440 441 return false 442 } 443 444 func updateParentPackage(p pkg.Package, parentPkg *pkg.Package) { 445 // we've run across more information about our parent package, add this info to the parent package metadata 446 // the pom properties is typically a better source of information for name and version than the manifest 447 parentPkg.Name = p.Name 448 parentPkg.Version = p.Version 449 450 // we may have learned more about the type via data in the pom properties 451 parentPkg.Type = p.Type 452 453 metadata, ok := p.Metadata.(pkg.JavaMetadata) 454 if !ok { 455 return 456 } 457 pomPropertiesCopy := *metadata.PomProperties 458 459 // keep the pom properties, but don't overwrite existing pom properties 460 parentMetadata, ok := parentPkg.Metadata.(pkg.JavaMetadata) 461 if ok && parentMetadata.PomProperties == nil { 462 parentMetadata.PomProperties = &pomPropertiesCopy 463 parentPkg.Metadata = 
parentMetadata 464 } 465 }