// github.com/anchore/syft@v1.38.2/syft/pkg/cataloger/java/archive_parser.go

package java

import (
	"cmp"
	"context"
	"crypto"
	"fmt"
	"io"
	"iter"
	"os"
	"path"
	"slices"
	"strings"

	"github.com/scylladb/go-set/strset"
	"golang.org/x/exp/maps"

	"github.com/anchore/syft/internal"
	intFile "github.com/anchore/syft/internal/file"
	"github.com/anchore/syft/internal/log"
	"github.com/anchore/syft/internal/unknown"
	"github.com/anchore/syft/syft/artifact"
	"github.com/anchore/syft/syft/file"
	"github.com/anchore/syft/syft/pkg"
	"github.com/anchore/syft/syft/pkg/cataloger/generic"
	"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
	"github.com/anchore/syft/syft/pkg/cataloger/java/internal/maven"
)

// archiveFormatGlobs are the glob patterns for the archive file extensions handled by this cataloger.
// Within this file they select the nested archives to extract in discoverPkgsFromZip.
var archiveFormatGlobs = []string{
	"**/*.jar",
	"**/*.war",
	"**/*.ear",
	"**/*.par",
	"**/*.sar",
	"**/*.nar",
	"**/*.jpi",
	"**/*.hpi",
	"**/*.kar",
	"**/*.far",
	"**/*.lpkg", // Zip-compressed package used to deploy applications
	// (aka plugins) to Liferay Portal server. Those files contain .JAR(s) and a .PROPERTIES file; the latter
	// has information about the application and installation requirements.
	// NOTE(jonasagx): If you would like to test it with an lpkg file,
	// use: https://web.liferay.com/marketplace/-/mp/download/25019275/7403
	// LifeRay makes it pretty cumbersome to make such plugins; their docs are
	// out of date, and they charge for their IDE. If you find an example
	// project that we can build in CI feel free to include it
	"**/*.rar", // Java Resource Adapter Archive
}

// javaArchiveHashes are all the current hash algorithms used to calculate archive digests
var javaArchiveHashes = []crypto.Hash{
	crypto.SHA1,
}

// archiveParser holds the state needed to catalog a single java archive that has been saved to a temp location.
type archiveParser struct {
	// fileManifest is the zip manifest read from archivePath
	fileManifest intFile.ZipFileManifest
	// location is the location of the archive as given by the reader
	location file.Location
	// archivePath and contentPath are the paths returned by saveArchiveToTmp;
	// contentPath is handed to the nested-archive extraction (presumably the temp content directory — TODO confirm)
	archivePath string
	contentPath string
	// fileInfo is derived from the last element of the archive's virtual path
	fileInfo archiveFilename
	// detectNested controls whether nested archives are cataloged or only reported as not cataloged
	detectNested bool
	cfg          ArchiveCatalogerConfig
	// maven is the resolver used for pom identity and license lookups
	maven *maven.Resolver
}

// genericArchiveParserAdapter carries the cataloger configuration for the java archive parser functions.
type genericArchiveParserAdapter struct {
	cfg ArchiveCatalogerConfig
}

// newGenericArchiveParserAdapter returns an adapter that parses java archives using the given configuration.
func newGenericArchiveParserAdapter(cfg ArchiveCatalogerConfig) genericArchiveParserAdapter {
	return genericArchiveParserAdapter{cfg: cfg}
}

// parseJavaArchive is a parser function for java archive contents, returning all Java libraries and nested archives
// The resolver and environment arguments are unused; the archive is processed without a parent package.
func (gap genericArchiveParserAdapter) parseJavaArchive(ctx context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
	return gap.processJavaArchive(ctx, reader, nil)
}

// processJavaArchive processes an archive for java contents, returning all Java libraries and nested archives
// When parentPkg is non-nil it is passed through to the parser so discovered packages can be related to it.
func (gap genericArchiveParserAdapter) processJavaArchive(ctx context.Context, reader file.LocationReadCloser, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) {
	// nested-archive detection is always enabled on this path
	parser, cleanupFn, err := newJavaArchiveParser(ctx, reader, true, gap.cfg)
	// note: even on error, we should always run cleanup functions
	defer cleanupFn()
	if err != nil {
		return nil, nil, err
	}
	return parser.parse(ctx, parentPkg)
}

// uniquePkgKey creates a unique string to identify the given package.
93 func uniquePkgKey(groupID string, p *pkg.Package) string { 94 if p == nil { 95 return "" 96 } 97 return fmt.Sprintf("%s|%s|%s", groupID, p.Name, p.Version) 98 } 99 100 // newJavaArchiveParser returns a new java archive parser object for the given archive. Can be configured to discover 101 // and parse nested archives or ignore them. 102 func newJavaArchiveParser(ctx context.Context, reader file.LocationReadCloser, detectNested bool, cfg ArchiveCatalogerConfig) (*archiveParser, func(), error) { 103 // fetch the last element of the virtual path 104 virtualElements := strings.Split(reader.Path(), ":") 105 currentFilepath := virtualElements[len(virtualElements)-1] 106 107 contentPath, archivePath, cleanupFn, err := saveArchiveToTmp(currentFilepath, reader) 108 if err != nil { 109 return nil, cleanupFn, fmt.Errorf("unable to process java archive: %w", err) 110 } 111 112 fileManifest, err := intFile.NewZipFileManifest(ctx, archivePath) 113 if err != nil { 114 return nil, cleanupFn, fmt.Errorf("unable to read files from java archive: %w", err) 115 } 116 117 return &archiveParser{ 118 fileManifest: fileManifest, 119 location: reader.Location, 120 archivePath: archivePath, 121 contentPath: contentPath, 122 fileInfo: newJavaArchiveFilename(currentFilepath), 123 detectNested: detectNested, 124 cfg: cfg, 125 maven: maven.NewResolver(nil, cfg.mavenConfig()), 126 }, cleanupFn, nil 127 } 128 129 // parse the loaded archive and return all packages found. 
130 func (j *archiveParser) parse(ctx context.Context, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) { 131 var pkgs []pkg.Package 132 var relationships []artifact.Relationship 133 134 // find the parent package from the java manifest 135 mainPkg, err := j.discoverMainPackage(ctx) 136 if err != nil { 137 return nil, nil, fmt.Errorf("could not generate package from %s: %w", j.location, err) 138 } 139 140 // find aux packages from pom.properties/pom.xml and potentially modify the existing parentPkg 141 // NOTE: we cannot generate sha1 digests from packages discovered via pom.properties/pom.xml 142 // IMPORTANT!: discoverPkgsFromAllMavenFiles may change mainPkg information, so needs to be called before SetID and before copying for relationships, etc. 143 auxPkgs, err := j.discoverPkgsFromAllMavenFiles(ctx, mainPkg) 144 if err != nil { 145 return nil, nil, err 146 } 147 148 if mainPkg != nil { 149 finalizePackage(mainPkg) 150 pkgs = append(pkgs, *mainPkg) 151 152 if parentPkg != nil { 153 relationships = append(relationships, artifact.Relationship{ 154 From: *mainPkg, 155 To: *parentPkg, 156 Type: artifact.DependencyOfRelationship, 157 }) 158 } 159 } 160 161 for i := range auxPkgs { 162 auxPkg := &auxPkgs[i] 163 164 finalizePackage(auxPkg) 165 pkgs = append(pkgs, *auxPkg) 166 167 if mainPkg != nil { 168 relationships = append(relationships, artifact.Relationship{ 169 From: *auxPkg, 170 To: *mainPkg, 171 Type: artifact.DependencyOfRelationship, 172 }) 173 } 174 } 175 176 var errs error 177 if j.detectNested { 178 // find nested java archive packages 179 nestedPkgs, nestedRelationships, err := j.discoverPkgsFromNestedArchives(ctx, mainPkg) 180 if err != nil { 181 errs = unknown.Append(errs, j.location, err) 182 } 183 pkgs = append(pkgs, nestedPkgs...) 184 relationships = append(relationships, nestedRelationships...) 185 } else { 186 // .jar and .war files are present in archives, are others? or generally just consider them top-level? 
187 nestedArchives := j.fileManifest.GlobMatch(true, "**/*.jar", "**/*.war") 188 if len(nestedArchives) > 0 { 189 slices.Sort(nestedArchives) 190 errs = unknown.Appendf(errs, j.location, "nested archives not cataloged: %v", strings.Join(nestedArchives, ", ")) 191 } 192 } 193 194 if len(pkgs) == 0 { 195 errs = unknown.Appendf(errs, j.location, "no package identified in archive") 196 } 197 198 return pkgs, relationships, errs 199 } 200 201 // finalizePackage potentially updates some package information such as classifying the package as a Jenkins plugin, 202 // sets the PURL, and calls p.SetID() 203 func finalizePackage(p *pkg.Package) { 204 if m, ok := p.Metadata.(pkg.JavaArchive); ok { 205 p.PURL = packageURL(p.Name, p.Version, m) 206 207 if strings.Contains(p.PURL, "io.jenkins.plugins") || strings.Contains(p.PURL, "org.jenkins-ci.plugins") { 208 p.Type = pkg.JenkinsPluginPkg 209 } 210 } else { 211 log.WithFields("package", p.String()).Debug("unable to extract java metadata to generate purl") 212 } 213 214 p.SetID() 215 } 216 217 // discoverMainPackage parses the root Java manifest used as the parent package to all discovered nested packages. 218 func (j *archiveParser) discoverMainPackage(ctx context.Context) (*pkg.Package, error) { 219 // search and parse java manifest files 220 manifestMatches := j.fileManifest.GlobMatch(false, manifestGlob) 221 if len(manifestMatches) > 1 { 222 return nil, fmt.Errorf("found multiple manifests in the jar: %+v", manifestMatches) 223 } else if len(manifestMatches) == 0 { 224 // we did not find any manifests, but that may not be a problem (there may be other information to generate packages for) 225 return nil, nil 226 } 227 228 // fetch the manifest file 229 contents, err := intFile.ContentsFromZip(ctx, j.archivePath, manifestMatches...) 
230 if err != nil { 231 return nil, fmt.Errorf("unable to extract java manifests (%s): %w", j.location, err) 232 } 233 234 // parse the manifest file into a rich object 235 manifestContents := contents[manifestMatches[0]] 236 manifest, err := parseJavaManifest(j.archivePath, strings.NewReader(manifestContents)) 237 if err != nil { 238 log.Debugf("failed to parse java manifest (%s): %+v", j.location, err) 239 return nil, nil 240 } 241 242 // check for existence of Weave-Classes manifest key in order to exclude jars getting misrepresented as 243 // their targeted counterparts, e.g. newrelic spring and tomcat instrumentation 244 if _, ok := manifest.Main.Get("Weave-Classes"); ok { 245 log.Debugf("excluding archive due to Weave-Classes manifest entry: %s", j.location) 246 return nil, nil 247 } 248 249 // grab and assign digest for the entire archive 250 digests, err := getDigestsFromArchive(ctx, j.archivePath) 251 if err != nil { 252 return nil, err 253 } 254 255 name, version, lics, parsedPom := j.discoverNameVersionLicense(ctx, manifest) 256 var pkgPomProject *pkg.JavaPomProject 257 if parsedPom != nil { 258 pkgPomProject = newPomProject(ctx, j.maven, parsedPom.path, parsedPom.project) 259 } 260 261 return &pkg.Package{ 262 // TODO: maybe select name should just have a pom properties in it? 
263 Name: name, 264 Version: version, 265 Language: pkg.Java, 266 Licenses: pkg.NewLicenseSet(lics...), 267 Locations: file.NewLocationSet( 268 j.location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), 269 ), 270 Type: j.fileInfo.pkgType(), 271 Metadata: pkg.JavaArchive{ 272 VirtualPath: j.location.Path(), 273 Manifest: manifest, 274 PomProject: pkgPomProject, 275 ArchiveDigests: digests, 276 }, 277 }, nil 278 } 279 280 func (j *archiveParser) discoverNameVersionLicense(ctx context.Context, manifest *pkg.JavaManifest) (string, string, []pkg.License, *parsedPomProject) { 281 // we use j.location because we want to associate the license declaration with where we discovered the contents in the manifest 282 // TODO: when we support locations of paths within archives we should start passing the specific manifest location object instead of the top jar 283 lics := pkg.NewLicensesFromLocationWithContext(ctx, j.location, selectLicenses(manifest)...) 284 /* 285 We should name and version from, in this order: 286 1. pom.properties if we find exactly 1 287 2. pom.xml if we find exactly 1 288 3. manifest 289 4. filename 290 */ 291 groupID, artifactID, version, parsedPom := j.discoverMainPackageFromPomInfo(ctx) 292 if artifactID == "" { 293 artifactID = selectName(manifest, j.fileInfo) 294 } 295 if version == "" { 296 version = selectVersion(manifest, j.fileInfo) 297 } 298 299 if len(lics) == 0 { 300 fileLicenses := j.getLicenseFromFileInArchive(ctx) 301 if fileLicenses != nil { 302 lics = append(lics, fileLicenses...) 303 } 304 } 305 306 // If we didn't find any licenses in the archive so far, we'll try again in Maven Central using groupIDFromJavaMetadata 307 if len(lics) == 0 { 308 // Today we don't have a way to distinguish between licenses from the manifest and licenses from the pom.xml 309 // until the file.Location object can support sub-paths (i.e. paths within archives, recursively; issue https://github.com/anchore/syft/issues/2211). 
310 // Until then it's less confusing to use the licenses from the pom.xml only if the manifest did not list any. 311 lics = j.findLicenseFromJavaMetadata(ctx, groupID, artifactID, version, parsedPom, manifest) 312 } 313 314 return artifactID, version, lics, parsedPom 315 } 316 317 // findLicenseFromJavaMetadata attempts to find license information from all available maven metadata properties and pom info 318 func (j *archiveParser) findLicenseFromJavaMetadata(ctx context.Context, groupID, artifactID, version string, parsedPom *parsedPomProject, manifest *pkg.JavaManifest) []pkg.License { 319 if groupID == "" { 320 if gID := groupIDFromJavaMetadata(artifactID, pkg.JavaArchive{Manifest: manifest}); gID != "" { 321 groupID = gID 322 } 323 } 324 325 var err error 326 var pomLicenses []maven.License 327 if parsedPom != nil { 328 pomLicenses, err = j.maven.ResolveLicenses(ctx, parsedPom.project) 329 if err != nil { 330 log.WithFields("error", err, "mavenID", j.maven.ResolveID(ctx, parsedPom.project)).Trace("error attempting to resolve pom licenses") 331 } 332 } 333 334 if err == nil && len(pomLicenses) == 0 { 335 pomLicenses, err = j.maven.FindLicenses(ctx, groupID, artifactID, version) 336 if err != nil { 337 log.WithFields("error", err, "mavenID", maven.NewID(groupID, artifactID, version)).Trace("error attempting to find licenses") 338 } 339 } 340 341 if len(pomLicenses) == 0 { 342 // Try removing the last part of the groupId, as sometimes it duplicates the artifactId 343 packages := strings.Split(groupID, ".") 344 groupID = strings.Join(packages[:len(packages)-1], ".") 345 pomLicenses, err = j.maven.FindLicenses(ctx, groupID, artifactID, version) 346 if err != nil { 347 log.WithFields("error", err, "mavenID", maven.NewID(groupID, artifactID, version)).Trace("error attempting to find sub-group licenses") 348 } 349 } 350 351 return toPkgLicenses(ctx, &j.location, pomLicenses) 352 } 353 354 func toPkgLicenses(ctx context.Context, location *file.Location, licenses 
[]maven.License) []pkg.License { 355 var out []pkg.License 356 for _, license := range licenses { 357 name := "" 358 if license.Name != nil { 359 name = *license.Name 360 } 361 url := "" 362 if license.URL != nil { 363 url = *license.URL 364 } 365 // note: it is possible to: 366 // - have a license without a URL 367 // - have license and a URL 368 // - have a URL without a license (this is weird, but can happen) 369 if name == "" && url == "" { 370 continue 371 } 372 out = append(out, pkg.NewLicenseFromFieldsWithContext(ctx, name, url, location)) 373 } 374 return out 375 } 376 377 type parsedPomProject struct { 378 path string 379 project *maven.Project 380 } 381 382 // discoverMainPackageFromPomInfo attempts to resolve maven groupId, artifactId, version and other info from found pom information 383 func (j *archiveParser) discoverMainPackageFromPomInfo(ctx context.Context) (group, name, version string, parsedPom *parsedPomProject) { 384 // Find the pom.properties/pom.xml if the names seem like a plausible match 385 properties, _ := pomPropertiesByParentPath(ctx, j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomPropertiesGlob)) 386 projects, _ := pomProjectByParentPath(ctx, j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomXMLGlob)) 387 388 artifactsMap := j.buildArtifactsMap(properties) 389 pomProperties, parsedPom := j.findBestPomMatch(properties, projects, artifactsMap) 390 391 parsedPom = j.handleSinglePomXML(properties, projects, parsedPom) 392 393 return j.resolveIdentity(ctx, pomProperties, parsedPom) 394 } 395 396 func (j *archiveParser) buildArtifactsMap(properties map[string]pkg.JavaPomProperties) *strset.Set { 397 artifactsMap := strset.New() 398 for _, propertiesObj := range properties { 399 artifactsMap.Add(propertiesObj.ArtifactID) 400 } 401 return artifactsMap 402 } 403 404 func (j *archiveParser) findBestPomMatch(properties map[string]pkg.JavaPomProperties, 405 projects map[string]*parsedPomProject, artifactsMap *strset.Set) 
(pkg.JavaPomProperties, *parsedPomProject) { 406 var pomProperties pkg.JavaPomProperties 407 var parsedPom *parsedPomProject 408 409 for parentPath, propertiesObj := range sortedIter(properties) { 410 if !artifactIDMatchesFilename(propertiesObj.ArtifactID, j.fileInfo.name, artifactsMap) { 411 continue 412 } 413 414 pomProperties, parsedPom = j.updateMatchIfBetter(pomProperties, parsedPom, propertiesObj, parentPath, projects) 415 416 if j.isExactMatch(propertiesObj, parsedPom) { 417 break 418 } 419 } 420 421 return pomProperties, parsedPom 422 } 423 424 func (j *archiveParser) updateMatchIfBetter(currentProps pkg.JavaPomProperties, currentPom *parsedPomProject, 425 newProps pkg.JavaPomProperties, parentPath string, projects map[string]*parsedPomProject) (pkg.JavaPomProperties, *parsedPomProject) { 426 // Keep the first match 427 if currentProps.ArtifactID == "" { 428 proj, hasProject := projects[parentPath] 429 if hasProject { 430 return newProps, proj 431 } 432 return newProps, currentPom 433 } 434 435 proj, hasProject := projects[parentPath] 436 if !hasProject { 437 return currentProps, currentPom 438 } 439 440 // Keep the first matching artifact with a pom.xml 441 if currentPom == nil { 442 return newProps, proj 443 } 444 445 // Prefer exact matches 446 if j.isExactMatch(newProps, proj) { 447 return newProps, proj 448 } 449 450 return currentProps, currentPom 451 } 452 453 func (j *archiveParser) isExactMatch(props pkg.JavaPomProperties, pom *parsedPomProject) bool { 454 if pom == nil { 455 return false 456 } 457 return strings.Contains(j.fileInfo.name, props.GroupID) || j.fileInfo.name == props.ArtifactID 458 } 459 460 func (j *archiveParser) handleSinglePomXML(properties map[string]pkg.JavaPomProperties, 461 projects map[string]*parsedPomProject, currentPom *parsedPomProject) *parsedPomProject { 462 if len(properties) == 0 && len(projects) == 1 { 463 for _, projectsObj := range projects { 464 return projectsObj 465 } 466 } 467 return currentPom 468 } 469 470 
func (j *archiveParser) resolveIdentity(ctx context.Context, pomProperties pkg.JavaPomProperties, 471 parsedPom *parsedPomProject) (group, name, version string, pom *parsedPomProject) { 472 group = pomProperties.GroupID 473 name = pomProperties.ArtifactID 474 version = pomProperties.Version 475 476 if parsedPom != nil && parsedPom.project != nil { 477 id := j.maven.ResolveID(ctx, parsedPom.project) 478 if group == "" { 479 group = id.GroupID 480 } 481 if name == "" { 482 name = id.ArtifactID 483 } 484 if version == "" { 485 version = id.Version 486 } 487 } 488 489 return group, name, version, parsedPom 490 } 491 492 // artifactIDMatchesFilename returns true if one starts with the other 493 func artifactIDMatchesFilename(artifactID, fileName string, artifactsMap *strset.Set) bool { 494 if artifactID == "" || fileName == "" { 495 return false 496 } 497 // Ensure true is returned when filename matches the artifact ID, prevent random retrieval by checking prefix and suffix 498 if artifactsMap.Has(fileName) { 499 return artifactID == fileName 500 } 501 // Use fallback check with suffix and prefix if no POM properties file matches the exact artifact name 502 return strings.HasPrefix(artifactID, fileName) || strings.HasSuffix(fileName, artifactID) 503 } 504 505 // discoverPkgsFromAllMavenFiles parses Maven POM properties/xml for a given 506 // parent package, returning all listed Java packages found for each pom 507 // properties discovered and potentially updating the given parentPkg with new 508 // data. 
509 func (j *archiveParser) discoverPkgsFromAllMavenFiles(ctx context.Context, parentPkg *pkg.Package) ([]pkg.Package, error) { 510 if parentPkg == nil { 511 return nil, nil 512 } 513 514 var pkgs []pkg.Package 515 516 // pom.properties 517 properties, err := pomPropertiesByParentPath(ctx, j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomPropertiesGlob)) 518 if err != nil { 519 return nil, err 520 } 521 522 // pom.xml 523 projects, err := pomProjectByParentPath(ctx, j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomXMLGlob)) 524 if err != nil { 525 return nil, err 526 } 527 528 for parentPath, propertiesObj := range sortedIter(properties) { 529 var parsedPom *parsedPomProject 530 if proj, exists := projects[parentPath]; exists { 531 parsedPom = proj 532 } 533 534 pkgFromPom := newPackageFromMavenData(ctx, j.maven, propertiesObj, parsedPom, parentPkg, j.location) 535 if pkgFromPom != nil { 536 pkgs = append(pkgs, *pkgFromPom) 537 } 538 } 539 540 return pkgs, nil 541 } 542 543 func getDigestsFromArchive(ctx context.Context, archivePath string) ([]file.Digest, error) { 544 archiveCloser, err := os.Open(archivePath) 545 if err != nil { 546 return nil, fmt.Errorf("unable to open archive path (%s): %w", archivePath, err) 547 } 548 defer internal.CloseAndLogError(archiveCloser, archivePath) 549 550 // grab and assign digest for the entire archive 551 digests, err := intFile.NewDigestsFromFile(ctx, archiveCloser, javaArchiveHashes) 552 if err != nil { 553 log.Debugf("failed to create digest for file=%q: %+v", archivePath, err) 554 } 555 556 return digests, nil 557 } 558 559 func (j *archiveParser) getLicenseFromFileInArchive(ctx context.Context) []pkg.License { 560 // prefer identified licenses, fall back to unknown 561 var identified []pkg.License 562 var unidentified []pkg.License 563 564 for _, glob := range []string{"/META-INF/*", "/*"} { 565 var licenseMatches []string 566 for _, f := range j.fileManifest.GlobMatch(true, glob) { 567 if 
licenses.IsLicenseFile(path.Base(f)) { 568 licenseMatches = append(licenseMatches, f) 569 } 570 } 571 572 if len(licenseMatches) > 0 { 573 contents, err := intFile.ContentsFromZip(ctx, j.archivePath, licenseMatches...) 574 if err != nil { 575 log.Debugf("unable to extract java license (%s): %w", j.location, err) 576 continue 577 } 578 579 for _, licenseMatch := range licenseMatches { 580 licenseContents := contents[licenseMatch] 581 r := strings.NewReader(licenseContents) 582 foundLicenses := pkg.NewLicensesFromReadCloserWithContext(ctx, file.NewLocationReadCloser(j.location, io.NopCloser(r))) 583 for _, l := range foundLicenses { 584 if l.SPDXExpression != "" { 585 identified = append(identified, l) 586 } else { 587 unidentified = append(unidentified, l) 588 } 589 } 590 } 591 592 // prefer licenses found in /META-INF 593 if len(identified) > 0 { 594 break 595 } 596 } 597 } 598 599 if len(identified) == 0 { 600 return unidentified 601 } 602 603 return identified 604 } 605 606 func (j *archiveParser) discoverPkgsFromNestedArchives(ctx context.Context, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) { 607 // we know that all java archives are zip formatted files, so we can use the shared zip helper 608 return discoverPkgsFromZip(ctx, j.location, j.archivePath, j.contentPath, j.fileManifest, parentPkg, j.cfg) 609 } 610 611 // discoverPkgsFromZip finds Java archives within Java archives, returning all listed Java packages found and 612 // associating each discovered package to the given parent package. 
613 func discoverPkgsFromZip(ctx context.Context, location file.Location, archivePath, contentPath string, fileManifest intFile.ZipFileManifest, parentPkg *pkg.Package, cfg ArchiveCatalogerConfig) ([]pkg.Package, []artifact.Relationship, error) { 614 // search and parse pom.properties files & fetch the contents 615 openers, err := intFile.ExtractFromZipToUniqueTempFile(ctx, archivePath, contentPath, fileManifest.GlobMatch(false, archiveFormatGlobs...)...) 616 if err != nil { 617 return nil, nil, fmt.Errorf("unable to extract files from zip: %w", err) 618 } 619 620 return discoverPkgsFromOpeners(ctx, location, openers, parentPkg, cfg) 621 } 622 623 // discoverPkgsFromOpeners finds Java archives within the given files and associates them with the given parent package. 624 func discoverPkgsFromOpeners(ctx context.Context, location file.Location, openers map[string]intFile.Opener, parentPkg *pkg.Package, cfg ArchiveCatalogerConfig) ([]pkg.Package, []artifact.Relationship, error) { 625 var pkgs []pkg.Package 626 var relationships []artifact.Relationship 627 628 for pathWithinArchive, archiveOpener := range sortedIter(openers) { 629 nestedPkgs, nestedRelationships, err := discoverPkgsFromOpener(ctx, location, pathWithinArchive, archiveOpener, cfg, parentPkg) 630 if err != nil { 631 log.WithFields("location", location.Path(), "error", err).Debug("unable to discover java packages from opener") 632 continue 633 } 634 635 // attach the parent package to all discovered packages that are not already associated with a java archive 636 for _, p := range nestedPkgs { 637 if metadata, ok := p.Metadata.(pkg.JavaArchive); ok { 638 if metadata.Parent == nil { 639 metadata.Parent = parentPkg 640 } 641 p.Metadata = metadata 642 } 643 pkgs = append(pkgs, p) 644 } 645 646 relationships = append(relationships, nestedRelationships...) 647 } 648 649 return pkgs, relationships, nil 650 } 651 652 // discoverPkgsFromOpener finds Java archives within the given file. 
653 func discoverPkgsFromOpener(ctx context.Context, location file.Location, pathWithinArchive string, archiveOpener intFile.Opener, cfg ArchiveCatalogerConfig, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) { 654 archiveReadCloser, err := archiveOpener.Open() 655 if err != nil { 656 return nil, nil, fmt.Errorf("unable to open archived file from tempdir: %w", err) 657 } 658 defer func() { 659 if closeErr := archiveReadCloser.Close(); closeErr != nil { 660 log.Debugf("unable to close archived file from tempdir: %+v", closeErr) 661 } 662 }() 663 664 nestedPath := fmt.Sprintf("%s:%s", location.Path(), pathWithinArchive) 665 nestedLocation := file.NewLocationFromCoordinates(location.Coordinates) 666 nestedLocation.AccessPath = nestedPath 667 gap := newGenericArchiveParserAdapter(cfg) 668 nestedPkgs, nestedRelationships, err := gap.processJavaArchive(ctx, file.LocationReadCloser{ 669 Location: nestedLocation, 670 ReadCloser: archiveReadCloser, 671 }, parentPkg) 672 if err != nil { 673 return nil, nil, fmt.Errorf("unable to process nested java archive (%s): %w", pathWithinArchive, err) 674 } 675 676 return nestedPkgs, nestedRelationships, nil 677 } 678 679 func pomPropertiesByParentPath(ctx context.Context, archivePath string, location file.Location, extractPaths []string) (map[string]pkg.JavaPomProperties, error) { 680 contentsOfMavenPropertiesFiles, err := intFile.ContentsFromZip(ctx, archivePath, extractPaths...) 
681 if err != nil { 682 return nil, fmt.Errorf("unable to extract maven files: %w", err) 683 } 684 685 propertiesByParentPath := make(map[string]pkg.JavaPomProperties) 686 for filePath, fileContents := range sortedIter(contentsOfMavenPropertiesFiles) { 687 pomProperties, err := parsePomProperties(filePath, strings.NewReader(fileContents)) 688 if err != nil { 689 log.WithFields("contents-path", filePath, "location", location.Path(), "error", err).Debug("failed to parse pom.properties") 690 continue 691 } 692 693 if pomProperties == nil { 694 continue 695 } 696 697 if pomProperties.Version == "" || pomProperties.ArtifactID == "" { 698 // TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. We should discover the right parent and attach the correct info accordingly to each discovered package 699 continue 700 } 701 702 propertiesByParentPath[path.Dir(filePath)] = *pomProperties 703 } 704 705 return propertiesByParentPath, nil 706 } 707 708 func pomProjectByParentPath(ctx context.Context, archivePath string, location file.Location, extractPaths []string) (map[string]*parsedPomProject, error) { 709 contentsOfMavenProjectFiles, err := intFile.ContentsFromZip(ctx, archivePath, extractPaths...) 
710 if err != nil { 711 return nil, fmt.Errorf("unable to extract maven files: %w", err) 712 } 713 714 projectByParentPath := make(map[string]*parsedPomProject) 715 for filePath, fileContents := range sortedIter(contentsOfMavenProjectFiles) { 716 // TODO: when we support locations of paths within archives we should start passing the specific pom.xml location object instead of the top jar 717 pom, err := maven.ParsePomXML(strings.NewReader(fileContents)) 718 if err != nil { 719 log.WithFields("contents-path", filePath, "location", location.Path(), "error", err).Debug("failed to parse pom.xml") 720 continue 721 } 722 if pom == nil { 723 continue 724 } 725 726 projectByParentPath[path.Dir(filePath)] = &parsedPomProject{ 727 path: filePath, 728 project: pom, 729 } 730 } 731 return projectByParentPath, nil 732 } 733 734 // newPackageFromMavenData processes a single Maven POM properties for a given parent package, returning all listed Java packages found and 735 // associating each discovered package to the given parent package. Note the pom.xml is optional, the pom.properties is not. 736 func newPackageFromMavenData(ctx context.Context, r *maven.Resolver, pomProperties pkg.JavaPomProperties, parsedPom *parsedPomProject, parentPkg *pkg.Package, location file.Location) *pkg.Package { 737 // keep the artifact name within the virtual path if this package does not match the parent package 738 vPathSuffix := "" 739 groupID := "" 740 if parentMetadata, ok := parentPkg.Metadata.(pkg.JavaArchive); ok { 741 groupID = groupIDFromJavaMetadata(parentPkg.Name, parentMetadata) 742 } 743 744 parentKey := fmt.Sprintf("%s:%s:%s", groupID, parentPkg.Name, parentPkg.Version) 745 // Since we don't have a package yet, it's important to use the same `field: value` association that we used when creating the parent package 746 // See below where Name => pomProperties.ArtifactID and Version => pomProperties.Version. 
We want to check for potentially nested identical 747 // packages and create equal virtual paths so they are de duped in the future 748 pomProjectKey := fmt.Sprintf("%s:%s:%s", pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version) 749 if parentKey != pomProjectKey { 750 // build a new virtual path suffix for the package that is different from the parent package 751 // we want to use the GroupID and ArtifactID here to preserve uniqueness 752 // Some packages have the same name but different group IDs (e.g. "org.glassfish.jaxb/jaxb-core", "com.sun.xml.bind/jaxb-core") 753 // https://github.com/anchore/syft/issues/1944 754 vPathSuffix += ":" + pomProperties.GroupID + ":" + pomProperties.ArtifactID 755 } 756 virtualPath := location.Path() + vPathSuffix 757 758 var pkgPomProject *pkg.JavaPomProject 759 760 var err error 761 var pomLicenses []maven.License 762 if parsedPom == nil { 763 // If we have no pom.xml, check maven central using pom.properties 764 pomLicenses, err = r.FindLicenses(ctx, pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version) 765 } else { 766 pkgPomProject = newPomProject(ctx, r, parsedPom.path, parsedPom.project) 767 pomLicenses, err = r.ResolveLicenses(ctx, parsedPom.project) 768 } 769 770 if err != nil { 771 log.WithFields("error", err, "mavenID", maven.NewID(pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version)).Trace("error attempting to resolve licenses") 772 } 773 774 licenseSet := pkg.NewLicenseSet(toPkgLicenses(ctx, &location, pomLicenses)...) 
775 776 p := pkg.Package{ 777 Name: pomProperties.ArtifactID, 778 Version: pomProperties.Version, 779 Locations: file.NewLocationSet( 780 location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), 781 ), 782 Licenses: licenseSet, 783 Language: pkg.Java, 784 Type: pomProperties.PkgTypeIndicated(), 785 Metadata: pkg.JavaArchive{ 786 VirtualPath: virtualPath, 787 PomProperties: &pomProperties, 788 PomProject: pkgPomProject, 789 Parent: parentPkg, 790 }, 791 } 792 793 if packageIdentitiesMatch(p, parentPkg) { 794 updateParentPackage(p, parentPkg) 795 return nil 796 } 797 798 return &p 799 } 800 801 func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool { 802 metadata, ok := p.Metadata.(pkg.JavaArchive) 803 parentMetadata, parentOk := parentPkg.Metadata.(pkg.JavaArchive) 804 if !ok || !parentOk { 805 switch { 806 case !ok: 807 log.WithFields("package", p.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", p.Name) 808 default: // !parentOk 809 log.WithFields("package", parentPkg.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", parentPkg.Name) 810 } 811 // if we can't extract metadata, we can check for matching identities via the package name 812 // this is not ideal, but it's better than nothing - this should not be used if we have Metadata 813 814 return uniquePkgKey("", &p) == uniquePkgKey("", parentPkg) 815 } 816 817 // try to determine identity with the metadata 818 groupID := groupIDFromJavaMetadata(p.Name, metadata) 819 parentGroupID := groupIDFromJavaMetadata(parentPkg.Name, parentMetadata) 820 if uniquePkgKey(groupID, &p) == uniquePkgKey(parentGroupID, parentPkg) { 821 return true 822 } 823 824 // the virtual path matches... 
825 if parentMetadata.VirtualPath == metadata.VirtualPath { 826 return true 827 } 828 829 // the pom artifactId is the parent name 830 // note: you CANNOT use name-is-subset-of-artifact-id or vice versa --this is too generic. Shaded jars are a good 831 // example of this: where the package name is "cloudbees-analytics-segment-driver" and a child is "analytics", but 832 // they do not indicate the same package. 833 // NOTE: artifactId might not be a good indicator of uniqueness since archives can contain forks with the same name 834 // from different groups (e.g. "org.glassfish.jaxb.jaxb-core" and "com.sun.xml.bind.jaxb-core") 835 // we will use this check as a last resort 836 if metadata.PomProperties != nil { 837 if metadata.PomProperties.ArtifactID != "" && parentPkg.Name == metadata.PomProperties.ArtifactID { 838 return true 839 } 840 } 841 return false 842 } 843 844 func updateParentPackage(p pkg.Package, parentPkg *pkg.Package) { 845 // we've run across more information about our parent package, add this info to the parent package metadata 846 // the pom properties is typically a better source of information for name and version than the manifest 847 parentPkg.Name = p.Name 848 parentPkg.Version = p.Version 849 850 // we may have learned more about the type via data in the pom properties 851 parentPkg.Type = p.Type 852 853 metadata, ok := p.Metadata.(pkg.JavaArchive) 854 if !ok { 855 return 856 } 857 pomPropertiesCopy := *metadata.PomProperties 858 859 // keep the pom properties, but don't overwrite existing pom properties 860 parentMetadata, ok := parentPkg.Metadata.(pkg.JavaArchive) 861 if ok && parentMetadata.PomProperties == nil { 862 parentMetadata.PomProperties = &pomPropertiesCopy 863 parentPkg.Metadata = parentMetadata 864 } 865 } 866 867 func sortedIter[K cmp.Ordered, V any](values map[K]V) iter.Seq2[K, V] { 868 return func(yield func(K, V) bool) { 869 keys := maps.Keys(values) 870 slices.Sort(keys) 871 for _, key := range keys { 872 if !yield(key, 
values[key]) { 873 return 874 } 875 } 876 } 877 }