github.com/lineaje-labs/syft@v0.98.1-0.20231227153149-9e393f60ff1b/syft/pkg/cataloger/java/archive_parser.go (about) 1 package java 2 3 import ( 4 "crypto" 5 "fmt" 6 "io" 7 "net/http" 8 "net/url" 9 "os" 10 "path" 11 "strings" 12 "time" 13 14 "github.com/vifraa/gopom" 15 16 "github.com/anchore/syft/syft/artifact" 17 "github.com/anchore/syft/syft/file" 18 "github.com/anchore/syft/syft/pkg" 19 "github.com/anchore/syft/syft/pkg/cataloger/generic" 20 intFile "github.com/lineaje-labs/syft/internal/file" 21 "github.com/lineaje-labs/syft/internal/licenses" 22 "github.com/lineaje-labs/syft/internal/log" 23 ) 24 25 var archiveFormatGlobs = []string{ 26 "**/*.jar", 27 "**/*.war", 28 "**/*.ear", 29 "**/*.par", 30 "**/*.sar", 31 "**/*.nar", 32 "**/*.jpi", 33 "**/*.hpi", 34 "**/*.lpkg", // Zip-compressed package used to deploy applications 35 // (aka plugins) to Liferay Portal server. Those files contains .JAR(s) and a .PROPERTIES file, the latter 36 // has information about the application and installation requirements. 37 // NOTE(jonasagx): If you would like to test it with lpkg file, 38 // use: https://web.liferay.com/marketplace/-/mp/download/25019275/7403 39 // LifeRay makes it pretty cumbersome to make a such plugins; their docs are 40 // out of date, and they charge for their IDE. If you find an example 41 // project that we can build in CI feel free to include it 42 } 43 44 // javaArchiveHashes are all the current hash algorithms used to calculate archive digests 45 var javaArchiveHashes = []crypto.Hash{ 46 crypto.MD5, 47 crypto.SHA1, 48 crypto.SHA256, 49 } 50 51 type archiveParser struct { 52 fileManifest intFile.ZipFileManifest 53 location file.Location 54 archivePath string 55 contentPath string 56 fileInfo archiveFilename 57 detectNested bool 58 cfg ArchiveCatalogerConfig 59 } 60 61 type genericArchiveParserAdapter struct { 62 cfg ArchiveCatalogerConfig 63 } 64 65 func newGenericArchiveParserAdapter(cfg ArchiveCatalogerConfig) genericArchiveParserAdapter { 66 return genericArchiveParserAdapter{cfg: cfg} 67 } 68 69 // parseJavaArchive is a parser function for java archive contents, returning all Java libraries and nested archives. 70 func (gap genericArchiveParserAdapter) parseJavaArchive( 71 _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser, 72 ) ([]pkg.Package, []artifact.Relationship, error) { 73 parser, cleanupFn, err := newJavaArchiveParser(reader, true, gap.cfg) 74 // note: even on error, we should always run cleanup functions 75 defer cleanupFn() 76 if err != nil { 77 return nil, nil, err 78 } 79 return parser.parse() 80 } 81 82 // uniquePkgKey creates a unique string to identify the given package. 83 func uniquePkgKey(groupID string, p *pkg.Package) string { 84 if p == nil { 85 return "" 86 } 87 return fmt.Sprintf("%s|%s|%s", groupID, p.Name, p.Version) 88 } 89 90 // newJavaArchiveParser returns a new java archive parser object for the given archive. Can be configured to discover 91 // and parse nested archives or ignore them. 92 func newJavaArchiveParser( 93 reader file.LocationReadCloser, detectNested bool, cfg ArchiveCatalogerConfig, 94 ) (*archiveParser, func(), error) { 95 // fetch the last element of the virtual path 96 virtualElements := strings.Split(reader.Path(), ":") 97 currentFilepath := virtualElements[len(virtualElements)-1] 98 99 contentPath, archivePath, cleanupFn, err := saveArchiveToTmp(currentFilepath, reader) 100 if err != nil { 101 return nil, cleanupFn, fmt.Errorf("unable to process java archive: %w", err) 102 } 103 104 fileManifest, err := intFile.NewZipFileManifest(archivePath) 105 if err != nil { 106 return nil, cleanupFn, fmt.Errorf("unable to read files from java archive: %w", err) 107 } 108 109 return &archiveParser{ 110 fileManifest: fileManifest, 111 location: reader.Location, 112 archivePath: archivePath, 113 contentPath: contentPath, 114 fileInfo: newJavaArchiveFilename(currentFilepath), 115 detectNested: detectNested, 116 cfg: cfg, 117 }, cleanupFn, nil 118 } 119 120 // parse the loaded archive and return all packages found. 121 func (j *archiveParser) parse() ([]pkg.Package, []artifact.Relationship, error) { 122 var pkgs []pkg.Package 123 var relationships []artifact.Relationship 124 125 // find the parent package from the java manifest 126 parentPkg, err := j.discoverMainPackage() 127 if err != nil { 128 return nil, nil, fmt.Errorf("could not generate package from %s: %w", j.location, err) 129 } 130 131 // find aux packages from pom.properties/pom.xml and potentially modify the existing parentPkg 132 // NOTE: we cannot generate sha1 digests from packages discovered via pom.properties/pom.xml 133 auxPkgs, err := j.discoverPkgsFromAllMavenFiles(parentPkg) 134 if err != nil { 135 return nil, nil, err 136 } 137 pkgs = append(pkgs, auxPkgs...) 138 139 if j.detectNested { 140 // find nested java archive packages 141 nestedPkgs, nestedRelationships, err := j.discoverPkgsFromNestedArchives(parentPkg) 142 if err != nil { 143 return nil, nil, err 144 } 145 pkgs = append(pkgs, nestedPkgs...) 146 relationships = append(relationships, nestedRelationships...) 147 } 148 149 // lastly, add the parent package to the list (assuming the parent exists) 150 if parentPkg != nil { 151 pkgs = append([]pkg.Package{*parentPkg}, pkgs...) 152 } 153 154 // add pURLs to all packages found 155 // note: since package information may change after initial creation when parsing multiple locations within the 156 // jar, we wait until the conclusion of the parsing process before synthesizing pURLs. 157 for i := range pkgs { 158 p := &pkgs[i] 159 if m, ok := p.Metadata.(pkg.JavaArchive); ok { 160 p.PURL = packageURL(p.Name, p.Version, m) 161 } else { 162 log.WithFields("package", p.String()).Warn("unable to extract java metadata to generate purl") 163 } 164 p.SetID() 165 } 166 167 return pkgs, relationships, nil 168 } 169 170 // discoverMainPackage parses the root Java manifest used as the parent package to all discovered nested packages. 171 func (j *archiveParser) discoverMainPackage() (*pkg.Package, error) { 172 // search and parse java manifest files 173 manifestMatches := j.fileManifest.GlobMatch(false, manifestGlob) 174 if len(manifestMatches) > 1 { 175 return nil, fmt.Errorf("found multiple manifests in the jar: %+v", manifestMatches) 176 } else if len(manifestMatches) == 0 { 177 // we did not find any manifests, but that may not be a problem (there may be other information to generate packages for) 178 return nil, nil 179 } 180 181 // fetch the manifest file 182 contents, err := intFile.ContentsFromZip(j.archivePath, manifestMatches...) 183 if err != nil { 184 return nil, fmt.Errorf("unable to extract java manifests (%s): %w", j.location, err) 185 } 186 187 // parse the manifest file into a rich object 188 manifestContents := contents[manifestMatches[0]] 189 manifest, err := parseJavaManifest(j.archivePath, strings.NewReader(manifestContents)) 190 if err != nil { 191 log.Warnf("failed to parse java manifest (%s): %+v", j.location, err) 192 return nil, nil 193 } 194 195 // grab and assign digest for the entire archive 196 digests, err := getDigestsFromArchive(j.archivePath) 197 if err != nil { 198 return nil, err 199 } 200 201 licenses, name, version, err := j.parseLicenses(manifest) 202 if err != nil { 203 return nil, err 204 } 205 206 return &pkg.Package{ 207 // TODO: maybe select name should just have a pom properties in it? 208 Name: name, 209 Version: version, 210 Language: pkg.Java, 211 Licenses: pkg.NewLicenseSet(licenses...), 212 Locations: file.NewLocationSet( 213 j.location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), 214 ), 215 Type: j.fileInfo.pkgType(), 216 Metadata: pkg.JavaArchive{ 217 VirtualPath: j.location.Path(), 218 Manifest: manifest, 219 ArchiveDigests: digests, 220 }, 221 }, nil 222 } 223 224 func (j *archiveParser) parseLicenses(manifest *pkg.JavaManifest) ([]pkg.License, string, string, error) { 225 // we use j.location because we want to associate the license declaration with where we discovered the contents in the manifest 226 // TODO: when we support locations of paths within archives we should start passing the specific manifest location object instead of the top jar 227 licenses := pkg.NewLicensesFromLocation(j.location, selectLicenses(manifest)...) 228 /* 229 We should name and version from, in this order: 230 1. pom.properties if we find exactly 1 231 2. pom.xml if we find exactly 1 232 3. manifest 233 4. filename 234 */ 235 name, version, pomLicenses := j.guessMainPackageNameAndVersionFromPomInfo() 236 if name == "" { 237 name = selectName(manifest, j.fileInfo) 238 } 239 if version == "" { 240 version = selectVersion(manifest, j.fileInfo) 241 } 242 if len(licenses) == 0 { 243 // Today we don't have a way to distinguish between licenses from the manifest and licenses from the pom.xml 244 // until the file.Location object can support sub-paths (i.e. paths within archives, recursively; issue https://github.com/anchore/syft/issues/2211). 245 // Until then it's less confusing to use the licenses from the pom.xml only if the manifest did not list any. 246 licenses = append(licenses, pomLicenses...) 247 } 248 249 if len(licenses) == 0 { 250 fileLicenses, err := j.getLicenseFromFileInArchive() 251 if err != nil { 252 return nil, "", "", err 253 } 254 if fileLicenses != nil { 255 licenses = append(licenses, fileLicenses...) 256 } 257 } 258 259 // If we didn't find any licenses in the archive so far, we'll try again in Maven Central using groupIDFromJavaMetadata 260 if len(licenses) == 0 && j.cfg.UseNetwork { 261 licenses = findLicenseFromJavaMetadata(name, manifest, version, j, licenses) 262 } 263 264 return licenses, name, version, nil 265 } 266 267 func findLicenseFromJavaMetadata( 268 name string, manifest *pkg.JavaManifest, version string, j *archiveParser, licenses []pkg.License, 269 ) []pkg.License { 270 var groupID = name 271 if gID := groupIDFromJavaMetadata(name, pkg.JavaArchive{Manifest: manifest}); gID != "" { 272 groupID = gID 273 } 274 pomLicenses, err := recursivelyFindLicensesFromParentPom(groupID, name, version, j.cfg) 275 if err != nil { 276 log.Tracef("unable to get parent pom from Maven central: %v", err) 277 } 278 279 if len(pomLicenses) == 0 { 280 // Try removing the last part of the groupId, as sometimes it duplicates the artifactId 281 packages := strings.Split(groupID, ".") 282 groupID = strings.Join(packages[:len(packages)-1], ".") 283 pomLicenses, err = recursivelyFindLicensesFromParentPom(groupID, name, version, j.cfg) 284 if err != nil { 285 log.Tracef("unable to get parent pom from Maven central: %v", err) 286 } 287 } 288 289 if len(pomLicenses) > 0 { 290 pkgLicenses := pkg.NewLicensesFromLocation(j.location, pomLicenses...) 291 if pkgLicenses != nil { 292 licenses = append(licenses, pkgLicenses...) 293 } 294 } 295 return licenses 296 } 297 298 type parsedPomProject struct { 299 *pkg.JavaPomProject 300 Licenses []pkg.License 301 } 302 303 func (j *archiveParser) guessMainPackageNameAndVersionFromPomInfo() (name, version string, licenses []pkg.License) { 304 pomPropertyMatches := j.fileManifest.GlobMatch(false, pomPropertiesGlob) 305 pomMatches := j.fileManifest.GlobMatch(false, pomXMLGlob) 306 var pomPropertiesObject pkg.JavaPomProperties 307 var pomProjectObject *parsedPomProject 308 309 // Find the pom.properties/pom.xml if the names seem like a plausible match 310 properties, _ := pomPropertiesByParentPath(j.archivePath, j.location, pomPropertyMatches) 311 projects, _ := pomProjectByParentPath(j.archivePath, j.location, pomMatches) 312 313 for parentPath, propertiesObj := range properties { 314 if artifactIDMatchesFilename(propertiesObj.ArtifactID, j.fileInfo.name) { 315 pomPropertiesObject = propertiesObj 316 if proj, exists := projects[parentPath]; exists { 317 pomProjectObject = proj 318 break 319 } 320 } 321 } 322 323 name = pomPropertiesObject.ArtifactID 324 if name == "" && pomProjectObject != nil { 325 name = pomProjectObject.ArtifactID 326 } 327 version = pomPropertiesObject.Version 328 if version == "" && pomProjectObject != nil { 329 version = pomProjectObject.Version 330 } 331 if pomProjectObject != nil && j.cfg.UseNetwork { 332 findPomLicenses(pomProjectObject, j.cfg) 333 } 334 335 if pomProjectObject != nil { 336 licenses = pomProjectObject.Licenses 337 } 338 339 return name, version, licenses 340 } 341 342 func artifactIDMatchesFilename(artifactID, fileName string) bool { 343 if artifactID == "" || fileName == "" { 344 return false 345 } 346 return strings.HasPrefix(artifactID, fileName) || strings.HasSuffix(fileName, artifactID) 347 } 348 349 func findPomLicenses(pomProjectObject *parsedPomProject, cfg ArchiveCatalogerConfig) { 350 // If we don't have any licenses until now, and if we have a parent Pom, then we'll check the parent pom in maven central for licenses. 351 if pomProjectObject != nil && pomProjectObject.Parent != nil && len(pomProjectObject.Licenses) == 0 { 352 parentLicenses, err := recursivelyFindLicensesFromParentPom( 353 pomProjectObject.Parent.GroupID, 354 pomProjectObject.Parent.ArtifactID, 355 pomProjectObject.Parent.Version, 356 cfg) 357 if err != nil { 358 // We don't want to abort here as the parent pom might not exist in Maven Central, we'll just log the error 359 log.Tracef("unable to get parent pom from Maven central: %v", err) 360 return 361 } 362 if len(parentLicenses) > 0 { 363 for _, licenseName := range parentLicenses { 364 pomProjectObject.Licenses = append(pomProjectObject.Licenses, pkg.NewLicenseFromFields(licenseName, "", nil)) 365 } 366 } 367 } 368 } 369 370 func formatMavenPomURL(groupID, artifactID, version, mavenBaseURL string) (requestURL string, err error) { 371 // groupID needs to go from maven.org -> maven/org 372 urlPath := strings.Split(groupID, ".") 373 artifactPom := fmt.Sprintf("%s-%s.pom", artifactID, version) 374 urlPath = append(urlPath, artifactID, version, artifactPom) 375 376 // ex:"https://repo1.maven.org/maven2/groupID/artifactID/artifactPom 377 requestURL, err = url.JoinPath(mavenBaseURL, urlPath...) 378 if err != nil { 379 return requestURL, fmt.Errorf("could not construct maven url: %w", err) 380 } 381 return requestURL, err 382 } 383 384 func recursivelyFindLicensesFromParentPom( 385 groupID, artifactID, version string, cfg ArchiveCatalogerConfig, 386 ) ([]string, error) { 387 var licenses []string 388 // As there can be nested parent poms, we'll recursively check for licenses until we reach the max depth 389 for i := 0; i < cfg.MaxParentRecursiveDepth; i++ { 390 parentPom, err := getPomFromMavenRepo(groupID, artifactID, version, cfg.MavenBaseURL) 391 if err != nil { 392 return nil, err 393 } 394 parentLicenses := parseLicensesFromPom(parentPom) 395 if len(parentLicenses) > 0 || parentPom == nil || parentPom.Parent == nil { 396 licenses = parentLicenses 397 break 398 } 399 400 groupID = *parentPom.Parent.GroupID 401 artifactID = *parentPom.Parent.ArtifactID 402 version = *parentPom.Parent.Version 403 } 404 405 return licenses, nil 406 } 407 408 func getPomFromMavenRepo(groupID, artifactID, version, mavenBaseURL string) (*gopom.Project, error) { 409 requestURL, err := formatMavenPomURL(groupID, artifactID, version, mavenBaseURL) 410 if err != nil { 411 return nil, err 412 } 413 log.Tracef("trying to fetch parent pom from Maven central %s", requestURL) 414 415 mavenRequest, err := http.NewRequest(http.MethodGet, requestURL, nil) 416 if err != nil { 417 return nil, fmt.Errorf("unable to format request for Maven central: %w", err) 418 } 419 420 httpClient := &http.Client{ 421 Timeout: time.Second * 10, 422 } 423 424 resp, err := httpClient.Do(mavenRequest) 425 if err != nil { 426 return nil, fmt.Errorf("unable to get pom from Maven central: %w", err) 427 } 428 defer func() { 429 if err := resp.Body.Close(); err != nil { 430 log.Errorf("unable to close body: %+v", err) 431 } 432 }() 433 434 bytes, err := io.ReadAll(resp.Body) 435 if err != nil { 436 return nil, fmt.Errorf("unable to parse pom from Maven central: %w", err) 437 } 438 439 pom, err := decodePomXML(strings.NewReader(string(bytes))) 440 if err != nil { 441 return nil, fmt.Errorf("unable to parse pom from Maven central: %w", err) 442 } 443 444 return &pom, nil 445 } 446 447 func parseLicensesFromPom(pom *gopom.Project) []string { 448 var licenses []string 449 if pom != nil && pom.Licenses != nil { 450 for _, license := range *pom.Licenses { 451 if license.Name != nil { 452 licenses = append(licenses, *license.Name) 453 } else if license.URL != nil { 454 licenses = append(licenses, *license.URL) 455 } 456 } 457 } 458 459 return licenses 460 } 461 462 // discoverPkgsFromAllMavenFiles parses Maven POM properties/xml for a given 463 // parent package, returning all listed Java packages found for each pom 464 // properties discovered and potentially updating the given parentPkg with new 465 // data. 466 func (j *archiveParser) discoverPkgsFromAllMavenFiles(parentPkg *pkg.Package) ([]pkg.Package, error) { 467 if parentPkg == nil { 468 return nil, nil 469 } 470 471 var pkgs []pkg.Package 472 473 // pom.properties 474 properties, err := pomPropertiesByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomPropertiesGlob)) 475 if err != nil { 476 return nil, err 477 } 478 479 // pom.xml 480 projects, err := pomProjectByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomXMLGlob)) 481 if err != nil { 482 return nil, err 483 } 484 485 for parentPath, propertiesObj := range properties { 486 var pomProject *parsedPomProject 487 if proj, exists := projects[parentPath]; exists { 488 pomProject = proj 489 } 490 491 pkgFromPom := newPackageFromMavenData(propertiesObj, pomProject, parentPkg, j.location, j.cfg) 492 if pkgFromPom != nil { 493 pkgs = append(pkgs, *pkgFromPom) 494 } 495 } 496 497 return pkgs, nil 498 } 499 500 func getDigestsFromArchive(archivePath string) ([]file.Digest, error) { 501 archiveCloser, err := os.Open(archivePath) 502 if err != nil { 503 return nil, fmt.Errorf("unable to open archive path (%s): %w", archivePath, err) 504 } 505 defer archiveCloser.Close() 506 507 // grab and assign digest for the entire archive 508 digests, err := intFile.NewDigestsFromFile(archiveCloser, javaArchiveHashes) 509 if err != nil { 510 log.Warnf("failed to create digest for file=%q: %+v", archivePath, err) 511 } 512 513 return digests, nil 514 } 515 516 func (j *archiveParser) getLicenseFromFileInArchive() ([]pkg.License, error) { 517 var fileLicenses []pkg.License 518 for _, filename := range licenses.FileNames() { 519 licenseMatches := j.fileManifest.GlobMatch(true, "/META-INF/"+filename) 520 if len(licenseMatches) == 0 { 521 // Try the root directory if it's not in META-INF 522 licenseMatches = j.fileManifest.GlobMatch(true, "/"+filename) 523 } 524 525 if len(licenseMatches) > 0 { 526 contents, err := intFile.ContentsFromZip(j.archivePath, licenseMatches...) 527 if err != nil { 528 return nil, fmt.Errorf("unable to extract java license (%s): %w", j.location, err) 529 } 530 531 for _, licenseMatch := range licenseMatches { 532 licenseContents := contents[licenseMatch] 533 parsed, err := licenses.Parse(strings.NewReader(licenseContents), j.location) 534 if err != nil { 535 return nil, err 536 } 537 538 if len(parsed) > 0 { 539 fileLicenses = append(fileLicenses, parsed...) 540 } 541 } 542 } 543 } 544 545 return fileLicenses, nil 546 } 547 548 func (j *archiveParser) discoverPkgsFromNestedArchives(parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) { 549 // we know that all java archives are zip formatted files, so we can use the shared zip helper 550 return discoverPkgsFromZip(j.location, j.archivePath, j.contentPath, j.fileManifest, parentPkg, j.cfg) 551 } 552 553 // discoverPkgsFromZip finds Java archives within Java archives, returning all listed Java packages found and 554 // associating each discovered package to the given parent package. 555 func discoverPkgsFromZip( 556 location file.Location, archivePath, contentPath string, fileManifest intFile.ZipFileManifest, 557 parentPkg *pkg.Package, cfg ArchiveCatalogerConfig, 558 ) ([]pkg.Package, []artifact.Relationship, error) { 559 // search and parse pom.properties files & fetch the contents 560 openers, err := intFile.ExtractFromZipToUniqueTempFile(archivePath, contentPath, fileManifest.GlobMatch(false, archiveFormatGlobs...)...) 561 if err != nil { 562 return nil, nil, fmt.Errorf("unable to extract files from zip: %w", err) 563 } 564 565 return discoverPkgsFromOpeners(location, openers, parentPkg, cfg) 566 } 567 568 // discoverPkgsFromOpeners finds Java archives within the given files and associates them with the given parent package. 569 func discoverPkgsFromOpeners( 570 location file.Location, openers map[string]intFile.Opener, parentPkg *pkg.Package, cfg ArchiveCatalogerConfig, 571 ) ([]pkg.Package, []artifact.Relationship, error) { 572 var pkgs []pkg.Package 573 var relationships []artifact.Relationship 574 575 for pathWithinArchive, archiveOpener := range openers { 576 nestedPkgs, nestedRelationships, err := discoverPkgsFromOpener(location, pathWithinArchive, archiveOpener, cfg) 577 if err != nil { 578 log.WithFields("location", location.Path()).Warnf("unable to discover java packages from opener: %+v", err) 579 continue 580 } 581 582 // attach the parent package to all discovered packages that are not already associated with a java archive 583 for _, p := range nestedPkgs { 584 if metadata, ok := p.Metadata.(pkg.JavaArchive); ok { 585 if metadata.Parent == nil { 586 metadata.Parent = parentPkg 587 } 588 p.Metadata = metadata 589 } 590 pkgs = append(pkgs, p) 591 } 592 593 relationships = append(relationships, nestedRelationships...) 594 } 595 596 return pkgs, relationships, nil 597 } 598 599 // discoverPkgsFromOpener finds Java archives within the given file. 600 func discoverPkgsFromOpener( 601 location file.Location, pathWithinArchive string, archiveOpener intFile.Opener, cfg ArchiveCatalogerConfig, 602 ) ([]pkg.Package, []artifact.Relationship, error) { 603 archiveReadCloser, err := archiveOpener.Open() 604 if err != nil { 605 return nil, nil, fmt.Errorf("unable to open archived file from tempdir: %w", err) 606 } 607 defer func() { 608 if closeErr := archiveReadCloser.Close(); closeErr != nil { 609 log.Warnf("unable to close archived file from tempdir: %+v", closeErr) 610 } 611 }() 612 613 nestedPath := fmt.Sprintf("%s:%s", location.Path(), pathWithinArchive) 614 nestedLocation := file.NewLocationFromCoordinates(location.Coordinates) 615 nestedLocation.AccessPath = nestedPath 616 gap := newGenericArchiveParserAdapter(cfg) 617 nestedPkgs, nestedRelationships, err := gap.parseJavaArchive(nil, nil, file.LocationReadCloser{ 618 Location: nestedLocation, 619 ReadCloser: archiveReadCloser, 620 }) 621 if err != nil { 622 return nil, nil, fmt.Errorf("unable to process nested java archive (%s): %w", pathWithinArchive, err) 623 } 624 625 return nestedPkgs, nestedRelationships, nil 626 } 627 628 func pomPropertiesByParentPath( 629 archivePath string, location file.Location, extractPaths []string, 630 ) (map[string]pkg.JavaPomProperties, error) { 631 contentsOfMavenPropertiesFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...) 632 if err != nil { 633 return nil, fmt.Errorf("unable to extract maven files: %w", err) 634 } 635 636 propertiesByParentPath := make(map[string]pkg.JavaPomProperties) 637 for filePath, fileContents := range contentsOfMavenPropertiesFiles { 638 pomProperties, err := parsePomProperties(filePath, strings.NewReader(fileContents)) 639 if err != nil { 640 log.WithFields("contents-path", filePath, "location", location.Path()).Warnf("failed to parse pom.properties: %+v", err) 641 continue 642 } 643 644 if pomProperties == nil { 645 continue 646 } 647 648 if pomProperties.Version == "" || pomProperties.ArtifactID == "" { 649 // TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. We should discover the right parent and attach the correct info accordingly to each discovered package 650 continue 651 } 652 653 propertiesByParentPath[path.Dir(filePath)] = *pomProperties 654 } 655 656 return propertiesByParentPath, nil 657 } 658 659 func pomProjectByParentPath( 660 archivePath string, location file.Location, extractPaths []string, 661 ) (map[string]*parsedPomProject, error) { 662 contentsOfMavenProjectFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...) 663 if err != nil { 664 return nil, fmt.Errorf("unable to extract maven files: %w", err) 665 } 666 667 projectByParentPath := make(map[string]*parsedPomProject) 668 for filePath, fileContents := range contentsOfMavenProjectFiles { 669 // TODO: when we support locations of paths within archives we should start passing the specific pom.xml location object instead of the top jar 670 pomProject, err := parsePomXMLProject(filePath, strings.NewReader(fileContents), location) 671 if err != nil { 672 log.WithFields("contents-path", filePath, "location", location.Path()).Warnf("failed to parse pom.xml: %+v", err) 673 continue 674 } 675 676 if pomProject == nil { 677 continue 678 } 679 680 // If we don't have a version, then maybe the parent pom has it... 681 if (pomProject.Parent == nil && pomProject.Version == "") || pomProject.ArtifactID == "" { 682 // TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. We should discover the right parent and attach the correct info accordingly to each discovered package 683 continue 684 } 685 686 projectByParentPath[path.Dir(filePath)] = pomProject 687 } 688 return projectByParentPath, nil 689 } 690 691 // newPackageFromMavenData processes a single Maven POM properties for a given parent package, returning all listed Java packages found and 692 // associating each discovered package to the given parent package. Note the pom.xml is optional, the pom.properties is not. 693 func newPackageFromMavenData( 694 pomProperties pkg.JavaPomProperties, parsedPomProject *parsedPomProject, parentPkg *pkg.Package, 695 location file.Location, cfg ArchiveCatalogerConfig, 696 ) *pkg.Package { 697 // keep the artifact name within the virtual path if this package does not match the parent package 698 vPathSuffix := "" 699 groupID := "" 700 if parentMetadata, ok := parentPkg.Metadata.(pkg.JavaArchive); ok { 701 groupID = groupIDFromJavaMetadata(parentPkg.Name, parentMetadata) 702 } 703 704 parentKey := fmt.Sprintf("%s:%s:%s", groupID, parentPkg.Name, parentPkg.Version) 705 // Since we don't have a package yet, it's important to use the same `field: value` association that we used when creating the parent package 706 // See below where Name => pomProperties.ArtifactID and Version => pomProperties.Version. We want to check for potentially nested identical 707 // packages and create equal virtual paths so they are de duped in the future 708 pomProjectKey := fmt.Sprintf("%s:%s:%s", pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version) 709 if parentKey != pomProjectKey { 710 // build a new virtual path suffix for the package that is different from the parent package 711 // we want to use the GroupID and ArtifactID here to preserve uniqueness 712 // Some packages have the same name but different group IDs (e.g. "org.glassfish.jaxb/jaxb-core", "com.sun.xml.bind/jaxb-core") 713 // https://github.com/anchore/syft/issues/1944 714 vPathSuffix += ":" + pomProperties.GroupID + ":" + pomProperties.ArtifactID 715 } 716 virtualPath := location.Path() + vPathSuffix 717 718 var pkgPomProject *pkg.JavaPomProject 719 licenses := make([]pkg.License, 0) 720 if parsedPomProject != nil { 721 if cfg.UseNetwork { 722 findPomLicenses(parsedPomProject, cfg) 723 } 724 pkgPomProject = parsedPomProject.JavaPomProject 725 licenses = append(licenses, parsedPomProject.Licenses...) 726 } 727 728 p := pkg.Package{ 729 Name: pomProperties.ArtifactID, 730 Version: pomProperties.Version, 731 Locations: file.NewLocationSet( 732 location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), 733 ), 734 Licenses: pkg.NewLicenseSet(licenses...), 735 Language: pkg.Java, 736 Type: pomProperties.PkgTypeIndicated(), 737 Metadata: pkg.JavaArchive{ 738 VirtualPath: virtualPath, 739 PomProperties: &pomProperties, 740 PomProject: pkgPomProject, 741 Parent: parentPkg, 742 }, 743 } 744 745 if packageIdentitiesMatch(p, parentPkg) { 746 updateParentPackage(p, parentPkg) 747 return nil 748 } 749 750 return &p 751 } 752 753 func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool { 754 metadata, ok := p.Metadata.(pkg.JavaArchive) 755 parentMetadata, parentOk := parentPkg.Metadata.(pkg.JavaArchive) 756 if !ok || !parentOk { 757 switch { 758 case !ok: 759 log.WithFields("package", p.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", p.Name) 760 case !parentOk: 761 log.WithFields("package", parentPkg.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", parentPkg.Name) 762 } 763 // if we can't extract metadata, we can check for matching identities via the package name 764 // this is not ideal, but it's better than nothing - this should not be used if we have Metadata 765 766 return uniquePkgKey("", &p) == uniquePkgKey("", parentPkg) 767 } 768 769 // try to determine identity with the metadata 770 groupID := groupIDFromJavaMetadata(p.Name, metadata) 771 parentGroupID := groupIDFromJavaMetadata(parentPkg.Name, parentMetadata) 772 if uniquePkgKey(groupID, &p) == uniquePkgKey(parentGroupID, parentPkg) { 773 return true 774 } 775 776 // the virtual path matches... 777 if parentMetadata.VirtualPath == metadata.VirtualPath { 778 return true 779 } 780 781 // the pom artifactId is the parent name 782 // note: you CANNOT use name-is-subset-of-artifact-id or vice versa --this is too generic. Shaded jars are a good 783 // example of this: where the package name is "cloudbees-analytics-segment-driver" and a child is "analytics", but 784 // they do not indicate the same package. 785 // NOTE: artifactId might not be a good indicator of uniqueness since archives can contain forks with the same name 786 // from different groups (e.g. "org.glassfish.jaxb.jaxb-core" and "com.sun.xml.bind.jaxb-core") 787 // we will use this check as a last resort 788 if metadata.PomProperties != nil { 789 if metadata.PomProperties.ArtifactID != "" && parentPkg.Name == metadata.PomProperties.ArtifactID { 790 return true 791 } 792 } 793 return false 794 } 795 796 func updateParentPackage(p pkg.Package, parentPkg *pkg.Package) { 797 // we've run across more information about our parent package, add this info to the parent package metadata 798 // the pom properties is typically a better source of information for name and version than the manifest 799 parentPkg.Name = p.Name 800 parentPkg.Version = p.Version 801 802 // we may have learned more about the type via data in the pom properties 803 parentPkg.Type = p.Type 804 805 metadata, ok := p.Metadata.(pkg.JavaArchive) 806 if !ok { 807 return 808 } 809 pomPropertiesCopy := *metadata.PomProperties 810 811 // keep the pom properties, but don't overwrite existing pom properties 812 parentMetadata, ok := parentPkg.Metadata.(pkg.JavaArchive) 813 if ok && parentMetadata.PomProperties == nil { 814 parentMetadata.PomProperties = &pomPropertiesCopy 815 parentPkg.Metadata = parentMetadata 816 } 817 }