// Source: github.com/noqcks/syft@v0.0.0-20230920222752-a9e2c4e288e5/syft/pkg/cataloger/java/archive_parser.go

package java

import (
	"crypto"
	"fmt"
	"os"
	"path"
	"strings"

	intFile "github.com/anchore/syft/internal/file"
	"github.com/anchore/syft/internal/log"
	"github.com/anchore/syft/syft/artifact"
	"github.com/anchore/syft/syft/file"
	"github.com/anchore/syft/syft/pkg"
	"github.com/anchore/syft/syft/pkg/cataloger/generic"
)

// compile-time assertion that parseJavaArchive can be used as a generic.Parser
var _ generic.Parser = parseJavaArchive

// archiveFormatGlobs are the glob patterns used to find (nested) java archives by file extension.
var archiveFormatGlobs = []string{
	"**/*.jar",
	"**/*.war",
	"**/*.ear",
	"**/*.par",
	"**/*.sar",
	"**/*.nar",
	"**/*.jpi",
	"**/*.hpi",
	"**/*.lpkg", // Zip-compressed package used to deploy applications
	// (aka plugins) to Liferay Portal server. Those files contain .JAR(s) and a .PROPERTIES file; the latter
	// has information about the application and installation requirements.
	// NOTE(jonasagx): If you would like to test it with an lpkg file,
	// use: https://web.liferay.com/marketplace/-/mp/download/25019275/7403
	// Liferay makes it pretty cumbersome to make such plugins; their docs are
	// out of date, and they charge for their IDE. If you find an example
	// project that we can build in CI feel free to include it
}

// javaArchiveHashes are all the current hash algorithms used to calculate archive digests
var javaArchiveHashes = []crypto.Hash{
	crypto.SHA1,
}

// archiveParser holds the state needed to catalog a single java archive.
type archiveParser struct {
	fileManifest intFile.ZipFileManifest // listing of the archive's entries, built from archivePath
	location     file.Location           // location of the archive as given by the reader being parsed
	archivePath  string                  // path the archive was saved to by saveArchiveToTmp
	contentPath  string                  // path returned by saveArchiveToTmp; nested archives are extracted relative to it
	fileInfo     archiveFilename         // details derived from the archive's file name
	detectNested bool                    // when true, parse also catalogs archives found inside this archive
}

// parseJavaArchive is a parser function for java archive contents, returning all Java libraries and nested archives.
54 func parseJavaArchive(_ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { 55 parser, cleanupFn, err := newJavaArchiveParser(reader, true) 56 // note: even on error, we should always run cleanup functions 57 defer cleanupFn() 58 if err != nil { 59 return nil, nil, err 60 } 61 return parser.parse() 62 } 63 64 // uniquePkgKey creates a unique string to identify the given package. 65 func uniquePkgKey(groupID string, p *pkg.Package) string { 66 if p == nil { 67 return "" 68 } 69 return fmt.Sprintf("%s|%s|%s", groupID, p.Name, p.Version) 70 } 71 72 // newJavaArchiveParser returns a new java archive parser object for the given archive. Can be configured to discover 73 // and parse nested archives or ignore them. 74 func newJavaArchiveParser(reader file.LocationReadCloser, detectNested bool) (*archiveParser, func(), error) { 75 // fetch the last element of the virtual path 76 virtualElements := strings.Split(reader.AccessPath(), ":") 77 currentFilepath := virtualElements[len(virtualElements)-1] 78 79 contentPath, archivePath, cleanupFn, err := saveArchiveToTmp(currentFilepath, reader) 80 if err != nil { 81 return nil, cleanupFn, fmt.Errorf("unable to process java archive: %w", err) 82 } 83 84 fileManifest, err := intFile.NewZipFileManifest(archivePath) 85 if err != nil { 86 return nil, cleanupFn, fmt.Errorf("unable to read files from java archive: %w", err) 87 } 88 89 return &archiveParser{ 90 fileManifest: fileManifest, 91 location: reader.Location, 92 archivePath: archivePath, 93 contentPath: contentPath, 94 fileInfo: newJavaArchiveFilename(currentFilepath), 95 detectNested: detectNested, 96 }, cleanupFn, nil 97 } 98 99 // parse the loaded archive and return all packages found. 
100 func (j *archiveParser) parse() ([]pkg.Package, []artifact.Relationship, error) { 101 var pkgs []pkg.Package 102 var relationships []artifact.Relationship 103 104 // find the parent package from the java manifest 105 parentPkg, err := j.discoverMainPackage() 106 if err != nil { 107 return nil, nil, fmt.Errorf("could not generate package from %s: %w", j.location, err) 108 } 109 110 // find aux packages from pom.properties/pom.xml and potentially modify the existing parentPkg 111 // NOTE: we cannot generate sha1 digests from packages discovered via pom.properties/pom.xml 112 auxPkgs, err := j.discoverPkgsFromAllMavenFiles(parentPkg) 113 if err != nil { 114 return nil, nil, err 115 } 116 pkgs = append(pkgs, auxPkgs...) 117 118 if j.detectNested { 119 // find nested java archive packages 120 nestedPkgs, nestedRelationships, err := j.discoverPkgsFromNestedArchives(parentPkg) 121 if err != nil { 122 return nil, nil, err 123 } 124 pkgs = append(pkgs, nestedPkgs...) 125 relationships = append(relationships, nestedRelationships...) 126 } 127 128 // lastly, add the parent package to the list (assuming the parent exists) 129 if parentPkg != nil { 130 pkgs = append([]pkg.Package{*parentPkg}, pkgs...) 131 } 132 133 // add pURLs to all packages found 134 // note: since package information may change after initial creation when parsing multiple locations within the 135 // jar, we wait until the conclusion of the parsing process before synthesizing pURLs. 136 for i := range pkgs { 137 p := &pkgs[i] 138 if m, ok := p.Metadata.(pkg.JavaMetadata); ok { 139 p.PURL = packageURL(p.Name, p.Version, m) 140 } else { 141 log.WithFields("package", p.String()).Warn("unable to extract java metadata to generate purl") 142 } 143 p.SetID() 144 } 145 146 return pkgs, relationships, nil 147 } 148 149 // discoverMainPackage parses the root Java manifest used as the parent package to all discovered nested packages. 
150 func (j *archiveParser) discoverMainPackage() (*pkg.Package, error) { 151 // search and parse java manifest files 152 manifestMatches := j.fileManifest.GlobMatch(manifestGlob) 153 if len(manifestMatches) > 1 { 154 return nil, fmt.Errorf("found multiple manifests in the jar: %+v", manifestMatches) 155 } else if len(manifestMatches) == 0 { 156 // we did not find any manifests, but that may not be a problem (there may be other information to generate packages for) 157 return nil, nil 158 } 159 160 // fetch the manifest file 161 contents, err := intFile.ContentsFromZip(j.archivePath, manifestMatches...) 162 if err != nil { 163 return nil, fmt.Errorf("unable to extract java manifests (%s): %w", j.location, err) 164 } 165 166 // parse the manifest file into a rich object 167 manifestContents := contents[manifestMatches[0]] 168 manifest, err := parseJavaManifest(j.archivePath, strings.NewReader(manifestContents)) 169 if err != nil { 170 log.Warnf("failed to parse java manifest (%s): %+v", j.location, err) 171 return nil, nil 172 } 173 174 archiveCloser, err := os.Open(j.archivePath) 175 if err != nil { 176 return nil, fmt.Errorf("unable to open archive path (%s): %w", j.archivePath, err) 177 } 178 defer archiveCloser.Close() 179 180 // grab and assign digest for the entire archive 181 digests, err := intFile.NewDigestsFromFile(archiveCloser, javaArchiveHashes) 182 if err != nil { 183 log.Warnf("failed to create digest for file=%q: %+v", j.archivePath, err) 184 } 185 186 // we use j.location because we want to associate the license declaration with where we discovered the contents in the manifest 187 licenses := pkg.NewLicensesFromLocation(j.location, selectLicenses(manifest)...) 188 /* 189 We should name and version from, in this order: 190 1. pom.properties if we find exactly 1 191 2. pom.xml if we find exactly 1 192 3. manifest 193 4. 
filename 194 */ 195 name, version := j.guessMainPackageNameAndVersionFromPomInfo() 196 if name == "" { 197 name = selectName(manifest, j.fileInfo) 198 } 199 if version == "" { 200 version = selectVersion(manifest, j.fileInfo) 201 } 202 return &pkg.Package{ 203 // TODO: maybe select name should just have a pom properties in it? 204 Name: name, 205 Version: version, 206 Language: pkg.Java, 207 Licenses: pkg.NewLicenseSet(licenses...), 208 Locations: file.NewLocationSet( 209 j.location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), 210 ), 211 Type: j.fileInfo.pkgType(), 212 MetadataType: pkg.JavaMetadataType, 213 Metadata: pkg.JavaMetadata{ 214 VirtualPath: j.location.AccessPath(), 215 Manifest: manifest, 216 ArchiveDigests: digests, 217 }, 218 }, nil 219 } 220 221 func (j *archiveParser) guessMainPackageNameAndVersionFromPomInfo() (string, string) { 222 pomPropertyMatches := j.fileManifest.GlobMatch(pomPropertiesGlob) 223 pomMatches := j.fileManifest.GlobMatch(pomXMLGlob) 224 var pomPropertiesObject pkg.PomProperties 225 var pomProjectObject pkg.PomProject 226 if len(pomPropertyMatches) == 1 || len(pomMatches) == 1 { 227 // we have exactly 1 pom.properties or pom.xml in the archive; assume it represents the 228 // package we're scanning if the names seem like a plausible match 229 properties, _ := pomPropertiesByParentPath(j.archivePath, j.location, pomPropertyMatches) 230 projects, _ := pomProjectByParentPath(j.archivePath, j.location, pomMatches) 231 232 for parentPath, propertiesObj := range properties { 233 if propertiesObj.ArtifactID != "" && j.fileInfo.name != "" && strings.HasPrefix(propertiesObj.ArtifactID, j.fileInfo.name) { 234 pomPropertiesObject = propertiesObj 235 if proj, exists := projects[parentPath]; exists { 236 pomProjectObject = proj 237 } 238 } 239 } 240 } 241 name := pomPropertiesObject.ArtifactID 242 if name == "" { 243 name = pomProjectObject.ArtifactID 244 } 245 version := pomPropertiesObject.Version 246 if version 
== "" { 247 version = pomProjectObject.Version 248 } 249 return name, version 250 } 251 252 // discoverPkgsFromAllMavenFiles parses Maven POM properties/xml for a given 253 // parent package, returning all listed Java packages found for each pom 254 // properties discovered and potentially updating the given parentPkg with new 255 // data. 256 func (j *archiveParser) discoverPkgsFromAllMavenFiles(parentPkg *pkg.Package) ([]pkg.Package, error) { 257 if parentPkg == nil { 258 return nil, nil 259 } 260 261 var pkgs []pkg.Package 262 263 // pom.properties 264 properties, err := pomPropertiesByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(pomPropertiesGlob)) 265 if err != nil { 266 return nil, err 267 } 268 269 // pom.xml 270 projects, err := pomProjectByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(pomXMLGlob)) 271 if err != nil { 272 return nil, err 273 } 274 275 for parentPath, propertiesObj := range properties { 276 var pomProject *pkg.PomProject 277 if proj, exists := projects[parentPath]; exists { 278 pomProject = &proj 279 } 280 281 pkgFromPom := newPackageFromMavenData(propertiesObj, pomProject, parentPkg, j.location) 282 if pkgFromPom != nil { 283 pkgs = append(pkgs, *pkgFromPom) 284 } 285 } 286 287 return pkgs, nil 288 } 289 290 func (j *archiveParser) discoverPkgsFromNestedArchives(parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) { 291 // we know that all java archives are zip formatted files, so we can use the shared zip helper 292 return discoverPkgsFromZip(j.location, j.archivePath, j.contentPath, j.fileManifest, parentPkg) 293 } 294 295 // discoverPkgsFromZip finds Java archives within Java archives, returning all listed Java packages found and 296 // associating each discovered package to the given parent package. 
297 func discoverPkgsFromZip(location file.Location, archivePath, contentPath string, fileManifest intFile.ZipFileManifest, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) { 298 // search and parse pom.properties files & fetch the contents 299 openers, err := intFile.ExtractFromZipToUniqueTempFile(archivePath, contentPath, fileManifest.GlobMatch(archiveFormatGlobs...)...) 300 if err != nil { 301 return nil, nil, fmt.Errorf("unable to extract files from zip: %w", err) 302 } 303 304 return discoverPkgsFromOpeners(location, openers, parentPkg) 305 } 306 307 // discoverPkgsFromOpeners finds Java archives within the given files and associates them with the given parent package. 308 func discoverPkgsFromOpeners(location file.Location, openers map[string]intFile.Opener, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) { 309 var pkgs []pkg.Package 310 var relationships []artifact.Relationship 311 312 for pathWithinArchive, archiveOpener := range openers { 313 nestedPkgs, nestedRelationships, err := discoverPkgsFromOpener(location, pathWithinArchive, archiveOpener) 314 if err != nil { 315 log.WithFields("location", location.AccessPath()).Warnf("unable to discover java packages from opener: %+v", err) 316 continue 317 } 318 319 // attach the parent package to all discovered packages that are not already associated with a java archive 320 for _, p := range nestedPkgs { 321 if metadata, ok := p.Metadata.(pkg.JavaMetadata); ok { 322 if metadata.Parent == nil { 323 metadata.Parent = parentPkg 324 } 325 p.Metadata = metadata 326 } 327 pkgs = append(pkgs, p) 328 } 329 330 relationships = append(relationships, nestedRelationships...) 331 } 332 333 return pkgs, relationships, nil 334 } 335 336 // discoverPkgsFromOpener finds Java archives within the given file. 
337 func discoverPkgsFromOpener(location file.Location, pathWithinArchive string, archiveOpener intFile.Opener) ([]pkg.Package, []artifact.Relationship, error) { 338 archiveReadCloser, err := archiveOpener.Open() 339 if err != nil { 340 return nil, nil, fmt.Errorf("unable to open archived file from tempdir: %w", err) 341 } 342 defer func() { 343 if closeErr := archiveReadCloser.Close(); closeErr != nil { 344 log.Warnf("unable to close archived file from tempdir: %+v", closeErr) 345 } 346 }() 347 348 nestedPath := fmt.Sprintf("%s:%s", location.AccessPath(), pathWithinArchive) 349 nestedLocation := file.NewLocationFromCoordinates(location.Coordinates) 350 nestedLocation.VirtualPath = nestedPath 351 nestedPkgs, nestedRelationships, err := parseJavaArchive(nil, nil, file.LocationReadCloser{ 352 Location: nestedLocation, 353 ReadCloser: archiveReadCloser, 354 }) 355 if err != nil { 356 return nil, nil, fmt.Errorf("unable to process nested java archive (%s): %w", pathWithinArchive, err) 357 } 358 359 return nestedPkgs, nestedRelationships, nil 360 } 361 362 func pomPropertiesByParentPath(archivePath string, location file.Location, extractPaths []string) (map[string]pkg.PomProperties, error) { 363 contentsOfMavenPropertiesFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...) 
364 if err != nil { 365 return nil, fmt.Errorf("unable to extract maven files: %w", err) 366 } 367 368 propertiesByParentPath := make(map[string]pkg.PomProperties) 369 for filePath, fileContents := range contentsOfMavenPropertiesFiles { 370 pomProperties, err := parsePomProperties(filePath, strings.NewReader(fileContents)) 371 if err != nil { 372 log.WithFields("contents-path", filePath, "location", location.AccessPath()).Warnf("failed to parse pom.properties: %+v", err) 373 continue 374 } 375 376 if pomProperties == nil { 377 continue 378 } 379 380 if pomProperties.Version == "" || pomProperties.ArtifactID == "" { 381 // TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. We should discover the right parent and attach the correct info accordingly to each discovered package 382 continue 383 } 384 385 propertiesByParentPath[path.Dir(filePath)] = *pomProperties 386 } 387 388 return propertiesByParentPath, nil 389 } 390 391 func pomProjectByParentPath(archivePath string, location file.Location, extractPaths []string) (map[string]pkg.PomProject, error) { 392 contentsOfMavenProjectFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...) 393 if err != nil { 394 return nil, fmt.Errorf("unable to extract maven files: %w", err) 395 } 396 397 projectByParentPath := make(map[string]pkg.PomProject) 398 for filePath, fileContents := range contentsOfMavenProjectFiles { 399 pomProject, err := parsePomXMLProject(filePath, strings.NewReader(fileContents)) 400 if err != nil { 401 log.WithFields("contents-path", filePath, "location", location.AccessPath()).Warnf("failed to parse pom.xml: %+v", err) 402 continue 403 } 404 405 if pomProject == nil { 406 continue 407 } 408 409 if pomProject.Version == "" || pomProject.ArtifactID == "" { 410 // TODO: if there is no parentPkg (no java manifest) one of these poms could be the parent. 
We should discover the right parent and attach the correct info accordingly to each discovered package 411 continue 412 } 413 414 projectByParentPath[path.Dir(filePath)] = *pomProject 415 } 416 return projectByParentPath, nil 417 } 418 419 // newPackageFromMavenData processes a single Maven POM properties for a given parent package, returning all listed Java packages found and 420 // associating each discovered package to the given parent package. Note the pom.xml is optional, the pom.properties is not. 421 func newPackageFromMavenData(pomProperties pkg.PomProperties, pomProject *pkg.PomProject, parentPkg *pkg.Package, location file.Location) *pkg.Package { 422 // keep the artifact name within the virtual path if this package does not match the parent package 423 vPathSuffix := "" 424 groupID := "" 425 if parentMetadata, ok := parentPkg.Metadata.(pkg.JavaMetadata); ok { 426 groupID = groupIDFromJavaMetadata(parentPkg.Name, parentMetadata) 427 } 428 429 parentKey := fmt.Sprintf("%s:%s:%s", groupID, parentPkg.Name, parentPkg.Version) 430 // Since we don't have a package yet, it's important to use the same `field: value` association that we used when creating the parent package 431 // See below where Name => pomProperties.ArtifactID and Version => pomProperties.Version. We want to check for potentially nested identical 432 // packages and create equal virtual paths so they are de duped in the future 433 pomProjectKey := fmt.Sprintf("%s:%s:%s", pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version) 434 if parentKey != pomProjectKey { 435 // build a new virtual path suffix for the package that is different from the parent package 436 // we want to use the GroupID and ArtifactID here to preserve uniqueness 437 // Some packages have the same name but different group IDs (e.g. 
"org.glassfish.jaxb/jaxb-core", "com.sun.xml.bind/jaxb-core") 438 // https://github.com/anchore/syft/issues/1944 439 vPathSuffix += ":" + pomProperties.GroupID + ":" + pomProperties.ArtifactID 440 } 441 virtualPath := location.AccessPath() + vPathSuffix 442 443 // discovered props = new package 444 p := pkg.Package{ 445 Name: pomProperties.ArtifactID, 446 Version: pomProperties.Version, 447 Locations: file.NewLocationSet( 448 location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), 449 ), 450 Language: pkg.Java, 451 Type: pomProperties.PkgTypeIndicated(), 452 MetadataType: pkg.JavaMetadataType, 453 Metadata: pkg.JavaMetadata{ 454 VirtualPath: virtualPath, 455 PomProperties: &pomProperties, 456 PomProject: pomProject, 457 Parent: parentPkg, 458 }, 459 } 460 461 if packageIdentitiesMatch(p, parentPkg) { 462 updateParentPackage(p, parentPkg) 463 return nil 464 } 465 466 return &p 467 } 468 469 func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool { 470 metadata, ok := p.Metadata.(pkg.JavaMetadata) 471 parentMetadata, parentOk := parentPkg.Metadata.(pkg.JavaMetadata) 472 if !ok || !parentOk { 473 switch { 474 case !ok: 475 log.WithFields("package", p.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", p.Name) 476 case !parentOk: 477 log.WithFields("package", parentPkg.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", parentPkg.Name) 478 } 479 // if we can't extract metadata, we can check for matching identities via the package name 480 // this is not ideal, but it's better than nothing - this should not be used if we have Metadata 481 482 return uniquePkgKey("", &p) == uniquePkgKey("", parentPkg) 483 } 484 485 // try to determine identity with the metadata 486 groupID := groupIDFromJavaMetadata(p.Name, metadata) 487 parentGroupID := groupIDFromJavaMetadata(parentPkg.Name, parentMetadata) 488 if uniquePkgKey(groupID, 
&p) == uniquePkgKey(parentGroupID, parentPkg) { 489 return true 490 } 491 492 // the virtual path matches... 493 if parentMetadata.VirtualPath == metadata.VirtualPath { 494 return true 495 } 496 497 // the pom artifactId is the parent name 498 // note: you CANNOT use name-is-subset-of-artifact-id or vice versa --this is too generic. Shaded jars are a good 499 // example of this: where the package name is "cloudbees-analytics-segment-driver" and a child is "analytics", but 500 // they do not indicate the same package. 501 // NOTE: artifactId might not be a good indicator of uniqueness since archives can contain forks with the same name 502 // from different groups (e.g. "org.glassfish.jaxb.jaxb-core" and "com.sun.xml.bind.jaxb-core") 503 // we will use this check as a last resort 504 if metadata.PomProperties != nil { 505 if metadata.PomProperties.ArtifactID != "" && parentPkg.Name == metadata.PomProperties.ArtifactID { 506 return true 507 } 508 } 509 return false 510 } 511 512 func updateParentPackage(p pkg.Package, parentPkg *pkg.Package) { 513 // we've run across more information about our parent package, add this info to the parent package metadata 514 // the pom properties is typically a better source of information for name and version than the manifest 515 parentPkg.Name = p.Name 516 parentPkg.Version = p.Version 517 518 // we may have learned more about the type via data in the pom properties 519 parentPkg.Type = p.Type 520 521 metadata, ok := p.Metadata.(pkg.JavaMetadata) 522 if !ok { 523 return 524 } 525 pomPropertiesCopy := *metadata.PomProperties 526 527 // keep the pom properties, but don't overwrite existing pom properties 528 parentMetadata, ok := parentPkg.Metadata.(pkg.JavaMetadata) 529 if ok && parentMetadata.PomProperties == nil { 530 parentMetadata.PomProperties = &pomPropertiesCopy 531 parentPkg.Metadata = parentMetadata 532 } 533 }