github.com/lineaje-labs/syft@v0.98.1-0.20231227153149-9e393f60ff1b/syft/format/common/spdxhelpers/to_syft_model.go (about)

     1  package spdxhelpers
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"net/url"
     7  	"path"
     8  	"regexp"
     9  	"strconv"
    10  	"strings"
    11  
    12  	"github.com/spdx/tools-golang/spdx"
    13  	"github.com/spdx/tools-golang/spdx/v2/common"
    14  
    15  	"github.com/anchore/packageurl-go"
    16  	"github.com/anchore/syft/syft/artifact"
    17  	"github.com/anchore/syft/syft/cpe"
    18  	"github.com/anchore/syft/syft/file"
    19  	"github.com/anchore/syft/syft/format/common/util"
    20  	"github.com/anchore/syft/syft/license"
    21  	"github.com/anchore/syft/syft/linux"
    22  	"github.com/anchore/syft/syft/pkg"
    23  	"github.com/anchore/syft/syft/sbom"
    24  	"github.com/anchore/syft/syft/source"
    25  	"github.com/lineaje-labs/syft/internal/log"
    26  	"github.com/lineaje-labs/syft/internal/spdxlicense"
    27  )
    28  
    29  func ToSyftModel(doc *spdx.Document) (*sbom.SBOM, error) {
    30  	if doc == nil {
    31  		return nil, errors.New("cannot convert SPDX document to Syft model because document is nil")
    32  	}
    33  
    34  	spdxIDMap := make(map[string]any)
    35  
    36  	s := &sbom.SBOM{
    37  		Source: extractSource(spdxIDMap, doc),
    38  		Artifacts: sbom.Artifacts{
    39  			Packages:          pkg.NewCollection(),
    40  			FileMetadata:      map[file.Coordinates]file.Metadata{},
    41  			FileDigests:       map[file.Coordinates][]file.Digest{},
    42  			LinuxDistribution: findLinuxReleaseByPURL(doc),
    43  		},
    44  	}
    45  
    46  	collectSyftPackages(s, spdxIDMap, doc.Packages)
    47  
    48  	collectSyftFiles(s, spdxIDMap, doc)
    49  
    50  	s.Relationships = toSyftRelationships(spdxIDMap, doc)
    51  
    52  	return s, nil
    53  }
    54  
    55  func isDirectory(name string) bool {
    56  	if name == "." || name == ".." || strings.HasSuffix(name, "/") || !strings.Contains(path.Base(name), ".") {
    57  		return true
    58  	}
    59  	return false
    60  }
    61  
    62  func removePackage(packages []*spdx.Package, remove *spdx.Package) (pkgs []*spdx.Package) {
    63  	for _, p := range packages {
    64  		if p == remove {
    65  			continue
    66  		}
    67  		pkgs = append(pkgs, p)
    68  	}
    69  	return
    70  }
    71  
    72  func removeRelationships(relationships []*spdx.Relationship, spdxID spdx.ElementID) (relations []*spdx.Relationship) {
    73  	for _, r := range relationships {
    74  		if r.RefA.ElementRefID == spdxID || r.RefB.ElementRefID == spdxID {
    75  			continue
    76  		}
    77  		relations = append(relations, r)
    78  	}
    79  	return
    80  }
    81  
    82  func findRootPackages(doc *spdx.Document) (out []*spdx.Package) {
    83  	for _, p := range doc.Packages {
    84  		for _, r := range doc.Relationships {
    85  			describes := r.RefA.ElementRefID == "DOCUMENT" &&
    86  				r.Relationship == spdx.RelationshipDescribes &&
    87  				r.RefB.ElementRefID == p.PackageSPDXIdentifier
    88  
    89  			describedBy := r.RefB.ElementRefID == "DOCUMENT" &&
    90  				r.Relationship == spdx.RelationshipDescribedBy &&
    91  				r.RefA.ElementRefID == p.PackageSPDXIdentifier
    92  
    93  			if !describes && !describedBy {
    94  				continue
    95  			}
    96  
    97  			out = append(out, p)
    98  		}
    99  	}
   100  	return
   101  }
   102  
   103  func extractSource(spdxIDMap map[string]any, doc *spdx.Document) source.Description {
   104  	src := extractSourceFromNamespace(doc.DocumentNamespace)
   105  
   106  	rootPackages := findRootPackages(doc)
   107  
   108  	if len(rootPackages) != 1 {
   109  		return src
   110  	}
   111  
   112  	p := rootPackages[0]
   113  
   114  	switch p.PrimaryPackagePurpose {
   115  	case spdxPrimaryPurposeContainer:
   116  		src = containerSource(p)
   117  	case spdxPrimaryPurposeFile:
   118  		src = fileSource(p)
   119  	default:
   120  		return src
   121  	}
   122  
   123  	spdxIDMap[string(p.PackageSPDXIdentifier)] = src
   124  
   125  	doc.Packages = removePackage(doc.Packages, p)
   126  	doc.Relationships = removeRelationships(doc.Relationships, p.PackageSPDXIdentifier)
   127  
   128  	return src
   129  }
   130  
   131  func containerSource(p *spdx.Package) source.Description {
   132  	id := string(p.PackageSPDXIdentifier)
   133  
   134  	container := p.PackageName
   135  	v := p.PackageVersion
   136  	if v != "" {
   137  		container += ":" + v
   138  	}
   139  
   140  	digest := ""
   141  	if len(p.PackageChecksums) > 0 {
   142  		c := p.PackageChecksums[0]
   143  		digest = fmt.Sprintf("%s:%s", fromChecksumAlgorithm(c.Algorithm), c.Value)
   144  	}
   145  	return source.Description{
   146  		ID:      id,
   147  		Name:    p.PackageName,
   148  		Version: p.PackageVersion,
   149  		Metadata: source.StereoscopeImageSourceMetadata{
   150  			UserInput:      container,
   151  			ID:             id,
   152  			Layers:         nil, // TODO handle formats with nested layer packages like Tern and K8s BOM tool
   153  			ManifestDigest: digest,
   154  		},
   155  	}
   156  }
   157  
   158  func fileSource(p *spdx.Package) source.Description {
   159  	typeRegex := regexp.MustCompile("^DocumentRoot-([^-]+)-.*$")
   160  	typeName := typeRegex.ReplaceAllString(string(p.PackageSPDXIdentifier), "$1")
   161  
   162  	var version string
   163  	var metadata any
   164  	switch {
   165  	case typeName == prefixDirectory:
   166  		// is a Syft SBOM, explicitly a directory source
   167  		metadata, version = directorySourceMetadata(p)
   168  	case typeName == prefixFile:
   169  		// is a Syft SBOM, explicitly a file source
   170  		metadata, version = fileSourceMetadata(p)
   171  	case isDirectory(p.PackageName):
   172  		// is a non-Syft SBOM, which looks like a directory
   173  		metadata, version = directorySourceMetadata(p)
   174  	default:
   175  		// is a non-Syft SBOM, which is probably a file
   176  		metadata, version = fileSourceMetadata(p)
   177  	}
   178  
   179  	return source.Description{
   180  		ID:       string(p.PackageSPDXIdentifier),
   181  		Name:     p.PackageName,
   182  		Version:  version,
   183  		Metadata: metadata,
   184  	}
   185  }
   186  
   187  func fileSourceMetadata(p *spdx.Package) (any, string) {
   188  	version := p.PackageVersion
   189  
   190  	m := source.FileSourceMetadata{
   191  		Path: p.PackageName,
   192  	}
   193  	// if this is a Syft SBOM, we might have output a digest as the version
   194  	checksum := toChecksum(p.PackageVersion)
   195  	for _, d := range p.PackageChecksums {
   196  		if checksum != nil && checksum.Value == d.Value {
   197  			version = ""
   198  		}
   199  		m.Digests = append(m.Digests, file.Digest{
   200  			Algorithm: fromChecksumAlgorithm(d.Algorithm),
   201  			Value:     d.Value,
   202  		})
   203  	}
   204  
   205  	return m, version
   206  }
   207  
   208  func directorySourceMetadata(p *spdx.Package) (any, string) {
   209  	return source.DirectorySourceMetadata{
   210  		Path: p.PackageName,
   211  		Base: "",
   212  	}, p.PackageVersion
   213  }
   214  
   215  // NOTE(jonas): SPDX doesn't inform what an SBOM is about,
   216  // image, directory, for example. This is our best effort to determine
   217  // the scheme. Syft-generated SBOMs have in the namespace
   218  // field a type encoded, which we try to identify here.
   219  func extractSourceFromNamespace(ns string) source.Description {
   220  	u, err := url.Parse(ns)
   221  	if err != nil {
   222  		return source.Description{
   223  			Metadata: nil,
   224  		}
   225  	}
   226  
   227  	parts := strings.Split(u.Path, "/")
   228  	for _, p := range parts {
   229  		switch p {
   230  		case inputFile:
   231  			return source.Description{
   232  				Metadata: source.FileSourceMetadata{},
   233  			}
   234  		case inputImage:
   235  			return source.Description{
   236  				Metadata: source.StereoscopeImageSourceMetadata{},
   237  			}
   238  		case inputDirectory:
   239  			return source.Description{
   240  				Metadata: source.DirectorySourceMetadata{},
   241  			}
   242  		}
   243  	}
   244  	return source.Description{}
   245  }
   246  
   247  func findLinuxReleaseByPURL(doc *spdx.Document) *linux.Release {
   248  	for _, p := range doc.Packages {
   249  		purlValue := findPURLValue(p)
   250  		if purlValue == "" {
   251  			continue
   252  		}
   253  		purl, err := packageurl.FromString(purlValue)
   254  		if err != nil {
   255  			log.Warnf("unable to parse purl: %s", purlValue)
   256  			continue
   257  		}
   258  		distro := findQualifierValue(purl, pkg.PURLQualifierDistro)
   259  		if distro != "" {
   260  			parts := strings.Split(distro, "-")
   261  			name := parts[0]
   262  			version := ""
   263  			if len(parts) > 1 {
   264  				version = parts[1]
   265  			}
   266  			return &linux.Release{
   267  				PrettyName: name,
   268  				Name:       name,
   269  				ID:         name,
   270  				IDLike:     []string{name},
   271  				Version:    version,
   272  				VersionID:  version,
   273  			}
   274  		}
   275  	}
   276  
   277  	return nil
   278  }
   279  
   280  func collectSyftPackages(s *sbom.SBOM, spdxIDMap map[string]any, packages []*spdx.Package) {
   281  	for _, p := range packages {
   282  		syftPkg := toSyftPackage(p)
   283  		spdxIDMap[string(p.PackageSPDXIdentifier)] = syftPkg
   284  		s.Artifacts.Packages.Add(syftPkg)
   285  	}
   286  }
   287  
   288  func collectSyftFiles(s *sbom.SBOM, spdxIDMap map[string]any, doc *spdx.Document) {
   289  	for _, p := range doc.Packages {
   290  		for _, f := range p.Files {
   291  			l := toSyftLocation(f)
   292  			spdxIDMap[string(f.FileSPDXIdentifier)] = l
   293  
   294  			s.Artifacts.FileMetadata[l.Coordinates] = toFileMetadata(f)
   295  			s.Artifacts.FileDigests[l.Coordinates] = toFileDigests(f)
   296  		}
   297  	}
   298  
   299  	for _, f := range doc.Files {
   300  		l := toSyftLocation(f)
   301  		spdxIDMap[string(f.FileSPDXIdentifier)] = l
   302  
   303  		s.Artifacts.FileMetadata[l.Coordinates] = toFileMetadata(f)
   304  		s.Artifacts.FileDigests[l.Coordinates] = toFileDigests(f)
   305  	}
   306  }
   307  
   308  func toFileDigests(f *spdx.File) (digests []file.Digest) {
   309  	for _, digest := range f.Checksums {
   310  		digests = append(digests, file.Digest{
   311  			Algorithm: fromChecksumAlgorithm(digest.Algorithm),
   312  			Value:     digest.Value,
   313  		})
   314  	}
   315  	return digests
   316  }
   317  
   318  func fromChecksumAlgorithm(algorithm common.ChecksumAlgorithm) string {
   319  	return strings.ToLower(string(algorithm))
   320  }
   321  
   322  func toFileMetadata(f *spdx.File) (meta file.Metadata) {
   323  	// FIXME Syft is currently lossy due to the SPDX 2.2.1 spec not supporting arbitrary mimetypes
   324  	for _, typ := range f.FileTypes {
   325  		switch FileType(typ) {
   326  		case ImageFileType:
   327  			meta.MIMEType = "image/"
   328  		case VideoFileType:
   329  			meta.MIMEType = "video/"
   330  		case ApplicationFileType:
   331  			meta.MIMEType = "application/"
   332  		case TextFileType:
   333  			meta.MIMEType = "text/"
   334  		case AudioFileType:
   335  			meta.MIMEType = "audio/"
   336  		case BinaryFileType:
   337  		case ArchiveFileType:
   338  		case OtherFileType:
   339  		}
   340  	}
   341  	return meta
   342  }
   343  
   344  func toSyftRelationships(spdxIDMap map[string]any, doc *spdx.Document) []artifact.Relationship {
   345  	out := collectDocRelationships(spdxIDMap, doc)
   346  
   347  	out = append(out, collectPackageFileRelationships(spdxIDMap, doc)...)
   348  
   349  	return out
   350  }
   351  
   352  func collectDocRelationships(spdxIDMap map[string]any, doc *spdx.Document) (out []artifact.Relationship) {
   353  	for _, r := range doc.Relationships {
   354  		// FIXME what to do with r.RefA.DocumentRefID and r.RefA.SpecialID
   355  		if r.RefA.DocumentRefID != "" && requireAndTrimPrefix(r.RefA.DocumentRefID, "DocumentRef-") != string(doc.SPDXIdentifier) {
   356  			log.Debugf("ignoring relationship to external document: %+v", r)
   357  			continue
   358  		}
   359  		a := spdxIDMap[string(r.RefA.ElementRefID)]
   360  		b := spdxIDMap[string(r.RefB.ElementRefID)]
   361  		from, fromOk := a.(pkg.Package)
   362  		toPackage, toPackageOk := b.(pkg.Package)
   363  		toLocation, toLocationOk := b.(file.Location)
   364  		if !fromOk || !(toPackageOk || toLocationOk) {
   365  			log.Debugf("unable to find valid relationship mapping from SPDX, ignoring: (from: %+v) (to: %+v)", a, b)
   366  			continue
   367  		}
   368  		var to artifact.Identifiable
   369  		var typ artifact.RelationshipType
   370  		if toLocationOk {
   371  			switch RelationshipType(r.Relationship) {
   372  			case ContainsRelationship:
   373  				typ = artifact.ContainsRelationship
   374  				to = toLocation
   375  			case OtherRelationship:
   376  				// Encoding uses a specifically formatted comment...
   377  				if strings.Index(r.RelationshipComment, string(artifact.EvidentByRelationship)) == 0 {
   378  					typ = artifact.EvidentByRelationship
   379  					to = toLocation
   380  				}
   381  			}
   382  		} else {
   383  			switch RelationshipType(r.Relationship) {
   384  			case ContainsRelationship:
   385  				typ = artifact.ContainsRelationship
   386  				to = toPackage
   387  			case OtherRelationship:
   388  				// Encoding uses a specifically formatted comment...
   389  				if strings.Index(r.RelationshipComment, string(artifact.OwnershipByFileOverlapRelationship)) == 0 {
   390  					typ = artifact.OwnershipByFileOverlapRelationship
   391  					to = toPackage
   392  				}
   393  			}
   394  		}
   395  		if typ != "" && to != nil {
   396  			out = append(out, artifact.Relationship{
   397  				From: from,
   398  				To:   to,
   399  				Type: typ,
   400  			})
   401  		}
   402  	}
   403  	return out
   404  }
   405  
   406  // collectPackageFileRelationships add relationships for direct files
   407  func collectPackageFileRelationships(spdxIDMap map[string]any, doc *spdx.Document) (out []artifact.Relationship) {
   408  	for _, p := range doc.Packages {
   409  		a := spdxIDMap[string(p.PackageSPDXIdentifier)]
   410  		from, fromOk := a.(pkg.Package)
   411  		if !fromOk {
   412  			continue
   413  		}
   414  		for _, f := range p.Files {
   415  			b := spdxIDMap[string(f.FileSPDXIdentifier)]
   416  			to, toLocationOk := b.(file.Location)
   417  			if !toLocationOk {
   418  				continue
   419  			}
   420  			out = append(out, artifact.Relationship{
   421  				From: from,
   422  				To:   to,
   423  				Type: artifact.ContainsRelationship,
   424  			})
   425  		}
   426  	}
   427  	return out
   428  }
   429  
   430  func toSyftCoordinates(f *spdx.File) file.Coordinates {
   431  	const layerIDPrefix = "layerID: "
   432  	var fileSystemID string
   433  	if strings.Index(f.FileComment, layerIDPrefix) == 0 {
   434  		fileSystemID = strings.TrimPrefix(f.FileComment, layerIDPrefix)
   435  	}
   436  	if strings.Index(string(f.FileSPDXIdentifier), layerIDPrefix) == 0 {
   437  		fileSystemID = strings.TrimPrefix(string(f.FileSPDXIdentifier), layerIDPrefix)
   438  	}
   439  	return file.Coordinates{
   440  		RealPath:     f.FileName,
   441  		FileSystemID: fileSystemID,
   442  	}
   443  }
   444  
   445  func toSyftLocation(f *spdx.File) file.Location {
   446  	l := file.NewVirtualLocationFromCoordinates(toSyftCoordinates(f), f.FileName)
   447  	return l
   448  }
   449  
   450  func requireAndTrimPrefix(val interface{}, prefix string) string {
   451  	if v, ok := val.(string); ok {
   452  		if i := strings.Index(v, prefix); i == 0 {
   453  			return strings.Replace(v, prefix, "", 1)
   454  		}
   455  	}
   456  	return ""
   457  }
   458  
   459  type pkgInfo struct {
   460  	purl packageurl.PackageURL
   461  	typ  pkg.Type
   462  	lang pkg.Language
   463  }
   464  
   465  func (p *pkgInfo) qualifierValue(name string) string {
   466  	return findQualifierValue(p.purl, name)
   467  }
   468  
   469  func findQualifierValue(purl packageurl.PackageURL, qualifier string) string {
   470  	for _, q := range purl.Qualifiers {
   471  		if q.Key == qualifier {
   472  			return q.Value
   473  		}
   474  	}
   475  	return ""
   476  }
   477  
   478  func extractPkgInfo(p *spdx.Package) pkgInfo {
   479  	pu := findPURLValue(p)
   480  	purl, err := packageurl.FromString(pu)
   481  	if err != nil {
   482  		return pkgInfo{}
   483  	}
   484  	return pkgInfo{
   485  		purl,
   486  		pkg.TypeByName(purl.Type),
   487  		pkg.LanguageByName(purl.Type),
   488  	}
   489  }
   490  
   491  func toSyftPackage(p *spdx.Package) pkg.Package {
   492  	info := extractPkgInfo(p)
   493  	sP := &pkg.Package{
   494  		Type:     info.typ,
   495  		Name:     p.PackageName,
   496  		Version:  p.PackageVersion,
   497  		Licenses: pkg.NewLicenseSet(parseSPDXLicenses(p)...),
   498  		CPEs:     extractCPEs(p),
   499  		PURL:     purlValue(info.purl),
   500  		Language: info.lang,
   501  		Metadata: extractMetadata(p, info),
   502  	}
   503  
   504  	sP.SetID()
   505  
   506  	return *sP
   507  }
   508  
   509  func purlValue(purl packageurl.PackageURL) string {
   510  	val := purl.String()
   511  	if _, err := packageurl.FromString(val); err != nil {
   512  		return ""
   513  	}
   514  	return val
   515  }
   516  
   517  func parseSPDXLicenses(p *spdx.Package) []pkg.License {
   518  	licenses := make([]pkg.License, 0)
   519  
   520  	// concluded
   521  	if p.PackageLicenseConcluded != NOASSERTION && p.PackageLicenseConcluded != NONE && p.PackageLicenseConcluded != "" {
   522  		l := pkg.NewLicense(cleanSPDXID(p.PackageLicenseConcluded))
   523  		l.Type = license.Concluded
   524  		licenses = append(licenses, l)
   525  	}
   526  
   527  	// declared
   528  	if p.PackageLicenseDeclared != NOASSERTION && p.PackageLicenseDeclared != NONE && p.PackageLicenseDeclared != "" {
   529  		l := pkg.NewLicense(cleanSPDXID(p.PackageLicenseDeclared))
   530  		l.Type = license.Declared
   531  		licenses = append(licenses, l)
   532  	}
   533  
   534  	return licenses
   535  }
   536  
   537  func cleanSPDXID(id string) string {
   538  	return strings.TrimPrefix(id, spdxlicense.LicenseRefPrefix)
   539  }
   540  
   541  //nolint:funlen
   542  func extractMetadata(p *spdx.Package, info pkgInfo) any {
   543  	arch := info.qualifierValue(pkg.PURLQualifierArch)
   544  	upstreamValue := info.qualifierValue(pkg.PURLQualifierUpstream)
   545  	upstream := strings.SplitN(upstreamValue, "@", 2)
   546  	upstreamName := upstream[0]
   547  	upstreamVersion := ""
   548  	if len(upstream) > 1 {
   549  		upstreamVersion = upstream[1]
   550  	}
   551  	supplier := ""
   552  	if p.PackageSupplier != nil {
   553  		supplier = p.PackageSupplier.Supplier
   554  	}
   555  	originator := ""
   556  	if p.PackageOriginator != nil {
   557  		originator = p.PackageOriginator.Originator
   558  	}
   559  	switch info.typ {
   560  	case pkg.ApkPkg:
   561  		return pkg.ApkDBEntry{
   562  			Package:       p.PackageName,
   563  			OriginPackage: upstreamName,
   564  			Maintainer:    supplier,
   565  			Version:       p.PackageVersion,
   566  			Architecture:  arch,
   567  			URL:           p.PackageHomePage,
   568  			Description:   p.PackageDescription,
   569  		}
   570  	case pkg.RpmPkg:
   571  		converted, err := strconv.Atoi(info.qualifierValue(pkg.PURLQualifierEpoch))
   572  		var epoch *int
   573  		if err != nil {
   574  			epoch = nil
   575  		} else {
   576  			epoch = &converted
   577  		}
   578  		return pkg.RpmDBEntry{
   579  			Name:      p.PackageName,
   580  			Version:   p.PackageVersion,
   581  			Epoch:     epoch,
   582  			Arch:      arch,
   583  			SourceRpm: upstreamValue,
   584  			Vendor:    originator,
   585  		}
   586  	case pkg.DebPkg:
   587  		return pkg.DpkgDBEntry{
   588  			Package:       p.PackageName,
   589  			Source:        upstreamName,
   590  			Version:       p.PackageVersion,
   591  			SourceVersion: upstreamVersion,
   592  			Architecture:  arch,
   593  			Maintainer:    originator,
   594  		}
   595  	case pkg.JavaPkg:
   596  		var digests []file.Digest
   597  		for _, value := range p.PackageChecksums {
   598  			digests = append(digests, file.Digest{Algorithm: fromChecksumAlgorithm(value.Algorithm), Value: value.Value})
   599  		}
   600  		return pkg.JavaArchive{
   601  			ArchiveDigests: digests,
   602  		}
   603  	case pkg.GoModulePkg:
   604  		var h1Digest string
   605  		for _, value := range p.PackageChecksums {
   606  			digest, err := util.HDigestFromSHA(fromChecksumAlgorithm(value.Algorithm), value.Value)
   607  			if err != nil {
   608  				log.Debugf("invalid h1digest: %v %v", value, err)
   609  				continue
   610  			}
   611  			h1Digest = digest
   612  			break
   613  		}
   614  		return pkg.GolangBinaryBuildinfoEntry{
   615  			H1Digest: h1Digest,
   616  		}
   617  	}
   618  	return nil
   619  }
   620  
   621  func findPURLValue(p *spdx.Package) string {
   622  	for _, r := range p.PackageExternalReferences {
   623  		if r.RefType == string(PurlExternalRefType) {
   624  			return r.Locator
   625  		}
   626  	}
   627  	return ""
   628  }
   629  
   630  func extractCPEs(p *spdx.Package) (cpes []cpe.CPE) {
   631  	for _, r := range p.PackageExternalReferences {
   632  		if r.RefType == string(Cpe23ExternalRefType) {
   633  			c, err := cpe.New(r.Locator)
   634  			if err != nil {
   635  				log.Warnf("unable to extract SPDX CPE=%q: %+v", r.Locator, err)
   636  				continue
   637  			}
   638  			cpes = append(cpes, c)
   639  		}
   640  	}
   641  	return cpes
   642  }