github.com/anchore/syft@v1.38.2/syft/pkg/license.go (about)

     1  package pkg
     2  
     3  import (
     4  	"context"
     5  	"crypto/sha256"
     6  	"encoding/hex"
     7  	"fmt"
     8  	"io"
     9  	"net/url"
    10  	"sort"
    11  	"strings"
    12  
    13  	"github.com/scylladb/go-set/strset"
    14  
    15  	"github.com/anchore/syft/internal/licenses"
    16  	"github.com/anchore/syft/internal/log"
    17  	"github.com/anchore/syft/internal/spdxlicense"
    18  	"github.com/anchore/syft/syft/artifact"
    19  	"github.com/anchore/syft/syft/file"
    20  	"github.com/anchore/syft/syft/license"
    21  )
    22  
// Compile-time assertion that *Licenses satisfies sort.Interface.
var _ sort.Interface = (*Licenses)(nil)
    24  
// License represents an SPDX expression or license value extracted from a package's metadata.
// A License is a unique combination of value, expression and type. Its URLs and locations are
// treated as accumulated evidence of where the license was found and how it was sourced, and are
// merged across equal licenses rather than used to tell licenses apart.
// This differs from how packages are treated: package paths are considered when deciding whether
// packages should be kept separate, whereas for licenses we are only looking for evidence of where
// a license was declared/concluded for a given package.
type License struct {
	// SPDXExpression is the parsed SPDX license expression (e.g. "MIT OR Apache-2.0").
	SPDXExpression string

	// Value is the original raw license string as found in metadata (e.g. "mit or apache-2").
	Value string

	// Type is the classification of how this license was discovered (declared, concluded, etc.).
	// A Concluded license is the license the SBOM creator believes governs the package (human crafted or altered SBOM).
	// A Declared license is what the authors of a project believe governs the package (this is the default type syft uses).
	Type license.Type

	// Contents is the full license text, if available. If a license is given as its full text in the
	// metadata rather than as a value or SPDX expression, this field is used to represent that data.
	Contents string `hash:"ignore"`

	// URLs is the list of URLs where license information was found. These are ignored for uniqueness
	// since we merge this field across equal licenses.
	URLs []string `hash:"ignore"`

	// Locations is the set of file locations where this license was discovered. These are ignored for
	// uniqueness since we merge this field across equal licenses.
	Locations file.LocationSet `hash:"ignore"`
}
    55  
// Licenses is a sortable collection of License objects implementing sort.Interface.
// Elements are compared by Value, then SPDXExpression, then Type, then Contents (see Less).
type Licenses []License
    58  
// Len returns the number of licenses in the collection (part of sort.Interface).
func (l Licenses) Len() int {
	return len(l)
}
    62  
    63  func (l Licenses) Less(i, j int) bool {
    64  	if l[i].Value == l[j].Value {
    65  		if l[i].SPDXExpression == l[j].SPDXExpression {
    66  			if l[i].Type == l[j].Type {
    67  				if l[i].Contents == l[j].Contents {
    68  					// While URLs and location are not exclusive fields
    69  					// returning true here reduces the number of swaps
    70  					// while keeping a consistent sort order of
    71  					// the order that they appear in the list initially
    72  					// If users in the future have a preference to sorting based
    73  					// on the slice representation of either field, we can update this code
    74  					return true
    75  				}
    76  				return l[i].Contents < l[j].Contents
    77  			}
    78  			return l[i].Type < l[j].Type
    79  		}
    80  		return l[i].SPDXExpression < l[j].SPDXExpression
    81  	}
    82  	return l[i].Value < l[j].Value
    83  }
    84  
// Swap exchanges the licenses at indexes i and j (part of sort.Interface).
func (l Licenses) Swap(i, j int) {
	l[i], l[j] = l[j], l[i]
}
    88  
    89  func NewLicensesFromReadCloserWithContext(ctx context.Context, closer file.LocationReadCloser) []License {
    90  	//Definition: The license that the auditor or scanning tool concludes applies, based on the actual contents of the files.
    91  	//Source: Derived from analyzing the source code, license headers, and full license texts in the files.
    92  	// Given we are scanning the contents of the file, we should use the Concluded License type.
    93  	return newLicenseBuilder().WithContents(closer).WithLocations(closer.Location).WithType(license.Concluded).Build(ctx).ToSlice()
    94  }
    95  
// NewLicenseWithContext creates a Declared license from the given value by delegating to
// NewLicenseFromTypeWithContext.
func NewLicenseWithContext(ctx context.Context, value string) License {
	return NewLicenseFromTypeWithContext(ctx, value, license.Declared)
}
    99  
   100  func NewLicenseFromTypeWithContext(ctx context.Context, value string, t license.Type) License {
   101  	lics := newLicenseBuilder().WithValues(value).WithType(t).Build(ctx).ToSlice()
   102  	if len(lics) > 0 {
   103  		return lics[0]
   104  	}
   105  	// TODO: this is not ideal, but also not expected given the input of "value"
   106  	return License{}
   107  }
   108  
   109  func NewLicensesFromValuesWithContext(ctx context.Context, values ...string) []License {
   110  	return newLicenseBuilder().WithValues(values...).Build(ctx).ToSlice()
   111  }
   112  
   113  func NewLicensesFromLocationWithContext(ctx context.Context, location file.Location, values ...string) []License {
   114  	return newLicenseBuilder().WithValues(values...).WithLocations(location).Build(ctx).ToSlice()
   115  }
   116  
   117  func NewLicenseFromLocationsWithContext(ctx context.Context, value string, locations ...file.Location) License {
   118  	lics := newLicenseBuilder().WithValues(value).WithLocations(locations...).Build(ctx).ToSlice()
   119  	if len(lics) > 0 {
   120  		return lics[0]
   121  	}
   122  	// TODO: this is not ideal, but also not expected given the input of "value"
   123  	return License{}
   124  }
   125  
   126  func NewLicenseFromURLsWithContext(ctx context.Context, value string, urls ...string) License {
   127  	lics := newLicenseBuilder().WithValues(value).WithURLs(urls...).Build(ctx).ToSlice()
   128  	if len(lics) > 0 {
   129  		return lics[0]
   130  	}
   131  	// TODO: this is not ideal, but also not expected given the input of "value"
   132  	return License{}
   133  }
   134  
   135  func stripUnwantedCharacters(rawURL string) (string, error) {
   136  	cleanedURL := strings.TrimSpace(rawURL)
   137  	_, err := url.ParseRequestURI(cleanedURL)
   138  	if err != nil {
   139  		return "", fmt.Errorf("invalid URL: %w", err)
   140  	}
   141  
   142  	return cleanedURL, nil
   143  }
   144  
   145  func NewLicenseFromFieldsWithContext(ctx context.Context, value, url string, location *file.Location) License {
   146  	// If value is empty but URL is provided, try to enrich from SPDX database
   147  	if value == "" && url != "" {
   148  		if info, found := spdxlicense.LicenseByURL(url); found {
   149  			value = info.ID
   150  		}
   151  	}
   152  
   153  	lics := newLicenseBuilder().WithValues(value).WithURLs(url).WithOptionalLocation(location).Build(ctx).ToSlice()
   154  	if len(lics) > 0 {
   155  		return lics[0]
   156  	}
   157  	// TODO: this is not ideal, but also not expected given the input of "value"
   158  	return License{}
   159  }
   160  
   161  func (s License) Empty() bool {
   162  	return s.Value == "" && s.SPDXExpression == "" && s.Contents == "" && len(s.URLs) == 0
   163  }
   164  
   165  // Merge two licenses into a new license object. If the merge is not possible due to unmergeable fields
   166  // (e.g. different values for Value, SPDXExpression, Type, or any non-collection type) an error is returned.
   167  // TODO: this is a bit of a hack to not infinitely recurse when hashing a license
   168  func (s License) Merge(l License) (*License, error) {
   169  	sHash, err := artifact.IDByHash(s)
   170  	if err != nil {
   171  		return nil, err
   172  	}
   173  	lHash, err := artifact.IDByHash(l)
   174  	if err != nil {
   175  		return nil, err
   176  	}
   177  	if sHash != lHash {
   178  		return nil, fmt.Errorf("cannot merge licenses with different hash")
   179  	}
   180  
   181  	// try to keep s.URLs unallocated unless necessary (which is the default state from the constructor)
   182  	if len(l.URLs) > 0 {
   183  		s.URLs = append(s.URLs, l.URLs...)
   184  	}
   185  
   186  	if len(s.URLs) > 0 {
   187  		s.URLs = strset.New(s.URLs...).List()
   188  		sort.Strings(s.URLs)
   189  	}
   190  
   191  	if l.Locations.Empty() {
   192  		return &s, nil
   193  	}
   194  
   195  	// since the set instance has a reference type (map), we must make a new instance
   196  	locations := file.NewLocationSet(s.Locations.ToSlice()...)
   197  	locations.Add(l.Locations.ToSlice()...)
   198  	s.Locations = locations
   199  
   200  	return &s, nil
   201  }
   202  
// licenseBuilder is an internal builder for constructing License objects with validation and normalization.
// Inputs are collected through the With* methods and turned into a LicenseSet by Build.
type licenseBuilder struct {
	// values are raw license strings or SPDX expressions to process (empty strings are never stored).
	values []string
	// contents are readers for full license text content (nil readers are never stored).
	contents []io.ReadCloser
	// locations are file locations where license information was discovered.
	locations []file.Location
	// urls are web URLs where license information can be found, kept sorted by WithURLs.
	urls []string
	// tp is the license type classification (declared, concluded, etc.); defaults to Declared.
	tp license.Type
}
   216  
   217  func newLicenseBuilder() *licenseBuilder {
   218  	return &licenseBuilder{
   219  		tp: license.Declared,
   220  	}
   221  }
   222  
   223  func (b *licenseBuilder) WithValues(expr ...string) *licenseBuilder {
   224  	for _, v := range expr {
   225  		if v == "" {
   226  			continue
   227  		}
   228  		b.values = append(b.values, v)
   229  	}
   230  	return b
   231  }
   232  
   233  func (b *licenseBuilder) WithOptionalLocation(location *file.Location) *licenseBuilder {
   234  	if location != nil {
   235  		b.locations = append(b.locations, *location)
   236  	}
   237  	return b
   238  }
   239  
   240  func (b *licenseBuilder) WithURLs(urls ...string) *licenseBuilder {
   241  	s := strset.New()
   242  	for _, u := range urls {
   243  		if u != "" {
   244  			sanitizedURL, err := stripUnwantedCharacters(u)
   245  			if err != nil {
   246  				log.Tracef("unable to sanitize url=%q: %s", u, err)
   247  				continue
   248  			}
   249  			s.Add(sanitizedURL)
   250  		}
   251  	}
   252  
   253  	b.urls = append(b.urls, s.List()...)
   254  	sort.Strings(b.urls)
   255  	return b
   256  }
   257  
   258  func (b *licenseBuilder) WithLocations(locations ...file.Location) *licenseBuilder {
   259  	for _, loc := range locations {
   260  		if loc.Path() != "" {
   261  			b.locations = append(b.locations, loc)
   262  		}
   263  	}
   264  	return b
   265  }
   266  
   267  func (b *licenseBuilder) WithContents(contents ...io.ReadCloser) *licenseBuilder {
   268  	for _, content := range contents {
   269  		if content != nil {
   270  			b.contents = append(b.contents, content)
   271  		}
   272  	}
   273  	return b
   274  }
   275  
// WithType sets the license type given to the licenses this builder creates and returns the
// builder for chaining. A builder holds exactly one type, so the most recent call wins.
func (b *licenseBuilder) WithType(t license.Type) *licenseBuilder {
	b.tp = t // last one wins, multiple is not valid
	return b
}
   280  
   281  func (b *licenseBuilder) Build(ctx context.Context) LicenseSet {
   282  	// for every value make a license with all locations
   283  	// or for every reader make a license with all locations
   284  	// if given a reader and a value, this is invalid
   285  
   286  	locations := file.NewLocationSet(b.locations...)
   287  
   288  	set := NewLicenseSet()
   289  	for _, v := range b.values {
   290  		if strings.Contains(v, "\n") {
   291  			var loc file.Location
   292  			if len(b.locations) > 0 {
   293  				loc = b.locations[0]
   294  			}
   295  			b.contents = append(b.contents, file.NewLocationReadCloser(loc, io.NopCloser(strings.NewReader(v))))
   296  			continue
   297  		}
   298  
   299  		// we want to check if the SPDX field should be set
   300  		var expression string
   301  		if ex, err := license.ParseExpression(v); err == nil {
   302  			expression = ex
   303  		}
   304  
   305  		set.Add(License{
   306  			SPDXExpression: expression,
   307  			Value:          strings.TrimSpace(v),
   308  			Type:           b.tp,
   309  			URLs:           b.urls,
   310  			Locations:      locations,
   311  		})
   312  	}
   313  
   314  	// we have some readers (with no values); let's try to turn into licenses if we can
   315  	for _, content := range b.contents {
   316  		set.Add(b.buildFromContents(ctx, content)...)
   317  	}
   318  
   319  	if set.Empty() && len(b.urls) > 0 {
   320  		// if we have no values or contents, but we do have URLs, let's make a license with the URLs
   321  		// try to enrich the license by looking up the URL in the SPDX database
   322  		license := License{
   323  			Type:      b.tp,
   324  			URLs:      b.urls,
   325  			Locations: locations,
   326  		}
   327  
   328  		// attempt to fill in missing license information from the first URL
   329  		if len(b.urls) > 0 {
   330  			if info, found := spdxlicense.LicenseByURL(b.urls[0]); found {
   331  				license.Value = info.ID
   332  				license.SPDXExpression = info.ID
   333  			}
   334  		}
   335  
   336  		set.Add(license)
   337  	}
   338  
   339  	return set
   340  }
   341  
   342  func (b *licenseBuilder) buildFromContents(ctx context.Context, contents io.ReadCloser) []License {
   343  	if !licenses.IsContextLicenseScannerSet(ctx) {
   344  		// we do not have a scanner; we don't want to create one; we sha256 the content and populate the value
   345  		internal, err := contentFromReader(contents)
   346  		if err != nil {
   347  			log.WithFields("error", err).Trace("could not read content")
   348  			return nil
   349  		}
   350  		return []License{b.licenseFromContentHash(internal)}
   351  	}
   352  
   353  	scanner, err := licenses.ContextLicenseScanner(ctx)
   354  	if err != nil {
   355  		log.WithFields("error", err).Trace("could not find license scanner")
   356  		internal, err := contentFromReader(contents)
   357  		if err != nil {
   358  			log.WithFields("error", err).Trace("could not read content")
   359  			return nil
   360  		}
   361  		return []License{b.licenseFromContentHash(internal)}
   362  	}
   363  
   364  	evidence, content, err := scanner.FindEvidence(ctx, contents)
   365  	if err != nil {
   366  		log.WithFields("error", err).Trace("scanner failed to scan contents")
   367  		return nil
   368  	}
   369  
   370  	if len(evidence) > 0 {
   371  		// we have some ID and offsets to apply to our content; let's make some detailed licenses
   372  		return b.licensesFromEvidenceAndContent(evidence, content)
   373  	}
   374  	// scanner couldn't find anything, but we still have the file contents; sha256 and send it back with value
   375  	return []License{b.licenseFromContentHash(string(content))}
   376  }
   377  
   378  func (b *licenseBuilder) licensesFromEvidenceAndContent(evidence []licenses.Evidence, content []byte) []License {
   379  	ls := make([]License, 0)
   380  	for _, e := range evidence {
   381  		// basic license
   382  		candidate := License{
   383  			Value:     e.ID,
   384  			Locations: file.NewLocationSet(b.locations...),
   385  			Type:      b.tp,
   386  		}
   387  		// get content offset
   388  		if e.Start >= 0 && e.End <= len(content) && e.Start <= e.End {
   389  			candidate.Contents = string(content[e.Start:e.End])
   390  		}
   391  		// check for SPDX Validity
   392  		if ex, err := license.ParseExpression(e.ID); err == nil {
   393  			candidate.SPDXExpression = ex
   394  		}
   395  
   396  		ls = append(ls, candidate)
   397  	}
   398  	return ls
   399  }
   400  
   401  func (b *licenseBuilder) licenseFromContentHash(content string) License {
   402  	hash := sha256HexFromString(content)
   403  	value := "sha256:" + hash
   404  
   405  	return License{
   406  		Value:     value,
   407  		Contents:  content,
   408  		Type:      b.tp,
   409  		Locations: file.NewLocationSet(b.locations...),
   410  	}
   411  }
   412  
   413  func contentFromReader(r io.Reader) (string, error) {
   414  	bytes, err := io.ReadAll(r)
   415  	if err != nil {
   416  		return "", err
   417  	}
   418  	return strings.TrimSpace(string(bytes)), nil
   419  }
   420  
   421  func sha256HexFromString(s string) string {
   422  	hash := sha256.Sum256([]byte(s))
   423  	return hex.EncodeToString(hash[:])
   424  }