github.com/lineaje-labs/syft@v0.98.1-0.20231227153149-9e393f60ff1b/syft/pkg/cataloger/catalog.go

package cataloger

import (
	"fmt"
	"math"
	"runtime/debug"
	"sync"

	"github.com/hashicorp/go-multierror"
	"github.com/wagoodman/go-partybus"
	"github.com/wagoodman/go-progress"

	"github.com/anchore/syft/syft/artifact"
	"github.com/anchore/syft/syft/event"
	"github.com/anchore/syft/syft/file"
	"github.com/anchore/syft/syft/linux"
	"github.com/anchore/syft/syft/pkg"
	"github.com/anchore/syft/syft/pkg/cataloger/common/cpe"
	"github.com/lineaje-labs/syft/internal/bus"
	"github.com/lineaje-labs/syft/internal/log"
)

// Monitor provides progress-related data for observing the progress of a Catalog() call (published on the event bus).
type Monitor struct {
	FilesProcessed     progress.Monitorable // the number of files selected and contents analyzed from all registered catalogers
	PackagesDiscovered progress.Monitorable // the number of packages discovered from all registered catalogers
}

// catalogResult provides the result of running a single cataloger against a source
type catalogResult struct {
	Packages      []pkg.Package
	Relationships []artifact.Relationship
	// Discovered may sometimes be more than len(Packages)
	Discovered int64
	Error      error
}

// newMonitor creates a new Monitor object and publishes the object on the bus as a PackageCatalogerStarted event.
func newMonitor() (*progress.Manual, *progress.Manual) {
	filesProcessed := progress.Manual{}
	packagesDiscovered := progress.Manual{}

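	// the concrete *progress.Manual values are returned to the caller (which updates them), while the bus event
	// exposes the same values read-only to subscribers via the progress.Monitorable interface.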
	bus.Publish(partybus.Event{
		Type: event.PackageCatalogerStarted,
		Value: Monitor{
			FilesProcessed:     progress.Monitorable(&filesProcessed),
			PackagesDiscovered: progress.Monitorable(&packagesDiscovered),
		},
	})
	return &filesProcessed, &packagesDiscovered
}

func runCataloger(cataloger pkg.Cataloger, resolver file.Resolver) (catalogerResult *catalogResult, err error) {
	// handle individual cataloger panics
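	// a recovered panic is surfaced through the named error return, so a single misbehaving cataloger
	// cannot crash the entire run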
	defer func() {
		if e := recover(); e != nil {
			err = fmt.Errorf("%v at:\n%s", e, string(debug.Stack()))
		}
	}()

	catalogerResult = new(catalogResult)

	// find packages from the underlying raw data
	log.WithFields("cataloger", cataloger.Name()).Trace("cataloging started")
	packages, relationships, err := cataloger.Catalog(resolver)
	if err != nil {
		log.WithFields("cataloger", cataloger.Name(), "error", err).Warn("error while cataloging")
		return catalogerResult, err
	}

	catalogedPackages := len(packages)

	log.WithFields("cataloger", cataloger.Name()).Debugf("discovered %d packages", catalogedPackages)
	catalogerResult.Discovered = int64(catalogedPackages)

	for _, p := range packages {
		// generate CPEs (note: this is excluded from package ID, so is safe to mutate)
		// the binary cataloger may have already attached CPEs to the package, so append rather than overwrite

		dictionaryCPE, ok := cpe.DictionaryFind(p)
		if ok {
			log.Debugf("used CPE dictionary to find CPE for %s package %q: %s", p.Type, p.Name, dictionaryCPE.BindToFmtString())
			p.CPEs = append(p.CPEs, dictionaryCPE)
		} else {
			p.CPEs = append(p.CPEs, cpe.Generate(p)...)
		}

		// if we were not able to identify the language we have an opportunity
		// to try to get this value from the PURL. Worst case we assert that
		// we could not identify the language at either stage and set UnknownLanguage.
		if p.Language == "" {
			p.Language = pkg.LanguageFromPURL(p.PURL)
		}

		// create file-to-package relationships for files owned by the package
		owningRelationships, err := packageFileOwnershipRelationships(p, resolver)
		if err != nil {
			log.WithFields("cataloger", cataloger.Name(), "package", p.Name, "error", err).Warn("unable to create any package-file relationships")
		} else {
			catalogerResult.Relationships = append(catalogerResult.Relationships, owningRelationships...)
		}
		catalogerResult.Packages = append(catalogerResult.Packages, p)
	}
	catalogerResult.Relationships = append(catalogerResult.Relationships, relationships...)
	log.WithFields("cataloger", cataloger.Name()).Trace("cataloging complete")
	return catalogerResult, err
}

// Catalog a given source (container image or filesystem) with the given catalogers, returning all discovered packages.
// In order to efficiently retrieve contents from an underlying container image, the content fetch requests are
// done in bulk. Specifically, all files of interest are collected from each cataloger and accumulated into a single
// request.
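//
// An illustrative call (a sketch only; the resolver, release, and catalogers are assumed to come from the caller's
// source configuration):
//
//	collection, relationships, err := cataloger.Catalog(resolver, release, 4, catalogers...)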
//
//nolint:funlen
func Catalog(
	resolver file.Resolver, _ *linux.Release, parallelism int, catalogers ...pkg.Cataloger,
) (*pkg.Collection, []artifact.Relationship, error) {
	catalog := pkg.NewCollection()
	var allRelationships []artifact.Relationship

	filesProcessed, packagesDiscovered := newMonitor()
	defer filesProcessed.SetCompleted()
	defer packagesDiscovered.SetCompleted()

	// perform analysis, accumulating errors for each failed analysis
	var errs error

	nCatalogers := len(catalogers)

	// we do not need more parallelism than there are `catalogers`.
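	// (and always use at least one worker, even if a non-positive parallelism value was requested)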
	parallelism = int(math.Min(float64(nCatalogers), math.Max(1.0, float64(parallelism))))
	log.WithFields("parallelism", parallelism, "catalogers", nCatalogers).Debug("cataloging packages")

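	// all channels are buffered to the number of catalogers so that workers can always send results and package
	// counts without blocking, even though the results channel is only drained after waitGroup.Wait() below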
	jobs := make(chan pkg.Cataloger, nCatalogers)
	results := make(chan *catalogResult, nCatalogers)
	discoveredPackages := make(chan int64, nCatalogers)

	waitGroup := sync.WaitGroup{}

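	// start a fixed pool of workers; each pulls catalogers off the jobs channel until it is closed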
	for i := 0; i < parallelism; i++ {
		waitGroup.Add(1)

		go func() {
			defer waitGroup.Done()

			// wait for / get the next cataloger job available.
			for cataloger := range jobs {
				result, err := runCataloger(cataloger, resolver)

				// ensure we set the error to be aggregated
				result.Error = err

				discoveredPackages <- result.Discovered

				results <- result
			}
		}()
	}

	// dynamically show updated discovered package status
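	// (this goroutine exits once discoveredPackages is closed after all workers have finished)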
	go func() {
		for discovered := range discoveredPackages {
			packagesDiscovered.Add(discovered)
		}
	}()

	// Enqueue the jobs
	for _, cataloger := range catalogers {
		jobs <- cataloger
	}
	close(jobs)

	// Wait for the jobs to finish
	waitGroup.Wait()
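	// every worker has returned, so no further sends can occur and both channels can be closed safely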
	close(results)
	close(discoveredPackages)

	// collect the results
	for result := range results {
		if result.Error != nil {
			errs = multierror.Append(errs, result.Error)
		}
		for _, p := range result.Packages {
			catalog.Add(p)
		}
		allRelationships = append(allRelationships, result.Relationships...)
	}

	allRelationships = append(allRelationships, pkg.NewRelationships(catalog)...)

	return catalog, allRelationships, errs
}

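// packageFileOwnershipRelationships returns a "contains" relationship from the given package to every file
// location that the package claims ownership of (i.e. when the package metadata implements pkg.FileOwner).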
func packageFileOwnershipRelationships(p pkg.Package, resolver file.PathResolver) ([]artifact.Relationship, error) {
	fileOwner, ok := p.Metadata.(pkg.FileOwner)
	if !ok {
		return nil, nil
	}

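	// deduplicate owned file locations by coordinate ID so that each file yields at most one relationship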
	locations := map[artifact.ID]file.Location{}

	for _, path := range fileOwner.OwnedFiles() {
		pathRefs, err := resolver.FilesByPath(path)
		if err != nil {
			return nil, fmt.Errorf("unable to find path for path=%q: %w", path, err)
		}

		if len(pathRefs) == 0 {
			// ideally we want to warn users about missing files from a package, however, it is very common for
			// container image authors to delete files that are not needed in order to keep image sizes small. Adding
			// a warning here would be needlessly noisy (even for popular base images).
			continue
		}

		for _, ref := range pathRefs {
			if oldRef, ok := locations[ref.Coordinates.ID()]; ok {
				log.Debugf("found path duplicate of %s", oldRef.RealPath)
			}
			locations[ref.Coordinates.ID()] = ref
		}
	}

	var relationships []artifact.Relationship
	for _, location := range locations {
		relationships = append(relationships, artifact.Relationship{
			From: p,
			To:   location.Coordinates,
			Type: artifact.ContainsRelationship,
		})
	}
	return relationships, nil
}