github.com/noqcks/syft@v0.0.0-20230920222752-a9e2c4e288e5/syft/pkg/cataloger/catalog.go (about)

     1  package cataloger
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  	"runtime/debug"
     7  	"sync"
     8  
     9  	"github.com/hashicorp/go-multierror"
    10  	"github.com/wagoodman/go-partybus"
    11  	"github.com/wagoodman/go-progress"
    12  
    13  	"github.com/anchore/syft/internal/bus"
    14  	"github.com/anchore/syft/internal/log"
    15  	"github.com/anchore/syft/syft/artifact"
    16  	"github.com/anchore/syft/syft/event"
    17  	"github.com/anchore/syft/syft/file"
    18  	"github.com/anchore/syft/syft/linux"
    19  	"github.com/anchore/syft/syft/pkg"
    20  	"github.com/anchore/syft/syft/pkg/cataloger/common/cpe"
    21  )
    22  
// Monitor provides progress-related data for observing the progress of a Catalog() call (published on the event bus).
// Both fields are read-only views; the writable counterparts (*progress.Manual) are held by the Catalog() call itself
// (see newMonitor).
type Monitor struct {
	FilesProcessed     progress.Monitorable // the number of files selected and contents analyzed from all registered catalogers
	PackagesDiscovered progress.Monitorable // the number of packages discovered from all registered catalogers
}
    28  
// catalogResult provides the result of running a single cataloger against source
type catalogResult struct {
	Packages      []pkg.Package           // packages found by the cataloger (after CPE and language enrichment)
	Relationships []artifact.Relationship // cataloger-reported relationships plus derived package-file ownership relationships
	// Discovered may sometimes be more than len(packages)
	Discovered int64
	Error      error // any error (including a recovered panic) raised while this cataloger ran
}
    37  
    38  // newMonitor creates a new Monitor object and publishes the object on the bus as a PackageCatalogerStarted event.
    39  func newMonitor() (*progress.Manual, *progress.Manual) {
    40  	filesProcessed := progress.Manual{}
    41  	packagesDiscovered := progress.Manual{}
    42  
    43  	bus.Publish(partybus.Event{
    44  		Type: event.PackageCatalogerStarted,
    45  		Value: Monitor{
    46  			FilesProcessed:     progress.Monitorable(&filesProcessed),
    47  			PackagesDiscovered: progress.Monitorable(&packagesDiscovered),
    48  		},
    49  	})
    50  	return &filesProcessed, &packagesDiscovered
    51  }
    52  
    53  func runCataloger(cataloger pkg.Cataloger, resolver file.Resolver) (catalogerResult *catalogResult, err error) {
    54  	// handle individual cataloger panics
    55  	defer func() {
    56  		if e := recover(); e != nil {
    57  			err = fmt.Errorf("%v at:\n%s", e, string(debug.Stack()))
    58  		}
    59  	}()
    60  
    61  	catalogerResult = new(catalogResult)
    62  
    63  	// find packages from the underlying raw data
    64  	log.WithFields("cataloger", cataloger.Name()).Trace("cataloging started")
    65  	packages, relationships, err := cataloger.Catalog(resolver)
    66  	if err != nil {
    67  		log.WithFields("cataloger", cataloger.Name()).Warn("error while cataloging")
    68  		return catalogerResult, err
    69  	}
    70  
    71  	catalogedPackages := len(packages)
    72  
    73  	log.WithFields("cataloger", cataloger.Name()).Debugf("discovered %d packages", catalogedPackages)
    74  	catalogerResult.Discovered = int64(catalogedPackages)
    75  
    76  	for _, p := range packages {
    77  		// generate CPEs (note: this is excluded from package ID, so is safe to mutate)
    78  		// we might have binary classified CPE already with the package so we want to append here
    79  
    80  		dictionaryCPE, ok := cpe.DictionaryFind(p)
    81  		if ok {
    82  			log.Debugf("used CPE dictionary to find CPE for %s package %q: %s", p.Type, p.Name, dictionaryCPE.BindToFmtString())
    83  			p.CPEs = append(p.CPEs, dictionaryCPE)
    84  		} else {
    85  			p.CPEs = append(p.CPEs, cpe.Generate(p)...)
    86  		}
    87  
    88  		// if we were not able to identify the language we have an opportunity
    89  		// to try and get this value from the PURL. Worst case we assert that
    90  		// we could not identify the language at either stage and set UnknownLanguage
    91  		if p.Language == "" {
    92  			p.Language = pkg.LanguageFromPURL(p.PURL)
    93  		}
    94  
    95  		// create file-to-package relationships for files owned by the package
    96  		owningRelationships, err := packageFileOwnershipRelationships(p, resolver)
    97  		if err != nil {
    98  			log.WithFields("cataloger", cataloger.Name(), "package", p.Name, "error", err).Warnf("unable to create any package-file relationships")
    99  		} else {
   100  			catalogerResult.Relationships = append(catalogerResult.Relationships, owningRelationships...)
   101  		}
   102  		catalogerResult.Packages = append(catalogerResult.Packages, p)
   103  	}
   104  	catalogerResult.Relationships = append(catalogerResult.Relationships, relationships...)
   105  	log.WithFields("cataloger", cataloger.Name()).Trace("cataloging complete")
   106  	return catalogerResult, err
   107  }
   108  
// Catalog a given source (container image or filesystem) with the given catalogers, returning all discovered packages.
// In order to efficiently retrieve contents from a underlying container image the content fetch requests are
// done in bulk. Specifically, all files of interest are collected from each catalogers and accumulated into a single
// request.
//
// Catalogers are run concurrently by a fixed worker pool (bounded by parallelism, clamped to [1, len(catalogers)]);
// individual cataloger errors (including recovered panics) are aggregated into the returned multierror rather than
// aborting the run.
//
//nolint:funlen
func Catalog(resolver file.Resolver, _ *linux.Release, parallelism int, catalogers ...pkg.Cataloger) (*pkg.Collection, []artifact.Relationship, error) {
	catalog := pkg.NewCollection()
	var allRelationships []artifact.Relationship

	filesProcessed, packagesDiscovered := newMonitor()
	defer filesProcessed.SetCompleted()
	defer packagesDiscovered.SetCompleted()

	// perform analysis, accumulating errors for each failed analysis
	var errs error

	nCatalogers := len(catalogers)

	// we do not need more parallelism than there are `catalogers`.
	parallelism = int(math.Min(float64(nCatalogers), math.Max(1.0, float64(parallelism))))
	log.WithFields("parallelism", parallelism, "catalogers", nCatalogers).Debug("cataloging packages")

	// all three channels are buffered to nCatalogers so workers never block on send, which lets
	// waitGroup.Wait() below complete before the results are drained
	jobs := make(chan pkg.Cataloger, nCatalogers)
	results := make(chan *catalogResult, nCatalogers)
	discoveredPackages := make(chan int64, nCatalogers)

	waitGroup := sync.WaitGroup{}

	for i := 0; i < parallelism; i++ {
		waitGroup.Add(1)

		go func() {
			defer waitGroup.Done()

			// wait for / get the next cataloger job available.
			for cataloger := range jobs {
				result, err := runCataloger(cataloger, resolver)

				// ensure we set the error to be aggregated
				// (runCataloger always returns a non-nil result, even on panic)
				result.Error = err

				discoveredPackages <- result.Discovered

				results <- result
			}
		}()
	}

	// dynamically show updated discovered package status
	// (exits when discoveredPackages is closed below)
	go func() {
		for discovered := range discoveredPackages {
			packagesDiscovered.Add(discovered)
		}
	}()

	// Enqueue the jobs
	for _, cataloger := range catalogers {
		jobs <- cataloger
	}
	close(jobs)

	// Wait for the jobs to finish
	waitGroup.Wait()
	close(results)
	close(discoveredPackages)

	// collect the results
	for result := range results {
		if result.Error != nil {
			errs = multierror.Append(errs, result.Error)
		}
		for _, p := range result.Packages {
			catalog.Add(p)
		}
		allRelationships = append(allRelationships, result.Relationships...)
	}

	// derive cross-package relationships (e.g. ownership-by-file-overlap) from the assembled catalog
	allRelationships = append(allRelationships, pkg.NewRelationships(catalog)...)

	return catalog, allRelationships, errs
}
   191  
   192  func packageFileOwnershipRelationships(p pkg.Package, resolver file.PathResolver) ([]artifact.Relationship, error) {
   193  	fileOwner, ok := p.Metadata.(pkg.FileOwner)
   194  	if !ok {
   195  		return nil, nil
   196  	}
   197  
   198  	locations := map[artifact.ID]file.Location{}
   199  
   200  	for _, path := range fileOwner.OwnedFiles() {
   201  		pathRefs, err := resolver.FilesByPath(path)
   202  		if err != nil {
   203  			return nil, fmt.Errorf("unable to find path for path=%q: %w", path, err)
   204  		}
   205  
   206  		if len(pathRefs) == 0 {
   207  			// ideally we want to warn users about missing files from a package, however, it is very common for
   208  			// container image authors to delete files that are not needed in order to keep image sizes small. Adding
   209  			// a warning here would be needlessly noisy (even for popular base images).
   210  			continue
   211  		}
   212  
   213  		for _, ref := range pathRefs {
   214  			if oldRef, ok := locations[ref.Coordinates.ID()]; ok {
   215  				log.Debugf("found path duplicate of %s", oldRef.RealPath)
   216  			}
   217  			locations[ref.Coordinates.ID()] = ref
   218  		}
   219  	}
   220  
   221  	var relationships []artifact.Relationship
   222  	for _, location := range locations {
   223  		relationships = append(relationships, artifact.Relationship{
   224  			From: p,
   225  			To:   location.Coordinates,
   226  			Type: artifact.ContainsRelationship,
   227  		})
   228  	}
   229  	return relationships, nil
   230  }