github.com/nextlinux/gosbom@v0.81.1-0.20230627115839-1ff50c281391/gosbom/pkg/cataloger/catalog.go

package cataloger

import (
	"fmt"
	"math"
	"runtime/debug"
	"sync"

	"github.com/hashicorp/go-multierror"
	"github.com/nextlinux/gosbom/gosbom/artifact"
	"github.com/nextlinux/gosbom/gosbom/event"
	"github.com/nextlinux/gosbom/gosbom/file"
	"github.com/nextlinux/gosbom/gosbom/linux"
	"github.com/nextlinux/gosbom/gosbom/pkg"
	"github.com/nextlinux/gosbom/gosbom/pkg/cataloger/common/cpe"
	"github.com/nextlinux/gosbom/internal/bus"
	"github.com/nextlinux/gosbom/internal/log"
	"github.com/wagoodman/go-partybus"
	"github.com/wagoodman/go-progress"
)

// Monitor provides progress-related data for observing the progress of a Catalog() call (published on the event bus).
type Monitor struct {
	FilesProcessed     progress.Monitorable // the number of files selected and contents analyzed from all registered catalogers
	PackagesDiscovered progress.Monitorable // the number of packages discovered from all registered catalogers
}

// catalogResult provides the result of running a single cataloger against a source.
type catalogResult struct {
	Packages      []pkg.Package
	Relationships []artifact.Relationship
	// Discovered may sometimes be more than len(Packages)
	Discovered int64
	Error      error
}

// newMonitor creates a new Monitor object and publishes the object on the bus as a PackageCatalogerStarted event.
func newMonitor() (*progress.Manual, *progress.Manual) {
	filesProcessed := progress.Manual{}
	packagesDiscovered := progress.Manual{}

	bus.Publish(partybus.Event{
		Type: event.PackageCatalogerStarted,
		Value: Monitor{
			FilesProcessed:     progress.Monitorable(&filesProcessed),
			PackagesDiscovered: progress.Monitorable(&packagesDiscovered),
		},
	})
	return &filesProcessed, &packagesDiscovered
}
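
// The sketch below is illustrative only (not part of this file's API): it shows how a hypothetical bus
// subscriber elsewhere in the codebase might pick the Monitor back off of a PackageCatalogerStarted event.
// Only the partybus.Event fields used above (Type and Value) are assumed; the function name is invented.
func handleCatalogerStarted(e partybus.Event) (*Monitor, error) {
	if e.Type != event.PackageCatalogerStarted {
		return nil, fmt.Errorf("unexpected event type: %v", e.Type)
	}
	monitor, ok := e.Value.(Monitor)
	if !ok {
		return nil, fmt.Errorf("unexpected event value type: %T", e.Value)
	}
	// the caller can now observe monitor.FilesProcessed and monitor.PackagesDiscovered for progress updates
	return &monitor, nil
}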

func runCataloger(cataloger pkg.Cataloger, resolver file.Resolver) (catalogerResult *catalogResult, err error) {
	// handle individual cataloger panics by converting them into an error on the named return value
	defer func() {
		if e := recover(); e != nil {
			err = fmt.Errorf("%v at:\n%s", e, string(debug.Stack()))
		}
	}()

	// allocate the result up front so the caller always receives a non-nil (possibly partial) result, even on panic
	catalogerResult = new(catalogResult)

	// find packages from the underlying raw data
	log.WithFields("cataloger", cataloger.Name()).Trace("cataloging started")
	packages, relationships, err := cataloger.Catalog(resolver)
	if err != nil {
		log.WithFields("cataloger", cataloger.Name()).Warn("error while cataloging")
		return catalogerResult, err
	}

	catalogedPackages := len(packages)

	log.WithFields("cataloger", cataloger.Name()).Debugf("discovered %d packages", catalogedPackages)
	catalogerResult.Discovered = int64(catalogedPackages)

	for _, p := range packages {
		// generate CPEs (note: CPEs are excluded from the package ID, so they are safe to mutate).
		// the package might already have binary-classified CPEs attached, so we append here rather than overwrite.
		p.CPEs = append(p.CPEs, cpe.Generate(p)...)

		// if we were not able to identify the language, we have an opportunity
		// to try to get this value from the PURL. Worst case, we assert that
		// we could not identify the language at either stage and set UnknownLanguage.
		if p.Language == "" {
			p.Language = pkg.LanguageFromPURL(p.PURL)
		}

		// create file-to-package relationships for files owned by the package
		owningRelationships, err := packageFileOwnershipRelationships(p, resolver)
		if err != nil {
			log.WithFields("cataloger", cataloger.Name(), "package", p.Name, "error", err).Warn("unable to create any package-file relationships")
		} else {
			catalogerResult.Relationships = append(catalogerResult.Relationships, owningRelationships...)
		}
		catalogerResult.Packages = append(catalogerResult.Packages, p)
	}
	catalogerResult.Relationships = append(catalogerResult.Relationships, relationships...)
	log.WithFields("cataloger", cataloger.Name()).Trace("cataloging complete")
	return catalogerResult, err
}
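
// The following is a hypothetical illustration (not part of this package's real catalogers), assuming
// pkg.Cataloger requires only the Name and Catalog methods used above: a cataloger whose Catalog method
// panics. Passed through runCataloger, the panic is recovered and surfaces as a returned error
// (including a stack trace) instead of crashing the whole run.
type panickingCataloger struct{}

func (p panickingCataloger) Name() string { return "panicking-cataloger" }

func (p panickingCataloger) Catalog(_ file.Resolver) ([]pkg.Package, []artifact.Relationship, error) {
	panic("boom")
}

// usage sketch:
//
//	result, err := runCataloger(panickingCataloger{}, resolver)
//	// err is non-nil and contains "boom" plus the stack trace; result is non-nil (allocated before the panic) but empty.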

// Catalog a given source (container image or filesystem) with the given catalogers, returning all discovered packages.
// In order to efficiently retrieve contents from an underlying container image, the content fetch requests are
// done in bulk. Specifically, all files of interest are collected from each cataloger and accumulated into a single
// request.
//
//nolint:funlen
func Catalog(resolver file.Resolver, _ *linux.Release, parallelism int, catalogers ...pkg.Cataloger) (*pkg.Collection, []artifact.Relationship, error) {
	catalog := pkg.NewCollection()
	var allRelationships []artifact.Relationship

	filesProcessed, packagesDiscovered := newMonitor()
	defer filesProcessed.SetCompleted()
	defer packagesDiscovered.SetCompleted()

	// perform analysis, accumulating errors for each failed analysis
	var errs error

	nCatalogers := len(catalogers)

	// we do not need more parallelism than there are `catalogers`.
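	// for example (illustrative values only): with 5 catalogers, a requested parallelism of 0 or -1 is raised to 1,
	// 3 is kept as 3, and 100 is capped at 5.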
	parallelism = int(math.Min(float64(nCatalogers), math.Max(1.0, float64(parallelism))))
	log.WithFields("parallelism", parallelism, "catalogers", nCatalogers).Debug("cataloging packages")

	jobs := make(chan pkg.Cataloger, nCatalogers)
	results := make(chan *catalogResult, nCatalogers)
	discoveredPackages := make(chan int64, nCatalogers)
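	// note: all three channels are buffered to nCatalogers so that sends never block: all jobs can be
	// enqueued before close(jobs), and workers can publish every result while the main goroutine is
	// still blocked in waitGroup.Wait() below (results are only drained after all workers finish).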

	waitGroup := sync.WaitGroup{}

	for i := 0; i < parallelism; i++ {
		waitGroup.Add(1)

		go func() {
			defer waitGroup.Done()

			// wait for / get the next available cataloger job.
			for cataloger := range jobs {
				result, err := runCataloger(cataloger, resolver)

				// attach the error so it can be aggregated with the other results
				result.Error = err

				discoveredPackages <- result.Discovered

				results <- result
			}
		}()
	}

	// stream discovered-package counts to the progress monitor as workers report them
	go func() {
		for discovered := range discoveredPackages {
			packagesDiscovered.Add(discovered)
		}
	}()

	// enqueue the jobs
	for _, cataloger := range catalogers {
		jobs <- cataloger
	}
	close(jobs)

	// wait for the jobs to finish
	waitGroup.Wait()
	close(results)
	close(discoveredPackages)

	// collect the results
	for result := range results {
		if result.Error != nil {
			errs = multierror.Append(errs, result.Error)
		}
		for _, p := range result.Packages {
			catalog.Add(p)
		}
		allRelationships = append(allRelationships, result.Relationships...)
	}

	allRelationships = append(allRelationships, pkg.NewRelationships(catalog)...)

	return catalog, allRelationships, errs
}
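
// The helper below is a hypothetical caller sketch (not part of this package's API) showing how Catalog
// is typically invoked; the resolver, release, and cataloger set are assumed to be supplied by the caller,
// and the parallelism of 4 is an arbitrary example value. Note that Catalog returns partial results
// alongside an aggregated (multierror) error.
func catalogAll(resolver file.Resolver, release *linux.Release, catalogers ...pkg.Cataloger) (*pkg.Collection, []artifact.Relationship, error) {
	collection, relationships, err := Catalog(resolver, release, 4, catalogers...)
	if err != nil {
		// individual cataloger failures are aggregated here; the collection may still contain usable results
		log.Debugf("cataloging completed with errors: %v", err)
	}
	return collection, relationships, err
}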

func packageFileOwnershipRelationships(p pkg.Package, resolver file.PathResolver) ([]artifact.Relationship, error) {
	fileOwner, ok := p.Metadata.(pkg.FileOwner)
	if !ok {
		return nil, nil
	}

	locations := map[artifact.ID]file.Location{}

	for _, path := range fileOwner.OwnedFiles() {
		pathRefs, err := resolver.FilesByPath(path)
		if err != nil {
			return nil, fmt.Errorf("unable to find path for path=%q: %w", path, err)
		}

		if len(pathRefs) == 0 {
			// ideally we want to warn users about missing files from a package, however, it is very common for
			// container image authors to delete files that are not needed in order to keep image sizes small. Adding
			// a warning here would be needlessly noisy (even for popular base images).
			continue
		}

		for _, ref := range pathRefs {
			if oldRef, ok := locations[ref.Coordinates.ID()]; ok {
				log.Debugf("found path duplicate of %s", oldRef.RealPath)
			}
			locations[ref.Coordinates.ID()] = ref
		}
	}

	var relationships []artifact.Relationship
	for _, location := range locations {
		relationships = append(relationships, artifact.Relationship{
			From: p,
			To:   location.Coordinates,
			Type: artifact.ContainsRelationship,
		})
	}
	return relationships, nil
}
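
// The type below is a hypothetical illustration (not part of this package): any package metadata type
// that satisfies pkg.FileOwner (assumed here to require only the OwnedFiles() []string method used above)
// will have "contains" file-to-package relationships generated for each of its owned paths.
type exampleOwnedFilesMetadata struct {
	Files []string
}

func (m exampleOwnedFilesMetadata) OwnedFiles() []string {
	return m.Files
}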