github.com/nextlinux/gosbom@v0.81.1-0.20230627115839-1ff50c281391/gosbom/pkg/cataloger/catalog.go (about) 1 package cataloger 2 3 import ( 4 "fmt" 5 "math" 6 "runtime/debug" 7 "sync" 8 9 "github.com/hashicorp/go-multierror" 10 "github.com/nextlinux/gosbom/gosbom/artifact" 11 "github.com/nextlinux/gosbom/gosbom/event" 12 "github.com/nextlinux/gosbom/gosbom/file" 13 "github.com/nextlinux/gosbom/gosbom/linux" 14 "github.com/nextlinux/gosbom/gosbom/pkg" 15 "github.com/nextlinux/gosbom/gosbom/pkg/cataloger/common/cpe" 16 "github.com/nextlinux/gosbom/internal/bus" 17 "github.com/nextlinux/gosbom/internal/log" 18 "github.com/wagoodman/go-partybus" 19 "github.com/wagoodman/go-progress" 20 ) 21 22 // Monitor provides progress-related data for observing the progress of a Catalog() call (published on the event bus). 23 type Monitor struct { 24 FilesProcessed progress.Monitorable // the number of files selected and contents analyzed from all registered catalogers 25 PackagesDiscovered progress.Monitorable // the number of packages discovered from all registered catalogers 26 } 27 28 // catalogResult provides the result of running a single cataloger against source 29 type catalogResult struct { 30 Packages []pkg.Package 31 Relationships []artifact.Relationship 32 // Discovered may sometimes be more than len(packages) 33 Discovered int64 34 Error error 35 } 36 37 // newMonitor creates a new Monitor object and publishes the object on the bus as a PackageCatalogerStarted event. 38 func newMonitor() (*progress.Manual, *progress.Manual) { 39 filesProcessed := progress.Manual{} 40 packagesDiscovered := progress.Manual{} 41 42 bus.Publish(partybus.Event{ 43 Type: event.PackageCatalogerStarted, 44 Value: Monitor{ 45 FilesProcessed: progress.Monitorable(&filesProcessed), 46 PackagesDiscovered: progress.Monitorable(&packagesDiscovered), 47 }, 48 }) 49 return &filesProcessed, &packagesDiscovered 50 } 51 52 func runCataloger(cataloger pkg.Cataloger, resolver file.Resolver) (catalogerResult *catalogResult, err error) { 53 // handle individual cataloger panics 54 defer func() { 55 if e := recover(); e != nil { 56 err = fmt.Errorf("%v at:\n%s", e, string(debug.Stack())) 57 } 58 }() 59 60 catalogerResult = new(catalogResult) 61 62 // find packages from the underlying raw data 63 log.WithFields("cataloger", cataloger.Name()).Trace("cataloging started") 64 packages, relationships, err := cataloger.Catalog(resolver) 65 if err != nil { 66 log.WithFields("cataloger", cataloger.Name()).Warn("error while cataloging") 67 return catalogerResult, err 68 } 69 70 catalogedPackages := len(packages) 71 72 log.WithFields("cataloger", cataloger.Name()).Debugf("discovered %d packages", catalogedPackages) 73 catalogerResult.Discovered = int64(catalogedPackages) 74 75 for _, p := range packages { 76 // generate CPEs (note: this is excluded from package ID, so is safe to mutate) 77 // we might have binary classified CPE already with the package so we want to append here 78 p.CPEs = append(p.CPEs, cpe.Generate(p)...) 79 80 // if we were not able to identify the language we have an opportunity 81 // to try and get this value from the PURL. Worst case we assert that 82 // we could not identify the language at either stage and set UnknownLanguage 83 if p.Language == "" { 84 p.Language = pkg.LanguageFromPURL(p.PURL) 85 } 86 87 // create file-to-package relationships for files owned by the package 88 owningRelationships, err := packageFileOwnershipRelationships(p, resolver) 89 if err != nil { 90 log.WithFields("cataloger", cataloger.Name(), "package", p.Name, "error", err).Warnf("unable to create any package-file relationships") 91 } else { 92 catalogerResult.Relationships = append(catalogerResult.Relationships, owningRelationships...) 93 } 94 catalogerResult.Packages = append(catalogerResult.Packages, p) 95 } 96 catalogerResult.Relationships = append(catalogerResult.Relationships, relationships...) 97 log.WithFields("cataloger", cataloger.Name()).Trace("cataloging complete") 98 return catalogerResult, err 99 } 100 101 // Catalog a given source (container image or filesystem) with the given catalogers, returning all discovered packages. 102 // In order to efficiently retrieve contents from a underlying container image the content fetch requests are 103 // done in bulk. Specifically, all files of interest are collected from each catalogers and accumulated into a single 104 // request. 105 // 106 //nolint:funlen 107 func Catalog(resolver file.Resolver, _ *linux.Release, parallelism int, catalogers ...pkg.Cataloger) (*pkg.Collection, []artifact.Relationship, error) { 108 catalog := pkg.NewCollection() 109 var allRelationships []artifact.Relationship 110 111 filesProcessed, packagesDiscovered := newMonitor() 112 defer filesProcessed.SetCompleted() 113 defer packagesDiscovered.SetCompleted() 114 115 // perform analysis, accumulating errors for each failed analysis 116 var errs error 117 118 nCatalogers := len(catalogers) 119 120 // we do not need more parallelism than there are `catalogers`. 121 parallelism = int(math.Min(float64(nCatalogers), math.Max(1.0, float64(parallelism)))) 122 log.WithFields("parallelism", parallelism, "catalogers", nCatalogers).Debug("cataloging packages") 123 124 jobs := make(chan pkg.Cataloger, nCatalogers) 125 results := make(chan *catalogResult, nCatalogers) 126 discoveredPackages := make(chan int64, nCatalogers) 127 128 waitGroup := sync.WaitGroup{} 129 130 for i := 0; i < parallelism; i++ { 131 waitGroup.Add(1) 132 133 go func() { 134 defer waitGroup.Done() 135 136 // wait for / get the next cataloger job available. 137 for cataloger := range jobs { 138 result, err := runCataloger(cataloger, resolver) 139 140 // ensure we set the error to be aggregated 141 result.Error = err 142 143 discoveredPackages <- result.Discovered 144 145 results <- result 146 } 147 }() 148 } 149 150 // dynamically show updated discovered package status 151 go func() { 152 for discovered := range discoveredPackages { 153 packagesDiscovered.Add(discovered) 154 } 155 }() 156 157 // Enqueue the jobs 158 for _, cataloger := range catalogers { 159 jobs <- cataloger 160 } 161 close(jobs) 162 163 // Wait for the jobs to finish 164 waitGroup.Wait() 165 close(results) 166 close(discoveredPackages) 167 168 // collect the results 169 for result := range results { 170 if result.Error != nil { 171 errs = multierror.Append(errs, result.Error) 172 } 173 for _, p := range result.Packages { 174 catalog.Add(p) 175 } 176 allRelationships = append(allRelationships, result.Relationships...) 177 } 178 179 allRelationships = append(allRelationships, pkg.NewRelationships(catalog)...) 180 181 return catalog, allRelationships, errs 182 } 183 184 func packageFileOwnershipRelationships(p pkg.Package, resolver file.PathResolver) ([]artifact.Relationship, error) { 185 fileOwner, ok := p.Metadata.(pkg.FileOwner) 186 if !ok { 187 return nil, nil 188 } 189 190 locations := map[artifact.ID]file.Location{} 191 192 for _, path := range fileOwner.OwnedFiles() { 193 pathRefs, err := resolver.FilesByPath(path) 194 if err != nil { 195 return nil, fmt.Errorf("unable to find path for path=%q: %w", path, err) 196 } 197 198 if len(pathRefs) == 0 { 199 // ideally we want to warn users about missing files from a package, however, it is very common for 200 // container image authors to delete files that are not needed in order to keep image sizes small. Adding 201 // a warning here would be needlessly noisy (even for popular base images). 202 continue 203 } 204 205 for _, ref := range pathRefs { 206 if oldRef, ok := locations[ref.Coordinates.ID()]; ok { 207 log.Debugf("found path duplicate of %s", oldRef.RealPath) 208 } 209 locations[ref.Coordinates.ID()] = ref 210 } 211 } 212 213 var relationships []artifact.Relationship 214 for _, location := range locations { 215 relationships = append(relationships, artifact.Relationship{ 216 From: p, 217 To: location.Coordinates, 218 Type: artifact.ContainsRelationship, 219 }) 220 } 221 return relationships, nil 222 }