github.com/google/osv-scalibr@v0.4.1/scalibr.go (about)

     1  // Copyright 2025 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package scalibr provides an interface for running software inventory
    16  // extraction and security finding detection on a machine.
    17  package scalibr
    18  
    19  import (
    20  	"cmp"
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"os"
    25  	"regexp"
    26  	"runtime"
    27  	"slices"
    28  	"time"
    29  
    30  	"github.com/gobwas/glob"
    31  	"github.com/google/osv-scalibr/annotator"
    32  	"github.com/google/osv-scalibr/artifact/image"
    33  	"github.com/google/osv-scalibr/artifact/image/layerscanning/trace"
    34  	"github.com/google/osv-scalibr/detector"
    35  	"github.com/google/osv-scalibr/detector/detectorrunner"
    36  	"github.com/google/osv-scalibr/enricher"
    37  	ce "github.com/google/osv-scalibr/enricher/secrets/convert"
    38  	"github.com/google/osv-scalibr/extractor"
    39  	"github.com/google/osv-scalibr/extractor/filesystem"
    40  	cf "github.com/google/osv-scalibr/extractor/filesystem/secrets/convert"
    41  	"github.com/google/osv-scalibr/extractor/standalone"
    42  	scalibrfs "github.com/google/osv-scalibr/fs"
    43  	"github.com/google/osv-scalibr/inventory"
    44  	"github.com/google/osv-scalibr/log"
    45  	"github.com/google/osv-scalibr/packageindex"
    46  	"github.com/google/osv-scalibr/plugin"
    47  	pl "github.com/google/osv-scalibr/plugin/list"
    48  	"github.com/google/osv-scalibr/result"
    49  	"github.com/google/osv-scalibr/stats"
    50  	"github.com/google/osv-scalibr/version"
    51  	"go.uber.org/multierr"
    52  
    53  	cpb "github.com/google/osv-scalibr/binary/proto/config_go_proto"
    54  )
    55  
// Sentinel errors returned by Scan when the ScanConfig is invalid.
var (
	// errNoScanRoot is returned when the config contains no scan roots at all.
	errNoScanRoot = errors.New("no scan root specified")
	// errFilesWithSeveralRoots is returned when PathsToExtract is combined with
	// more than one scan root, since the paths couldn't be attributed to a root.
	errFilesWithSeveralRoots = errors.New("can't extract specific files with several scan roots")
)
    60  
    61  // Scanner is the main entry point of the scanner.
    62  type Scanner struct{}
    63  
    64  // New creates a new scanner instance.
    65  func New() *Scanner { return &Scanner{} }
    66  
// ScanConfig stores the config settings of a scan run such as the plugins to
// use and the dir to consider the root of the scanned system.
type ScanConfig struct {
	// Plugins is the list of plugins (extractors, detectors, annotators,
	// enrichers, ...) to run during the scan.
	Plugins []plugin.Plugin
	// Capabilities that the scanning environment satisfies, e.g. whether there's
	// network access. Some plugins can only run if certain requirements are met.
	Capabilities *plugin.Capabilities
	// ScanRoots contain the list of root dir used by file walking during extraction.
	// All extractors and detectors will assume files are relative to these dirs.
	// Example use case: Scanning a container image or source code repo that is
	// mounted to a local dir.
	ScanRoots []*scalibrfs.ScanRoot
	// Optional: Individual file or dir paths to extract inventory from. If specified,
	// the extractors will only look at the specified files or at the contents of the
	// specified directories during the filesystem traversal.
	// Note that on real filesystems these are not relative to the ScanRoots and
	// thus need to be in sub-directories of one of the ScanRoots.
	PathsToExtract []string
	// Optional: If true, only the files in the top-level directories in PathsToExtract are
	// extracted and sub-directories are ignored.
	IgnoreSubDirs bool
	// Optional: Directories that the file system walk should ignore.
	// Note that on real filesystems these are not relative to the ScanRoots and
	// thus need to be in sub-directories of one of the ScanRoots.
	// TODO(b/279413691): Also skip local paths, e.g. "Skip all .git dirs"
	DirsToSkip []string
	// Optional: If the regex matches a directory, it will be skipped.
	SkipDirRegex *regexp.Regexp
	// Optional: If the glob matches a directory, it will be skipped.
	SkipDirGlob glob.Glob
	// Optional: Files larger than this size in bytes are skipped. If 0, no limit is applied.
	MaxFileSize int
	// Optional: Skip files declared in .gitignore files in source repos.
	UseGitignore bool
	// Optional: stats allows to enter a metric hook. If left nil, no metrics will be recorded.
	Stats stats.Collector
	// Optional: Whether to read symlinks.
	ReadSymlinks bool
	// Optional: Limit for visited inodes. If 0, no limit is applied.
	MaxInodes int
	// Optional: By default, inventories stores a path relative to the scan root. If StoreAbsolutePath
	// is set, the absolute path is stored instead.
	StoreAbsolutePath bool
	// Optional: If true, print a detailed analysis of the duration of each extractor.
	PrintDurationAnalysis bool
	// Optional: If true, fail the scan if any permission errors are encountered.
	ErrorOnFSErrors bool
	// Optional: If set, this function is called for each file to check if there is a specific
	// extractor for this file. If it returns an extractor, only that extractor is used for the file.
	ExtractorOverride func(filesystem.FileAPI) []filesystem.Extractor
	// Optional: If set, SCALIBR returns an error when a plugin's required plugin
	// isn't configured instead of enabling required plugins automatically.
	ExplicitPlugins bool
	// Optional: Configuration to apply to auto-enabled required plugins.
	RequiredPluginConfig *cpb.PluginConfig
}
   123  
   124  // EnableRequiredPlugins adds those plugins to the config that are required by enabled
   125  // plugins (such as Detectors or Enrichers) but have not been explicitly enabled.
   126  func (cfg *ScanConfig) EnableRequiredPlugins() error {
   127  	enabledPlugins := map[string]struct{}{}
   128  	for _, e := range cfg.Plugins {
   129  		enabledPlugins[e.Name()] = struct{}{}
   130  	}
   131  
   132  	requiredPlugins := map[string]struct{}{}
   133  	for _, p := range cfg.Plugins {
   134  		if d, ok := p.(detector.Detector); ok {
   135  			for _, req := range d.RequiredExtractors() {
   136  				requiredPlugins[req] = struct{}{}
   137  			}
   138  		}
   139  		if e, ok := p.(enricher.Enricher); ok {
   140  			for _, req := range e.RequiredPlugins() {
   141  				requiredPlugins[req] = struct{}{}
   142  			}
   143  		}
   144  	}
   145  
   146  	for p := range requiredPlugins {
   147  		if _, enabled := enabledPlugins[p]; enabled {
   148  			continue
   149  		}
   150  		if cfg.ExplicitPlugins {
   151  			// Plugins need to be explicitly enabled,
   152  			// so we log an error instead of auto-enabling them.
   153  			return fmt.Errorf("required plugin %q not enabled", p)
   154  		}
   155  
   156  		requiredPlugin, err := pl.FromName(p, cfg.RequiredPluginConfig)
   157  		// TODO: b/416106602 - Implement transitive enablement for required enrichers.
   158  		if err != nil {
   159  			return fmt.Errorf("required plugin %q not present in any list.go: %w", p, err)
   160  		}
   161  		enabledPlugins[p] = struct{}{}
   162  		cfg.Plugins = append(cfg.Plugins, requiredPlugin)
   163  	}
   164  	return nil
   165  }
   166  
   167  // ValidatePluginRequirements checks that the scanning environment's capabilities satisfy
   168  // the requirements of all enabled plugin.
   169  func (cfg *ScanConfig) ValidatePluginRequirements() error {
   170  	errs := []error{}
   171  	for _, p := range cfg.Plugins {
   172  		if err := plugin.ValidateRequirements(p, cfg.Capabilities); err != nil {
   173  			errs = append(errs, err)
   174  		}
   175  	}
   176  	return errors.Join(errs...)
   177  }
   178  
// LINT.IfChange

// ScanResult stores the results of a scan incl. scan status and inventory found.
// Note: this is a type alias (not a distinct type), so values are fully
// interchangeable with result.ScanResult.
// TODO: b/425645186 - Remove this alias once all callers are migrated to the result package.
type ScanResult = result.ScanResult

// LINT.ThenChange(/binary/proto/scan_result.proto)
   186  
// Scan executes the extraction/detection/annotation/etc. plugins using the provided scan config.
//
// The returned ScanResult is never nil: failures are surfaced through its
// Status and PluginStatus fields rather than a separate error return. sr is a
// named return value so that the deferred stats hook can observe the final result.
func (Scanner) Scan(ctx context.Context, config *ScanConfig) (sr *ScanResult) {
	// Fall back to a no-op metrics collector so later code needs no nil checks.
	if config.Stats == nil {
		config.Stats = stats.NoopCollector{}
	}
	// Record overall scan duration and final status once the scan is done.
	defer func() {
		config.Stats.AfterScan(time.Since(sr.StartTime), sr.Status)
	}()
	sro := &newScanResultOptions{
		StartTime: time.Now(),
	}
	// Config validation: auto-enable (or reject) required plugins, check
	// capability requirements, and sanity-check the scan roots.
	if err := config.EnableRequiredPlugins(); err != nil {
		sro.Err = err
	} else if err := config.ValidatePluginRequirements(); err != nil {
		sro.Err = err
	} else if len(config.ScanRoots) == 0 {
		sro.Err = errNoScanRoot
	} else if len(config.PathsToExtract) > 0 && len(config.ScanRoots) > 1 {
		sro.Err = errFilesWithSeveralRoots
	}
	if sro.Err != nil {
		sro.EndTime = time.Now()
		return newScanResult(sro)
	}
	// Phase 1: filesystem extraction across the scan roots.
	extractors := pl.FilesystemExtractors(config.Plugins)
	// Set up Veles-based secret extractors (conversion lives in secrets/convert).
	extractors, err := cf.SetupVelesExtractors(extractors)
	if err != nil {
		sro.Err = multierr.Append(sro.Err, err)
		sro.EndTime = time.Now()
		return newScanResult(sro)
	}
	extractorConfig := &filesystem.Config{
		Stats:                 config.Stats,
		ReadSymlinks:          config.ReadSymlinks,
		Extractors:            extractors,
		PathsToExtract:        config.PathsToExtract,
		IgnoreSubDirs:         config.IgnoreSubDirs,
		DirsToSkip:            config.DirsToSkip,
		SkipDirRegex:          config.SkipDirRegex,
		MaxFileSize:           config.MaxFileSize,
		SkipDirGlob:           config.SkipDirGlob,
		UseGitignore:          config.UseGitignore,
		ScanRoots:             config.ScanRoots,
		MaxInodes:             config.MaxInodes,
		StoreAbsolutePath:     config.StoreAbsolutePath,
		PrintDurationAnalysis: config.PrintDurationAnalysis,
		ErrorOnFSErrors:       config.ErrorOnFSErrors,
		ExtractorOverride:     config.ExtractorOverride,
	}
	inv, extractorStatus, err := filesystem.Run(ctx, extractorConfig)
	if err != nil {
		sro.Err = err
		sro.EndTime = time.Now()
		return newScanResult(sro)
	}

	sro.Inventory = inv
	// Defer cleanup of all temporary files and directories created during extraction.
	// This function iterates over all EmbeddedFS entries in the inventory and
	// removes their associated TempPaths.
	// Any failures during removal are logged but do not interrupt execution.
	defer func() {
		for _, embeddedFS := range sro.Inventory.EmbeddedFSs {
			for _, tmpPath := range embeddedFS.TempPaths {
				if err := os.RemoveAll(tmpPath); err != nil {
					log.Infof("Failed to remove %s", tmpPath)
				}
			}
		}
	}()
	sro.PluginStatus = append(sro.PluginStatus, extractorStatus...)
	// Phase 2: standalone extractors run against the first scan root only.
	sysroot := config.ScanRoots[0]
	standaloneCfg := &standalone.Config{
		Extractors: pl.StandaloneExtractors(config.Plugins),
		ScanRoot:   &scalibrfs.ScanRoot{FS: sysroot.FS, Path: sysroot.Path},
	}
	standaloneInv, standaloneStatus, err := standalone.Run(ctx, standaloneCfg)
	if err != nil {
		sro.Err = err
		sro.EndTime = time.Now()
		return newScanResult(sro)
	}

	sro.Inventory.Append(standaloneInv)
	sro.PluginStatus = append(sro.PluginStatus, standaloneStatus...)

	// Phase 3: detection. Detectors look up extracted packages via the index.
	px, err := packageindex.New(sro.Inventory.Packages)
	if err != nil {
		sro.Err = err
		sro.EndTime = time.Now()
		return newScanResult(sro)
	}

	findings, detectorStatus, err := detectorrunner.Run(
		ctx, config.Stats, pl.Detectors(config.Plugins), &scalibrfs.ScanRoot{FS: sysroot.FS, Path: sysroot.Path}, px,
	)
	sro.Inventory.PackageVulns = findings.PackageVulns
	sro.Inventory.GenericFindings = findings.GenericFindings
	sro.PluginStatus = append(sro.PluginStatus, detectorStatus...)
	// A detector error does not end the scan here: annotators and enrichers
	// still run below; the error is carried in sro.Err.
	if err != nil {
		sro.Err = err
	}

	// Phase 4: annotation of the collected inventory.
	annotatorCfg := &annotator.Config{
		Annotators: pl.Annotators(config.Plugins),
		ScanRoot:   sysroot,
	}
	annotatorStatus, err := annotator.Run(ctx, annotatorCfg, &sro.Inventory)
	sro.PluginStatus = append(sro.PluginStatus, annotatorStatus...)
	if err != nil {
		sro.Err = multierr.Append(sro.Err, err)
	}

	// Phase 5: enrichment, with Veles enricher setup mirroring phase 1.
	enrichers := pl.Enrichers(config.Plugins)
	enrichers, err = ce.SetupVelesEnrichers(enrichers)
	if err != nil {
		sro.Err = multierr.Append(sro.Err, err)
		sro.EndTime = time.Now()
		return newScanResult(sro)
	}
	enricherCfg := &enricher.Config{
		Enrichers: enrichers,
		ScanRoot: &scalibrfs.ScanRoot{
			FS:   sysroot.FS,
			Path: sysroot.Path,
		},
	}
	enricherStatus, err := enricher.Run(ctx, enricherCfg, &sro.Inventory)
	sro.PluginStatus = append(sro.PluginStatus, enricherStatus...)
	if err != nil {
		sro.Err = multierr.Append(sro.Err, err)
	}

	sro.EndTime = time.Now()
	return newScanResult(sro)
}
   323  
// ScanContainer scans the provided container image for packages and security findings using the
// provided scan config. It populates the LayerDetails field of the packages with the origin layer
// details. Functions to create an Image from a tarball, remote name, or v1.Image are available in
// the artifact/image/layerscanning/image package.
func (s Scanner) ScanContainer(ctx context.Context, img image.Image, config *ScanConfig) (sr *ScanResult, err error) {
	// Container scans define their own scan root; caller-supplied roots are discarded.
	if len(config.ScanRoots) > 0 {
		log.Warnf("expected no scan roots, but got %d scan roots, overwriting with container image scan root", len(config.ScanRoots))
	}

	imagefs := img.FS()
	// Overwrite the scan roots with the chain layer filesystem.
	config.ScanRoots = []*scalibrfs.ScanRoot{
		{
			FS: imagefs,
		},
	}

	storeAbsPath := config.StoreAbsolutePath
	// Don't try and store absolute path because on windows it will turn unix paths into
	// Windows paths.
	config.StoreAbsolutePath = false

	// Suppress running enrichers until after layer details are populated.
	var enrichers []enricher.Enricher
	var nonEnricherPlugins []plugin.Plugin

	for _, p := range config.Plugins {
		if e, ok := p.(enricher.Enricher); ok {
			enrichers = append(enrichers, e)
		} else {
			nonEnricherPlugins = append(nonEnricherPlugins, p)
		}
	}
	config.Plugins = nonEnricherPlugins

	chainLayers, err := img.ChainLayers()
	if err != nil {
		return nil, fmt.Errorf("failed to get chain layers: %w", err)
	}

	// Run the regular scan (without enrichers) over the image filesystem.
	scanResult := s.Scan(ctx, config)
	extractors := pl.FilesystemExtractors(config.Plugins)
	extractors, err = cf.SetupVelesExtractors(extractors)
	if err != nil {
		return scanResult, err
	}
	// Rebuild the same filesystem extraction config used by Scan above,
	// presumably so layer tracing can re-run extraction with identical
	// settings — confirm against the trace package.
	extractorConfig := &filesystem.Config{
		Stats:                 config.Stats,
		ReadSymlinks:          config.ReadSymlinks,
		Extractors:            extractors,
		PathsToExtract:        config.PathsToExtract,
		IgnoreSubDirs:         config.IgnoreSubDirs,
		DirsToSkip:            config.DirsToSkip,
		SkipDirRegex:          config.SkipDirRegex,
		MaxFileSize:           config.MaxFileSize,
		SkipDirGlob:           config.SkipDirGlob,
		UseGitignore:          config.UseGitignore,
		ScanRoots:             config.ScanRoots,
		MaxInodes:             config.MaxInodes,
		StoreAbsolutePath:     config.StoreAbsolutePath,
		PrintDurationAnalysis: config.PrintDurationAnalysis,
		ErrorOnFSErrors:       config.ErrorOnFSErrors,
		ExtractorOverride:     config.ExtractorOverride,
	}

	// Populate the LayerDetails field of the inventory by tracing the layer origins.
	// NOTE(review): this passes a freshly computed pl.FilesystemExtractors(...)
	// instead of the Veles-converted "extractors" slice built above (which is
	// only placed in extractorConfig) — confirm whether the converted list
	// should be passed here as well.
	trace.PopulateLayerDetails(ctx, &scanResult.Inventory, chainLayers, pl.FilesystemExtractors(config.Plugins), extractorConfig)

	// Since we skipped storing absolute path in the main Scan function.
	// Actually convert it to absolute path here.
	if storeAbsPath {
		for _, pkg := range scanResult.Inventory.Packages {
			for i := range pkg.Locations {
				pkg.Locations[i] = "/" + pkg.Locations[i]
			}
		}
	}

	// Run enrichers with the updated inventory.
	enrichers, err = ce.SetupVelesEnrichers(enrichers)
	if err != nil {
		scanResult.Status.Status = plugin.ScanStatusFailed
		scanResult.Status.FailureReason = err.Error()
		return scanResult, nil //nolint:nilerr // Errors are returned in the scanResult.
	}
	enricherCfg := &enricher.Config{
		Enrichers: enrichers,
		ScanRoot: &scalibrfs.ScanRoot{
			FS: imagefs,
		},
	}
	enricherStatus, err := enricher.Run(ctx, enricherCfg, &scanResult.Inventory)
	scanResult.PluginStatus = append(scanResult.PluginStatus, enricherStatus...)
	if err != nil {
		scanResult.Status.Status = plugin.ScanStatusFailed
		scanResult.Status.FailureReason = err.Error()
	}

	// Keep the img variable alive till the end incase cleanup is not called on the parent.
	runtime.KeepAlive(img)

	return scanResult, nil
}
   427  
// newScanResultOptions bundles the intermediate state collected during a scan
// that newScanResult assembles into the final ScanResult.
type newScanResultOptions struct {
	StartTime    time.Time           // Timestamp at which the scan started.
	EndTime      time.Time           // Timestamp at which the scan finished.
	PluginStatus []*plugin.Status    // Per-plugin success/failure statuses.
	Inventory    inventory.Inventory // Everything found by the scan so far.
	Err          error               // Fatal error, if any; marks the whole scan as failed.
}
   435  
   436  func newScanResult(o *newScanResultOptions) *ScanResult {
   437  	status := &plugin.ScanStatus{}
   438  	if o.Err != nil {
   439  		status.Status = plugin.ScanStatusFailed
   440  		status.FailureReason = o.Err.Error()
   441  	} else {
   442  		status.Status = plugin.ScanStatusSucceeded
   443  		// If any plugin failed, set the overall scan status to partially succeeded.
   444  		for _, pluginStatus := range o.PluginStatus {
   445  			if pluginStatus.Status.Status == plugin.ScanStatusFailed {
   446  				status.Status = plugin.ScanStatusPartiallySucceeded
   447  				status.FailureReason = "not all plugins succeeded, see the plugin statuses"
   448  				break
   449  			}
   450  		}
   451  	}
   452  	r := &ScanResult{
   453  		StartTime:    o.StartTime,
   454  		EndTime:      o.EndTime,
   455  		Version:      version.ScannerVersion,
   456  		Status:       status,
   457  		PluginStatus: o.PluginStatus,
   458  		Inventory:    o.Inventory,
   459  	}
   460  
   461  	// Sort results for better diffing.
   462  	sortResults(r)
   463  	return r
   464  }
   465  
   466  // sortResults sorts the result to make the output deterministic and diffable.
   467  func sortResults(results *ScanResult) {
   468  	slices.SortFunc(results.PluginStatus, cmpStatus)
   469  	slices.SortFunc(results.Inventory.Packages, CmpPackages)
   470  	slices.SortFunc(results.Inventory.PackageVulns, cmpPackageVulns)
   471  	slices.SortFunc(results.Inventory.GenericFindings, cmpGenericFindings)
   472  }
   473  
   474  // CmpPackages is a comparison helper fun to be used for sorting Package structs.
   475  func CmpPackages(a, b *extractor.Package) int {
   476  	res := cmp.Or(
   477  		cmp.Compare(a.Name, b.Name),
   478  		cmp.Compare(a.Version, b.Version),
   479  		cmp.Compare(len(a.Plugins), len(b.Plugins)),
   480  	)
   481  	if res != 0 {
   482  		return res
   483  	}
   484  
   485  	res = 0
   486  	for i := range a.Plugins {
   487  		res = cmp.Or(res, cmp.Compare(a.Plugins[i], b.Plugins[i]))
   488  	}
   489  	if res != 0 {
   490  		return res
   491  	}
   492  
   493  	aloc := fmt.Sprintf("%v", a.Locations)
   494  	bloc := fmt.Sprintf("%v", b.Locations)
   495  	return cmp.Compare(aloc, bloc)
   496  }
   497  
   498  func cmpStatus(a, b *plugin.Status) int {
   499  	return cmpString(a.Name, b.Name)
   500  }
   501  
   502  func cmpPackageVulns(a, b *inventory.PackageVuln) int {
   503  	return cmpString(a.Vulnerability.Id, b.Vulnerability.Id)
   504  }
   505  
   506  func cmpGenericFindings(a, b *inventory.GenericFinding) int {
   507  	if a.Adv.ID.Reference != b.Adv.ID.Reference {
   508  		return cmpString(a.Adv.ID.Reference, b.Adv.ID.Reference)
   509  	}
   510  	return cmpString(a.Target.Extra, b.Target.Extra)
   511  }
   512  
   513  func cmpString(a, b string) int {
   514  	if a < b {
   515  		return -1
   516  	} else if a > b {
   517  		return 1
   518  	}
   519  	return 0
   520  }