github.com/google/osv-scalibr@v0.4.1/artifact/image/layerscanning/trace/trace.go (about)

     1  // Copyright 2025 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package trace provides functionality to trace the origin of an inventory in a container image.
    16  package trace
    17  
    18  import (
    19  	"context"
    20  	"errors"
    21  	"io/fs"
    22  
    23  	"github.com/google/osv-scalibr/extractor"
    24  	"github.com/google/osv-scalibr/extractor/filesystem"
    25  	"github.com/google/osv-scalibr/extractor/filesystem/os/osrelease"
    26  	"github.com/google/osv-scalibr/inventory"
    27  	"github.com/google/osv-scalibr/log"
    28  
    29  	scalibrimage "github.com/google/osv-scalibr/artifact/image"
    30  	scalibrfs "github.com/google/osv-scalibr/fs"
    31  )
    32  
    33  // locationAndIndex is a struct to represent a location and the index of the layer it was found in.
    34  type locationAndIndex struct {
    35  	location string
    36  	index    int
    37  }
    38  
    39  // PopulateLayerDetails populates the LayerDetails field of the inventory with the origin details
    40  // obtained by tracing the inventory in the image.
    41  //
    42  // It does this by looking at each consecutive pair (n, n+1) of chain layers in reverse order and
    43  // checking if a package is present in layer n+1, but not layer n. For example, consider the chain
    44  // layers, each with a different set of packages:
    45  //
    46  //	Chain Layer 0: Packages A, B
    47  //	Chain Layer 1: Packages A
    48  //	Chain Layer 2: Packages A, B, C
    49  //
    50  // Then the origin of package C is layer 2, because it is not present in layer 1, but it is in
    51  // layer 2. Even though package B is present in layer 0, it is attributed to layer 2 because it
    52  // exists in layer 2, but not in layer 1. Package A is attributed to layer 0 because it is present
    53  // in all layers.
    54  //
    55  // Note that a precondition of this algorithm is that the chain layers are ordered by order of
    56  // creation.
    57  func PopulateLayerDetails(ctx context.Context, inv *inventory.Inventory, chainLayers []scalibrimage.ChainLayer, extractors []filesystem.Extractor, config *filesystem.Config) {
    58  	// If there are no chain layers, then there is nothing to trace. This should not happen, but we
    59  	// should handle it gracefully.
    60  	if len(chainLayers) == 0 {
    61  		log.Warnf("No chain layers found, cannot trace inventory.")
    62  		return
    63  	}
    64  
    65  	cim := &extractor.ContainerImageMetadata{
    66  		Index: len(inv.ContainerImageMetadata),
    67  	}
    68  	inv.ContainerImageMetadata = append(inv.ContainerImageMetadata, cim)
    69  	fillLayerMetadataFromChainLayers(cim, chainLayers)
    70  
    71  	osInfo, err := osrelease.GetOSRelease(chainLayers[len(chainLayers)-1].FS())
    72  	if err == nil {
    73  		cim.OSInfo = osInfo
    74  	}
    75  
    76  	// Helper function to update the extractor config.
    77  	updateExtractorConfig := func(pathsToExtract []string, extractor filesystem.Extractor, chainFS scalibrfs.FS) {
    78  		config.Extractors = []filesystem.Extractor{extractor}
    79  		config.PathsToExtract = pathsToExtract
    80  		config.ScanRoots = []*scalibrfs.ScanRoot{
    81  			&scalibrfs.ScanRoot{
    82  				FS: chainFS,
    83  			},
    84  		}
    85  	}
    86  
    87  	// locationIndexToPackages is used as a package cache to avoid re-extracting the same
    88  	// package from a file multiple times.
    89  	locationIndexToPackages := map[locationAndIndex][]*extractor.Package{}
    90  	lastLayerIndex := len(chainLayers) - 1
    91  
    92  	// Build a map from the extractor list for faster access.
    93  	nameToExtractor := map[string]filesystem.Extractor{}
    94  	for _, e := range extractors {
    95  		nameToExtractor[e.Name()] = e
    96  	}
    97  
    98  	for _, pkg := range inv.Packages {
    99  		layerDetails := cim.LayerMetadata[lastLayerIndex]
   100  		var pkgExtractor filesystem.Extractor
   101  		for _, name := range pkg.Plugins {
   102  			if ex, ok := nameToExtractor[name]; ok {
   103  				pkgExtractor = ex
   104  				break
   105  			}
   106  		}
   107  
   108  		// If the package has no locations or no filesystem Extractor, it cannot be traced.
   109  		isPackageTraceable := pkgExtractor != nil && len(pkg.Locations) > 0
   110  		if !isPackageTraceable {
   111  			continue
   112  		}
   113  
   114  		var pkgPURL string
   115  		if pkg.PURL() != nil {
   116  			pkgPURL = pkg.PURL().String()
   117  		}
   118  
   119  		var foundOrigin bool
   120  		fileLocation := pkg.Locations[0]
   121  		lastScannedLayerIndex := len(chainLayers) - 1
   122  
   123  		// Go backwards through the chain layers and find the first layer where the package is not
   124  		// present. Such layer is the layer in which the package was introduced. If the package is
   125  		// present in all layers, then it means it was introduced in the first layer.
   126  		for i := len(chainLayers) - 2; i >= 0; i-- {
   127  			oldChainLayer := chainLayers[i]
   128  
   129  			pkgLocationAndIndex := locationAndIndex{
   130  				location: fileLocation,
   131  				index:    i,
   132  			}
   133  
   134  			var oldPackages []*extractor.Package
   135  			if cachedPackages, ok := locationIndexToPackages[pkgLocationAndIndex]; ok {
   136  				oldPackages = cachedPackages
   137  			} else if _, err := oldChainLayer.FS().Stat(fileLocation); errors.Is(err, fs.ErrNotExist) {
   138  				// Check if file still exist in this layer, if not skip extraction.
   139  				// This is both an optimization, and avoids polluting the log output with false file not found errors.
   140  				oldPackages = []*extractor.Package{}
   141  			} else if filesExistInLayer(oldChainLayer, pkg.Locations) {
   142  				// Update the extractor config to use the files from the current layer.
   143  				// We only take extract the first location because other locations are derived from the initial
   144  				// extraction location. If other locations can no longer be determined from the first location
   145  				// they should not be included here, and the trace for those packages stops here.
   146  				updateExtractorConfig([]string{fileLocation}, pkgExtractor, oldChainLayer.FS())
   147  
   148  				// Runs SCALIBR extraction on the file of interest in oldChainLayer.
   149  				oldInv, _, err := filesystem.Run(ctx, config)
   150  				oldPackages = oldInv.Packages
   151  				if err != nil {
   152  					break
   153  				}
   154  			} else {
   155  				// If none of the files from the packages are present in the underlying layer, then there
   156  				// will be no difference in the extracted packages from oldChainLayer, so extraction can be
   157  				// skipped in the chain layer. This is an optimization to avoid extracting the same package
   158  				// multiple times.
   159  				continue
   160  			}
   161  
   162  			// Cache the packages for future use.
   163  			locationIndexToPackages[pkgLocationAndIndex] = oldPackages
   164  
   165  			foundPackage := false
   166  			for _, oldPKG := range oldPackages {
   167  				// PURLs are being used as a package key, so if they are different, skip this package.
   168  				oldPKGPURL := oldPKG.PURL()
   169  				if oldPKGPURL == nil || oldPKGPURL.String() != pkgPURL {
   170  					continue
   171  				}
   172  
   173  				if !areLocationsEqual(oldPKG.Locations, pkg.Locations) {
   174  					continue
   175  				}
   176  
   177  				foundPackage = true
   178  				break
   179  			}
   180  
   181  			// If the package is not present in the old layer, then it was introduced in the previous layer we actually scanned
   182  			if !foundPackage {
   183  				layerDetails = cim.LayerMetadata[lastScannedLayerIndex]
   184  				foundOrigin = true
   185  				break
   186  			}
   187  
   188  			// This is now the latest scanned layer
   189  			lastScannedLayerIndex = i
   190  		}
   191  
   192  		// If the package is present in every layer, then it means it was introduced in the first
   193  		// layer.
   194  		if !foundOrigin {
   195  			layerDetails = cim.LayerMetadata[0]
   196  		}
   197  		pkg.LayerMetadata = layerDetails
   198  	}
   199  }
   200  
   201  // areLocationsEqual checks if the package location strings are equal.
   202  func areLocationsEqual(fileLocations []string, otherFileLocations []string) bool {
   203  	if len(fileLocations) == 0 || len(otherFileLocations) == 0 {
   204  		log.Warnf("Empty file locations found. This should not happen.")
   205  		return false
   206  	}
   207  
   208  	return fileLocations[0] == otherFileLocations[0]
   209  }
   210  
   211  // getSingleLayerFSFromChainLayer returns the filesystem of the underlying layer in the chain layer.
   212  func getLayerFSFromChainLayer(chainLayer scalibrimage.ChainLayer) (scalibrfs.FS, error) {
   213  	layer := chainLayer.Layer()
   214  	if layer == nil {
   215  		return nil, errors.New("chain layer has no layer")
   216  	}
   217  
   218  	fs := layer.FS()
   219  	if fs == nil {
   220  		return nil, errors.New("layer has no filesystem")
   221  	}
   222  
   223  	return fs, nil
   224  }
   225  
   226  func fillLayerMetadataFromChainLayers(cim *extractor.ContainerImageMetadata, chainLayers []scalibrimage.ChainLayer) {
   227  	// Create list of layer details struct to be referenced by inventory.
   228  	for i, chainLayer := range chainLayers {
   229  		// Get the string representation of the diffID, and remove the algorithm prefix if it exists.
   230  		// TODO: b/406537132 - Determine if diffIDs should be validated via the Validate function in
   231  		// golang/opencontainers/digest/algorithm.go. Just getting the string representation of the
   232  		// diffID acts as failing open, but perhaps we should consider validating the diffID and logging
   233  		// a warning if it isn't.
   234  		metadata := &extractor.LayerMetadata{
   235  			Index:           i,
   236  			ParentContainer: cim,
   237  			ChainID:         chainLayer.ChainID(),
   238  			DiffID:          chainLayer.Layer().DiffID(),
   239  			Command:         chainLayer.Layer().Command(),
   240  			IsEmpty:         chainLayer.Layer().IsEmpty(),
   241  		}
   242  		cim.LayerMetadata = append(cim.LayerMetadata, metadata)
   243  	}
   244  }
   245  
   246  // filesExistInLayer checks if any of the provided files are present in the underlying layer of the
   247  // chain layer.
   248  func filesExistInLayer(chainLayer scalibrimage.ChainLayer, fileLocations []string) bool {
   249  	layerFS, err := getLayerFSFromChainLayer(chainLayer)
   250  	if err != nil {
   251  		return false
   252  	}
   253  
   254  	// Check if any of the files are present in the underlying layer.
   255  	for _, fileLocation := range fileLocations {
   256  		if _, err := layerFS.Stat(fileLocation); err == nil {
   257  			return true
   258  		}
   259  	}
   260  	return false
   261  }