github.com/google/osv-scalibr@v0.4.1/artifact/image/layerscanning/trace/trace.go (about) 1 // Copyright 2025 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package trace provides functionality to trace the origin of an inventory in a container image. 16 package trace 17 18 import ( 19 "context" 20 "errors" 21 "io/fs" 22 23 "github.com/google/osv-scalibr/extractor" 24 "github.com/google/osv-scalibr/extractor/filesystem" 25 "github.com/google/osv-scalibr/extractor/filesystem/os/osrelease" 26 "github.com/google/osv-scalibr/inventory" 27 "github.com/google/osv-scalibr/log" 28 29 scalibrimage "github.com/google/osv-scalibr/artifact/image" 30 scalibrfs "github.com/google/osv-scalibr/fs" 31 ) 32 33 // locationAndIndex is a struct to represent a location and the index of the layer it was found in. 34 type locationAndIndex struct { 35 location string 36 index int 37 } 38 39 // PopulateLayerDetails populates the LayerDetails field of the inventory with the origin details 40 // obtained by tracing the inventory in the image. 41 // 42 // It does this by looking at each consecutive pair (n, n+1) of chain layers in reverse order and 43 // checking if a package is present in layer n+1, but not layer n. For example, consider the chain 44 // layers, each with a different set of packages: 45 // 46 // Chain Layer 0: Packages A, B 47 // Chain Layer 1: Packages A 48 // Chain Layer 2: Packages A, B, C 49 // 50 // Then the origin of package C is layer 2, because it is not present in layer 1, but it is in 51 // layer 2. Even though package B is present in layer 0, it is attributed to layer 2 because it 52 // exists in layer 2, but not in layer 1. Package A is attributed to layer 0 because it is present 53 // in all layers. 54 // 55 // Note that a precondition of this algorithm is that the chain layers are ordered by order of 56 // creation. 57 func PopulateLayerDetails(ctx context.Context, inv *inventory.Inventory, chainLayers []scalibrimage.ChainLayer, extractors []filesystem.Extractor, config *filesystem.Config) { 58 // If there are no chain layers, then there is nothing to trace. This should not happen, but we 59 // should handle it gracefully. 60 if len(chainLayers) == 0 { 61 log.Warnf("No chain layers found, cannot trace inventory.") 62 return 63 } 64 65 cim := &extractor.ContainerImageMetadata{ 66 Index: len(inv.ContainerImageMetadata), 67 } 68 inv.ContainerImageMetadata = append(inv.ContainerImageMetadata, cim) 69 fillLayerMetadataFromChainLayers(cim, chainLayers) 70 71 osInfo, err := osrelease.GetOSRelease(chainLayers[len(chainLayers)-1].FS()) 72 if err == nil { 73 cim.OSInfo = osInfo 74 } 75 76 // Helper function to update the extractor config. 77 updateExtractorConfig := func(pathsToExtract []string, extractor filesystem.Extractor, chainFS scalibrfs.FS) { 78 config.Extractors = []filesystem.Extractor{extractor} 79 config.PathsToExtract = pathsToExtract 80 config.ScanRoots = []*scalibrfs.ScanRoot{ 81 &scalibrfs.ScanRoot{ 82 FS: chainFS, 83 }, 84 } 85 } 86 87 // locationIndexToPackages is used as a package cache to avoid re-extracting the same 88 // package from a file multiple times. 89 locationIndexToPackages := map[locationAndIndex][]*extractor.Package{} 90 lastLayerIndex := len(chainLayers) - 1 91 92 // Build a map from the extractor list for faster access. 93 nameToExtractor := map[string]filesystem.Extractor{} 94 for _, e := range extractors { 95 nameToExtractor[e.Name()] = e 96 } 97 98 for _, pkg := range inv.Packages { 99 layerDetails := cim.LayerMetadata[lastLayerIndex] 100 var pkgExtractor filesystem.Extractor 101 for _, name := range pkg.Plugins { 102 if ex, ok := nameToExtractor[name]; ok { 103 pkgExtractor = ex 104 break 105 } 106 } 107 108 // If the package has no locations or no filesystem Extractor, it cannot be traced. 109 isPackageTraceable := pkgExtractor != nil && len(pkg.Locations) > 0 110 if !isPackageTraceable { 111 continue 112 } 113 114 var pkgPURL string 115 if pkg.PURL() != nil { 116 pkgPURL = pkg.PURL().String() 117 } 118 119 var foundOrigin bool 120 fileLocation := pkg.Locations[0] 121 lastScannedLayerIndex := len(chainLayers) - 1 122 123 // Go backwards through the chain layers and find the first layer where the package is not 124 // present. Such layer is the layer in which the package was introduced. If the package is 125 // present in all layers, then it means it was introduced in the first layer. 126 for i := len(chainLayers) - 2; i >= 0; i-- { 127 oldChainLayer := chainLayers[i] 128 129 pkgLocationAndIndex := locationAndIndex{ 130 location: fileLocation, 131 index: i, 132 } 133 134 var oldPackages []*extractor.Package 135 if cachedPackages, ok := locationIndexToPackages[pkgLocationAndIndex]; ok { 136 oldPackages = cachedPackages 137 } else if _, err := oldChainLayer.FS().Stat(fileLocation); errors.Is(err, fs.ErrNotExist) { 138 // Check if file still exist in this layer, if not skip extraction. 139 // This is both an optimization, and avoids polluting the log output with false file not found errors. 140 oldPackages = []*extractor.Package{} 141 } else if filesExistInLayer(oldChainLayer, pkg.Locations) { 142 // Update the extractor config to use the files from the current layer. 143 // We only take extract the first location because other locations are derived from the initial 144 // extraction location. If other locations can no longer be determined from the first location 145 // they should not be included here, and the trace for those packages stops here. 146 updateExtractorConfig([]string{fileLocation}, pkgExtractor, oldChainLayer.FS()) 147 148 // Runs SCALIBR extraction on the file of interest in oldChainLayer. 149 oldInv, _, err := filesystem.Run(ctx, config) 150 oldPackages = oldInv.Packages 151 if err != nil { 152 break 153 } 154 } else { 155 // If none of the files from the packages are present in the underlying layer, then there 156 // will be no difference in the extracted packages from oldChainLayer, so extraction can be 157 // skipped in the chain layer. This is an optimization to avoid extracting the same package 158 // multiple times. 159 continue 160 } 161 162 // Cache the packages for future use. 163 locationIndexToPackages[pkgLocationAndIndex] = oldPackages 164 165 foundPackage := false 166 for _, oldPKG := range oldPackages { 167 // PURLs are being used as a package key, so if they are different, skip this package. 168 oldPKGPURL := oldPKG.PURL() 169 if oldPKGPURL == nil || oldPKGPURL.String() != pkgPURL { 170 continue 171 } 172 173 if !areLocationsEqual(oldPKG.Locations, pkg.Locations) { 174 continue 175 } 176 177 foundPackage = true 178 break 179 } 180 181 // If the package is not present in the old layer, then it was introduced in the previous layer we actually scanned 182 if !foundPackage { 183 layerDetails = cim.LayerMetadata[lastScannedLayerIndex] 184 foundOrigin = true 185 break 186 } 187 188 // This is now the latest scanned layer 189 lastScannedLayerIndex = i 190 } 191 192 // If the package is present in every layer, then it means it was introduced in the first 193 // layer. 194 if !foundOrigin { 195 layerDetails = cim.LayerMetadata[0] 196 } 197 pkg.LayerMetadata = layerDetails 198 } 199 } 200 201 // areLocationsEqual checks if the package location strings are equal. 202 func areLocationsEqual(fileLocations []string, otherFileLocations []string) bool { 203 if len(fileLocations) == 0 || len(otherFileLocations) == 0 { 204 log.Warnf("Empty file locations found. This should not happen.") 205 return false 206 } 207 208 return fileLocations[0] == otherFileLocations[0] 209 } 210 211 // getSingleLayerFSFromChainLayer returns the filesystem of the underlying layer in the chain layer. 212 func getLayerFSFromChainLayer(chainLayer scalibrimage.ChainLayer) (scalibrfs.FS, error) { 213 layer := chainLayer.Layer() 214 if layer == nil { 215 return nil, errors.New("chain layer has no layer") 216 } 217 218 fs := layer.FS() 219 if fs == nil { 220 return nil, errors.New("layer has no filesystem") 221 } 222 223 return fs, nil 224 } 225 226 func fillLayerMetadataFromChainLayers(cim *extractor.ContainerImageMetadata, chainLayers []scalibrimage.ChainLayer) { 227 // Create list of layer details struct to be referenced by inventory. 228 for i, chainLayer := range chainLayers { 229 // Get the string representation of the diffID, and remove the algorithm prefix if it exists. 230 // TODO: b/406537132 - Determine if diffIDs should be validated via the Validate function in 231 // golang/opencontainers/digest/algorithm.go. Just getting the string representation of the 232 // diffID acts as failing open, but perhaps we should consider validating the diffID and logging 233 // a warning if it isn't. 234 metadata := &extractor.LayerMetadata{ 235 Index: i, 236 ParentContainer: cim, 237 ChainID: chainLayer.ChainID(), 238 DiffID: chainLayer.Layer().DiffID(), 239 Command: chainLayer.Layer().Command(), 240 IsEmpty: chainLayer.Layer().IsEmpty(), 241 } 242 cim.LayerMetadata = append(cim.LayerMetadata, metadata) 243 } 244 } 245 246 // filesExistInLayer checks if any of the provided files are present in the underlying layer of the 247 // chain layer. 248 func filesExistInLayer(chainLayer scalibrimage.ChainLayer, fileLocations []string) bool { 249 layerFS, err := getLayerFSFromChainLayer(chainLayer) 250 if err != nil { 251 return false 252 } 253 254 // Check if any of the files are present in the underlying layer. 255 for _, fileLocation := range fileLocations { 256 if _, err := layerFS.Stat(fileLocation); err == nil { 257 return true 258 } 259 } 260 return false 261 }