github.com/google/osv-scalibr@v0.4.1/enricher/baseimage/baseimage.go (about)

     1  // Copyright 2025 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package baseimage enriches inventory layer details with potential base images from deps.dev.
    16  package baseimage
    17  
    18  import (
    19  	"context"
    20  	"errors"
    21  	"fmt"
    22  	"log"
    23  	"slices"
    24  
    25  	"github.com/google/osv-scalibr/clients/depsdev/v1alpha1/grpcclient"
    26  	"github.com/google/osv-scalibr/enricher"
    27  	"github.com/google/osv-scalibr/extractor"
    28  	"github.com/google/osv-scalibr/inventory"
    29  	"github.com/google/osv-scalibr/plugin"
    30  	"github.com/opencontainers/go-digest"
    31  	"github.com/opencontainers/image-spec/identity"
    32  	"go.uber.org/multierr"
    33  	"golang.org/x/sync/errgroup"
    34  )
    35  
    36  const (
    37  	// Name is the name of the base image enricher.
    38  	Name = "baseimage"
    39  	// Version is the version of the base image enricher.
    40  	Version = 0
    41  	// digestSHA256EmptyTar is the canonical sha256 digest of empty tar file -
    42  	// (1024 NULL bytes)
    43  	digestSHA256EmptyTar = digest.Digest("sha256:5f70bf18a086007016e948b04aed3b82103a36bea41755b6cddfaf10ace3c6ef")
    44  
    45  	maxConcurrentRequests = 1000
    46  )
    47  
    48  // Config is the configuration for the base image enricher.
    49  type Config struct {
    50  	Client Client
    51  }
    52  
    53  // DefaultConfig returns the default configuration for the base image enricher.
    54  func DefaultConfig() *Config {
    55  	grpcConfig := grpcclient.DefaultConfig()
    56  	grpcclient, err := grpcclient.New(grpcConfig)
    57  	if err != nil {
    58  		log.Fatalf("Failed to create base image client: %v", err)
    59  	}
    60  
    61  	client := NewClientGRPC(grpcclient)
    62  
    63  	return &Config{
    64  		Client: client,
    65  	}
    66  }
    67  
    68  // Enricher enriches inventory layer details with potential base images from deps.dev.
    69  type Enricher struct {
    70  	client Client
    71  }
    72  
    73  // New returns a new base image enricher.
    74  func New(cfg *Config) (*Enricher, error) {
    75  	if cfg == nil {
    76  		return nil, errors.New("config is nil")
    77  	}
    78  	if cfg.Client == nil {
    79  		return nil, errors.New("client is nil")
    80  	}
    81  	return &Enricher{client: cfg.Client}, nil
    82  }
    83  
    84  // NewDefault returns a new base image enricher with the default configuration.
    85  // It will log.Fatal if the enricher cannot be created.
    86  func NewDefault() enricher.Enricher {
    87  	e, err := New(DefaultConfig())
    88  	if err != nil {
    89  		log.Fatalf("Failed to create base image enricher: %v", err)
    90  	}
    91  	return e
    92  }
    93  
    94  // Config returns the configuration for the base image enricher.
    95  func (e *Enricher) Config() *Config {
    96  	return &Config{
    97  		Client: e.client,
    98  	}
    99  }
   100  
   101  // Name of the base image enricher.
   102  func (*Enricher) Name() string { return Name }
   103  
   104  // Version of the base image enricher.
   105  func (*Enricher) Version() int { return Version }
   106  
   107  // Requirements of the base image enricher.
   108  func (*Enricher) Requirements() *plugin.Capabilities {
   109  	return &plugin.Capabilities{Network: plugin.NetworkOnline}
   110  }
   111  
   112  // RequiredPlugins returns a list of Plugins that need to be enabled for this Enricher to work.
   113  func (*Enricher) RequiredPlugins() []string {
   114  	return []string{}
   115  }
   116  
   117  // Enrich enriches the inventory with base image information from deps.dev.
   118  func (e *Enricher) Enrich(ctx context.Context, _ *enricher.ScanInput, inv *inventory.Inventory) error {
   119  	if inv.ContainerImageMetadata == nil {
   120  		return nil
   121  	}
   122  
   123  	// Map from chain ID to list of repositories it belongs to.
   124  	chainIDToBaseImage := make(map[string][]*extractor.BaseImageDetails)
   125  	var enrichErr error
   126  	for _, cim := range inv.ContainerImageMetadata {
   127  		if cim.LayerMetadata == nil {
   128  			continue
   129  		}
   130  
   131  		// Placeholder for the scanned image itself.
   132  		cim.BaseImages = [][]*extractor.BaseImageDetails{
   133  			[]*extractor.BaseImageDetails{},
   134  		}
   135  
   136  		chainIDsByLayerIndex := make([]digest.Digest, len(cim.LayerMetadata))
   137  		baseImagesByLayerIndex := make([][]*extractor.BaseImageDetails, len(cim.LayerMetadata))
   138  		g, ctx := errgroup.WithContext(ctx)
   139  		g.SetLimit(maxConcurrentRequests)
   140  
   141  		// We do not want to use the normal chainID of the layer, because it does not include empty
   142  		// layers. Deps.dev does a special calculation of the chainID that includes empty layers, so we
   143  		// do the same here.
   144  		for i, l := range cim.LayerMetadata {
   145  			diffID := l.DiffID
   146  			if l.DiffID == "" {
   147  				diffID = digestSHA256EmptyTar
   148  			}
   149  
   150  			// first populate this with diffIDs
   151  			chainIDsByLayerIndex[i] = diffID
   152  		}
   153  		// This replaces the diffIDs with chainIDs for the corresponding index.
   154  		identity.ChainIDs(chainIDsByLayerIndex)
   155  
   156  		for i, chainID := range chainIDsByLayerIndex {
   157  			if val, ok := chainIDToBaseImage[chainID.String()]; ok {
   158  				// Already cached, we can just skip this layer.
   159  				baseImagesByLayerIndex[i] = val
   160  				continue
   161  			}
   162  
   163  			// Otherwise query deps.dev for the base images of this layer.
   164  			g.Go(func() error {
   165  				if ctx.Err() != nil {
   166  					// this return value doesn't matter to errgroup.Wait(), since it already errored
   167  					return ctx.Err()
   168  				}
   169  
   170  				req := &Request{
   171  					ChainID: chainID.String(),
   172  				}
   173  				resp, err := e.client.QueryContainerImages(ctx, req)
   174  				if err != nil {
   175  					if !errors.Is(err, errNotFound) {
   176  						// If one query fails even with grpc retries, we cancel the rest of the
   177  						// queries and return the error.
   178  						return fmt.Errorf("failed to query container images for chain ID %q: %w", chainID.String(), err)
   179  					}
   180  					return nil
   181  				}
   182  				var baseImages []*extractor.BaseImageDetails
   183  
   184  				if resp != nil && resp.Results != nil && len(resp.Results) > 0 {
   185  					for _, result := range resp.Results {
   186  						if result.Repository != "" {
   187  							baseImages = append(baseImages, &extractor.BaseImageDetails{
   188  								Repository: result.Repository,
   189  								Registry:   "docker.io", // Currently all deps.dev images are from the docker mirror.
   190  								ChainID:    chainID,
   191  								Plugin:     Name,
   192  							})
   193  						}
   194  					}
   195  				}
   196  
   197  				// Cache and also save to layer map.
   198  				baseImagesByLayerIndex[i] = baseImages
   199  
   200  				return nil
   201  			})
   202  		}
   203  
   204  		if err := g.Wait(); err != nil {
   205  			enrichErr = multierr.Append(enrichErr, err)
   206  			// Move onto the next image
   207  			continue
   208  		}
   209  
   210  		// Loop backwards through the layers, from the newest to the oldest layer.
   211  		// This is because base images are identified by the chain ID of the newest layer in the image,
   212  		// so all older layer must belong to that base image.
   213  		for i, lm := range slices.Backward(cim.LayerMetadata) {
   214  			baseImages := baseImagesByLayerIndex[i]
   215  			lm.BaseImageIndex = len(cim.BaseImages) - 1
   216  			chainIDToBaseImage[chainIDsByLayerIndex[i].String()] = baseImages
   217  
   218  			if len(baseImages) == 0 {
   219  				continue
   220  			}
   221  
   222  			// Is the current set of baseImages the same as the previous?
   223  			isSame := false
   224  			lastBaseImages := cim.BaseImages[len(cim.BaseImages)-1]
   225  			if len(baseImages) == len(lastBaseImages) {
   226  				isSame = true
   227  				for j := range baseImages {
   228  					if baseImages[j].Repository != lastBaseImages[j].Repository ||
   229  						baseImages[j].Registry != lastBaseImages[j].Registry {
   230  						isSame = false
   231  						break
   232  					}
   233  				}
   234  			}
   235  
   236  			if !isSame {
   237  				// Only if it's not the same base image, update
   238  				cim.BaseImages = append(cim.BaseImages, baseImages)
   239  				// And if we do update, also change the base image index to new last index.
   240  				lm.BaseImageIndex++
   241  			}
   242  		}
   243  	}
   244  
   245  	return enrichErr
   246  }