github.com/google/osv-scalibr@v0.4.1/extractor/filesystem/containers/dockerbaseimage/dockerbaseimage.go (about)

     1  // Copyright 2025 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package dockerbaseimage extracts base image urls from Dockerfiles.
    16  package dockerbaseimage
    17  
    18  import (
    19  	"context"
    20  	"errors"
    21  	"fmt"
    22  	"io"
    23  	"path/filepath"
    24  	"slices"
    25  	"strings"
    26  
    27  	"github.com/google/osv-scalibr/extractor"
    28  	"github.com/google/osv-scalibr/extractor/filesystem"
    29  	"github.com/google/osv-scalibr/extractor/filesystem/internal/units"
    30  	"github.com/google/osv-scalibr/inventory"
    31  	"github.com/google/osv-scalibr/log"
    32  	"github.com/google/osv-scalibr/plugin"
    33  	"github.com/google/osv-scalibr/purl"
    34  	"github.com/google/osv-scalibr/stats"
    35  	"github.com/moby/buildkit/frontend/dockerfile/linter"
    36  
    37  	mbi "github.com/moby/buildkit/frontend/dockerfile/instructions"
    38  	mbp "github.com/moby/buildkit/frontend/dockerfile/parser"
    39  )
    40  
const (
	// Name is the unique name of this extractor.
	Name = "containers/dockerbaseimage"

	// DefaultMaxFileSizeBytes is the default maximum file size the extractor will
	// attempt to extract. If a file is encountered that is larger than this
	// limit, `Extract` skips it and returns an empty inventory.
	DefaultMaxFileSizeBytes = 1 * units.MiB
)
    50  
var (
	// dockerBaseContainers is a list of reserved terms/base containers that can be used within a
	// Dockerfile (e.g. "scratch" is Docker's reserved, minimal image) and require special handling.
	// Stages whose base name appears in this list are not reported as base images.
	dockerBaseContainers = []string{"scratch"}
)
    56  
// Config is the configuration for the Extractor.
type Config struct {
	// Stats is a stats collector for reporting metrics.
	Stats stats.Collector
	// MaxFileSizeBytes is the maximum file size this extractor will parse.
	// `Extract` skips any file whose size exceeds this limit and returns an
	// empty inventory for it.
	MaxFileSizeBytes int64
}
    65  
    66  // DefaultConfig returns the default configuration for the extractor.
    67  func DefaultConfig() Config {
    68  	return Config{
    69  		MaxFileSizeBytes: DefaultMaxFileSizeBytes,
    70  	}
    71  }
    72  
// Extractor extracts base image references from Dockerfiles.
//
// stats holds the collector supplied through Config.Stats, and
// maxFileSizeBytes is the size limit above which Extract skips a file.
type Extractor struct {
	stats            stats.Collector
	maxFileSizeBytes int64
}
    78  
    79  // New returns a Dockerfile repository extractor.
    80  //
    81  // For most use cases, initialize with:
    82  // ```
    83  // e := New(DefaultConfig())
    84  // ```
    85  func New(cfg Config) *Extractor {
    86  	return &Extractor{
    87  		stats:            cfg.Stats,
    88  		maxFileSizeBytes: cfg.MaxFileSizeBytes,
    89  	}
    90  }
    91  
    92  // NewDefault returns an extractor with the default config settings.
    93  func NewDefault() filesystem.Extractor { return New(DefaultConfig()) }
    94  
    95  // Name of the extractor.
    96  func (e Extractor) Name() string { return Name }
    97  
    98  // Version of the extractor.
    99  func (e Extractor) Version() int { return 0 }
   100  
   101  // FileRequired returns true if the specified file matches Dockerfile.
   102  func (e Extractor) FileRequired(api filesystem.FileAPI) bool {
   103  	fileName := filepath.Base(api.Path())
   104  	ext := filepath.Ext(fileName)
   105  	baseName := strings.TrimSuffix(fileName, ext)
   106  	return strings.ToLower(baseName) == "dockerfile" || strings.ToLower(ext) == ".dockerfile"
   107  }
   108  
   109  // Extract extracts base image urls from a Dockerfile.
   110  func (e Extractor) Extract(ctx context.Context, input *filesystem.ScanInput) (inventory.Inventory, error) {
   111  	if input.Info == nil {
   112  		return inventory.Inventory{}, errors.New("input.Info is nil")
   113  	}
   114  	if input.Info.Size() > e.maxFileSizeBytes {
   115  		// Skipping too large file.
   116  		log.Infof("Skipping too large file: %s", input.Path)
   117  		return inventory.Inventory{}, nil
   118  	}
   119  
   120  	stages, args, err := parse(input.Reader)
   121  	if err != nil {
   122  		log.Warnf("Parsing error: %v", err)
   123  		return inventory.Inventory{}, err
   124  	}
   125  
   126  	argsMap := toMap(args)
   127  	baseContainers := uniqueContainers(stages)
   128  
   129  	var pkgs []*extractor.Package
   130  	for _, container := range baseContainers {
   131  		resolvedName := resolveName(container, argsMap)
   132  
   133  		name, version := parseName(resolvedName)
   134  
   135  		pkgs = append(pkgs, &extractor.Package{
   136  			Locations: []string{input.Path},
   137  			Name:      name,
   138  			Version:   version,
   139  			PURLType:  purl.TypeDocker,
   140  		})
   141  	}
   142  
   143  	return inventory.Inventory{Packages: pkgs}, nil
   144  }
   145  
   146  // Requirements of the extractor.
   147  func (e Extractor) Requirements() *plugin.Capabilities { return &plugin.Capabilities{} }
   148  
   149  func resolveName(name string, argsMap map[string]string) string {
   150  	if !strings.HasPrefix(name, "$") {
   151  		return name
   152  	}
   153  	resolved := argsMap[strings.Trim(name, "${}")]
   154  	if resolved == "" {
   155  		return name
   156  	}
   157  	return resolved
   158  }
   159  
   160  func parseName(name string) (string, string) {
   161  	if strings.Contains(name, "@") {
   162  		parts := strings.SplitN(name, "@", 2)
   163  		return parts[0], parts[1]
   164  	}
   165  
   166  	if strings.Contains(name, ":") {
   167  		parts := strings.SplitN(name, ":", 2)
   168  		return parts[0], parts[1]
   169  	}
   170  
   171  	return name, "latest"
   172  }
   173  
   174  func toMap(args []mbi.ArgCommand) map[string]string {
   175  	m := make(map[string]string)
   176  	for _, arg := range args {
   177  		for _, arg := range arg.Args {
   178  			if arg.Value != nil {
   179  				m[arg.Key] = *arg.Value
   180  			}
   181  		}
   182  	}
   183  	return m
   184  }
   185  
   186  func uniqueContainers(stages []mbi.Stage) []string {
   187  	stagesSeen := make(map[string]bool)
   188  	containersSeen := make(map[string]bool)
   189  	var baseContainers []string
   190  	for _, stage := range stages {
   191  		if slices.Contains(dockerBaseContainers, stage.BaseName) {
   192  			// Skip base containers that are reserved or special values.
   193  			continue
   194  		}
   195  		stagesSeen[stage.Name] = true
   196  		if stagesSeen[stage.BaseName] {
   197  			continue
   198  		}
   199  		baseContainer := stage.BaseName
   200  		if containersSeen[baseContainer] {
   201  			continue
   202  		}
   203  		baseContainers = append(baseContainers, baseContainer)
   204  		containersSeen[baseContainer] = true
   205  	}
   206  	return baseContainers
   207  }
   208  
   209  func parse(r io.Reader) ([]mbi.Stage, []mbi.ArgCommand, error) {
   210  	p, err := mbp.Parse(r)
   211  	if err != nil {
   212  		return nil, nil, fmt.Errorf("failed to parse dockerfile: %w", err)
   213  	}
   214  
   215  	return mbi.Parse(p.AST, linter.New(&linter.Config{}))
   216  }