github.com/google/osv-scalibr@v0.4.1/extractor/filesystem/os/dpkg/dpkg.go (about)

     1  // Copyright 2025 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package dpkg extracts packages from dpkg database.
    16  package dpkg
    17  
    18  import (
    19  	"bufio"
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"net/textproto"
    25  	"path/filepath"
    26  	"strings"
    27  
    28  	"github.com/google/osv-scalibr/extractor"
    29  	"github.com/google/osv-scalibr/extractor/filesystem"
    30  	"github.com/google/osv-scalibr/extractor/filesystem/internal/units"
    31  	dpkgmeta "github.com/google/osv-scalibr/extractor/filesystem/os/dpkg/metadata"
    32  	"github.com/google/osv-scalibr/extractor/filesystem/os/osrelease"
    33  	"github.com/google/osv-scalibr/inventory"
    34  	"github.com/google/osv-scalibr/inventory/vex"
    35  	"github.com/google/osv-scalibr/log"
    36  	"github.com/google/osv-scalibr/plugin"
    37  	"github.com/google/osv-scalibr/purl"
    38  	"github.com/google/osv-scalibr/stats"
    39  )
    40  
const (
	// Name is the unique name of this extractor.
	Name = "os/dpkg"

	// defaultMaxFileSizeBytes is the default value for the MaxFileSizeBytes
	// option. The limit is enforced in FileRequired, which reports files larger
	// than the configured size as not required.
	defaultMaxFileSizeBytes = 100 * units.MiB

	// defaultIncludeNotInstalled is the default value for the IncludeNotInstalled option.
	defaultIncludeNotInstalled = false
)
    52  
// Config is the configuration for the Extractor.
type Config struct {
	// Stats is a stats collector for reporting metrics. It may be nil, in which
	// case no metrics are reported.
	Stats stats.Collector
	// MaxFileSizeBytes is the maximum file size this extractor will unmarshal. If
	// `FileRequired` gets a bigger file, it will return false. The check is only
	// applied when the value is greater than zero.
	MaxFileSizeBytes int64
	// IncludeNotInstalled includes packages that are not installed
	// (e.g. `deinstall`, `purge`, and those missing a status field).
	IncludeNotInstalled bool
}
    64  
    65  // DefaultConfig returns the default configuration for the DPKG extractor.
    66  func DefaultConfig() Config {
    67  	return Config{
    68  		MaxFileSizeBytes:    defaultMaxFileSizeBytes,
    69  		IncludeNotInstalled: defaultIncludeNotInstalled,
    70  	}
    71  }
    72  
// Extractor extracts packages from DPKG files.
//
// Its fields mirror Config: stats receives metrics and may be nil (reporting is
// then skipped), maxFileSizeBytes is the limit enforced by FileRequired (only
// when positive), and includeNotInstalled turns off the Status-based filtering
// of packages during extraction.
type Extractor struct {
	stats               stats.Collector
	maxFileSizeBytes    int64
	includeNotInstalled bool
}
    79  
    80  // New returns a DPKG extractor.
    81  //
    82  // For most use cases, initialize with:
    83  // ```
    84  // e := New(DefaultConfig())
    85  // ```
    86  func New(cfg Config) *Extractor {
    87  	return &Extractor{
    88  		stats:               cfg.Stats,
    89  		maxFileSizeBytes:    cfg.MaxFileSizeBytes,
    90  		includeNotInstalled: cfg.IncludeNotInstalled,
    91  	}
    92  }
    93  
    94  // NewDefault returns an extractor with the default config settings.
    95  func NewDefault() filesystem.Extractor { return New(DefaultConfig()) }
    96  
    97  // Config returns the configuration of the extractor.
    98  func (e Extractor) Config() Config {
    99  	return Config{
   100  		Stats:               e.stats,
   101  		MaxFileSizeBytes:    e.maxFileSizeBytes,
   102  		IncludeNotInstalled: e.includeNotInstalled,
   103  	}
   104  }
   105  
   106  // Name of the extractor.
   107  func (e Extractor) Name() string { return Name }
   108  
   109  // Version of the extractor.
   110  func (e Extractor) Version() int { return 0 }
   111  
   112  // Requirements of the extractor.
   113  func (e Extractor) Requirements() *plugin.Capabilities { return &plugin.Capabilities{} }
   114  
   115  // FileRequired returns true if the specified file matches dpkg status file pattern.
   116  func (e Extractor) FileRequired(api filesystem.FileAPI) bool {
   117  	path := api.Path()
   118  	if !fileRequired(path) {
   119  		return false
   120  	}
   121  
   122  	fileinfo, err := api.Stat()
   123  	if err != nil {
   124  		return false
   125  	}
   126  	if e.maxFileSizeBytes > 0 && fileinfo.Size() > e.maxFileSizeBytes {
   127  		e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultSizeLimitExceeded)
   128  		return false
   129  	}
   130  
   131  	e.reportFileRequired(path, fileinfo.Size(), stats.FileRequiredResultOK)
   132  	return true
   133  }
   134  
   135  func fileRequired(path string) bool {
   136  	normalized := filepath.ToSlash(path)
   137  
   138  	// Normal status file matching DPKG or OPKG format
   139  	if normalized == "var/lib/dpkg/status" || normalized == "usr/lib/opkg/status" {
   140  		return true
   141  	}
   142  
   143  	// Should only match status files in status.d directory.
   144  	return strings.HasPrefix(normalized, "var/lib/dpkg/status.d/") && !strings.HasSuffix(normalized, ".md5sums")
   145  }
   146  
   147  func (e Extractor) reportFileRequired(path string, fileSizeBytes int64, result stats.FileRequiredResult) {
   148  	if e.stats == nil {
   149  		return
   150  	}
   151  	e.stats.AfterFileRequired(e.Name(), &stats.FileRequiredStats{
   152  		Path:          path,
   153  		Result:        result,
   154  		FileSizeBytes: fileSizeBytes,
   155  	})
   156  }
   157  
   158  // Extract extracts packages from dpkg status files passed through the scan input.
   159  func (e Extractor) Extract(ctx context.Context, input *filesystem.ScanInput) (inventory.Inventory, error) {
   160  	pkgs, err := e.extractFromInput(ctx, input)
   161  	if e.stats != nil {
   162  		var fileSizeBytes int64
   163  		if input.Info != nil {
   164  			fileSizeBytes = input.Info.Size()
   165  		}
   166  		e.stats.AfterFileExtracted(e.Name(), &stats.FileExtractedStats{
   167  			Path:          input.Path,
   168  			Result:        filesystem.ExtractorErrorToFileExtractedResult(err),
   169  			FileSizeBytes: fileSizeBytes,
   170  		})
   171  	}
   172  	return inventory.Inventory{Packages: pkgs}, err
   173  }
   174  
   175  func (e Extractor) extractFromInput(ctx context.Context, input *filesystem.ScanInput) ([]*extractor.Package, error) {
   176  	m, err := osrelease.GetOSRelease(input.FS)
   177  	if err != nil {
   178  		log.Errorf("osrelease.ParseOsRelease(): %v", err)
   179  	}
   180  
   181  	rd := textproto.NewReader(bufio.NewReader(input.Reader))
   182  	pkgs := []*extractor.Package{}
   183  	for eof := false; !eof; {
   184  		// Return if canceled or exceeding deadline.
   185  		if err := ctx.Err(); err != nil {
   186  			return pkgs, fmt.Errorf("%s halted due to context error: %w", e.Name(), err)
   187  		}
   188  
   189  		h, err := rd.ReadMIMEHeader()
   190  		if err != nil {
   191  			if errors.Is(err, io.EOF) {
   192  				// We might still have one more line of data
   193  				// so return only after it's been parsed.
   194  				eof = true
   195  			} else {
   196  				if strings.Contains(input.Path, "status.d") {
   197  					log.Warnf("Failed to read MIME header from %q: %v", input.Path, err)
   198  					return []*extractor.Package{}, nil
   199  				}
   200  				return pkgs, err
   201  			}
   202  		}
   203  
   204  		// Skip empty lines
   205  		if len(h) == 0 {
   206  			continue
   207  		}
   208  
   209  		// Distroless distributions have their packages in status.d, which does not contain the Status
   210  		// value.
   211  		if !e.includeNotInstalled && (!strings.Contains(input.Path, "status.d") || h.Get("Status") != "") {
   212  			if h.Get("Status") == "" {
   213  				log.Warnf("Package %q has no status field", h.Get("Package"))
   214  				continue
   215  			}
   216  			installed, err := statusInstalled(h.Get("Status"))
   217  			if err != nil {
   218  				return pkgs, fmt.Errorf("statusInstalled(%q): %w", h.Get("Status"), err)
   219  			}
   220  			if !installed {
   221  				continue
   222  			}
   223  		}
   224  
   225  		pkgName := h.Get("Package")
   226  		pkgVersion := h.Get("Version")
   227  		if pkgName == "" || pkgVersion == "" {
   228  			if !eof { // Expected when reaching the last line.
   229  				log.Warnf("DPKG package name or version is empty (name: %q, version: %q)", pkgName, pkgVersion)
   230  			}
   231  			continue
   232  		}
   233  
   234  		description := strings.ToLower(h.Get("Description"))
   235  		var vexes []*vex.PackageExploitabilitySignal
   236  		if strings.Contains(description, "transitional package") ||
   237  			strings.Contains(description, "transitional dummy package") ||
   238  			strings.Contains(description, "transitional empty package") {
   239  			vexes = append(vexes, &vex.PackageExploitabilitySignal{
   240  				Plugin:          Name,
   241  				Justification:   vex.ComponentNotPresent,
   242  				MatchesAllVulns: true,
   243  			})
   244  		}
   245  
   246  		purlType := purl.TypeDebian
   247  		if input.Path == "usr/lib/opkg/status" {
   248  			purlType = purl.TypeOpkg
   249  		}
   250  
   251  		p := &extractor.Package{
   252  			Name:     pkgName,
   253  			Version:  pkgVersion,
   254  			PURLType: purlType,
   255  			Metadata: &dpkgmeta.Metadata{
   256  				PackageName:       pkgName,
   257  				PackageVersion:    pkgVersion,
   258  				Status:            h.Get("Status"),
   259  				OSID:              m["ID"],
   260  				OSVersionCodename: m["VERSION_CODENAME"],
   261  				OSVersionID:       m["VERSION_ID"],
   262  				Maintainer:        h.Get("Maintainer"),
   263  				Architecture:      h.Get("Architecture"),
   264  			},
   265  			Locations:             []string{input.Path},
   266  			ExploitabilitySignals: vexes,
   267  		}
   268  		sourceName, sourceVersion, err := parseSourceNameVersion(h.Get("Source"))
   269  		if err != nil {
   270  			return pkgs, fmt.Errorf("parseSourceNameVersion(%q): %w", h.Get("Source"), err)
   271  		}
   272  		if sourceName != "" {
   273  			p.Metadata.(*dpkgmeta.Metadata).SourceName = sourceName
   274  			p.Metadata.(*dpkgmeta.Metadata).SourceVersion = sourceVersion
   275  		}
   276  
   277  		pkgs = append(pkgs, p)
   278  	}
   279  	return pkgs, nil
   280  }
   281  
   282  func statusInstalled(status string) (bool, error) {
   283  	// Status field format: "want flag status", e.g. "install ok installed"
   284  	// The package is currently installed if the status field is set to installed.
   285  	// Other fields just show the intent of the package manager but not the current state.
   286  	parts := strings.Split(status, " ")
   287  	if len(parts) != 3 {
   288  		return false, fmt.Errorf("invalid DPKG Status field %q", status)
   289  	}
   290  	return parts[2] == "installed", nil
   291  }
   292  
   293  func parseSourceNameVersion(source string) (string, string, error) {
   294  	if source == "" {
   295  		return "", "", nil
   296  	}
   297  	// Format is either "name" or "name (version)"
   298  	if idx := strings.Index(source, " ("); idx != -1 {
   299  		if !strings.HasSuffix(source, ")") {
   300  			return "", "", fmt.Errorf("invalid DPKG Source field: %q", source)
   301  		}
   302  		n := source[:idx]
   303  		v := source[idx+2 : len(source)-1]
   304  		return n, v, nil
   305  	}
   306  	return source, "", nil
   307  }