github.com/anchore/syft@v1.38.2/syft/pkg/cataloger/debian/parse_deb_archive.go (about)

     1  package debian
     2  
     3  import (
     4  	"archive/tar"
     5  	"bytes"
     6  	"context"
     7  	"fmt"
     8  	"io"
     9  	"path/filepath"
    10  	"regexp"
    11  	"strings"
    12  
    13  	"github.com/blakesmith/ar"
    14  	"github.com/mholt/archives"
    15  
    16  	"github.com/anchore/syft/internal"
    17  	"github.com/anchore/syft/internal/unknown"
    18  	"github.com/anchore/syft/syft/artifact"
    19  	"github.com/anchore/syft/syft/file"
    20  	"github.com/anchore/syft/syft/pkg"
    21  	"github.com/anchore/syft/syft/pkg/cataloger/generic"
    22  )
    23  
    24  // parseDebArchive parses a Debian package archive (.deb) file and returns the packages it contains.
    25  // A .deb file is an ar archive containing three main files:
    26  // - debian-binary: Version of the .deb format (usually "2.0")
    27  // - control.tar.gz/xz/zst: Contains package metadata (control file, md5sums, conffiles)
    28  // - data.tar.gz/xz/zst: Contains the actual files to be installed (not processed by this cataloger)
    29  //
    30  // This function extracts and processes the control information to create package metadata.
    31  func parseDebArchive(ctx context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
    32  	arReader := ar.NewReader(reader)
    33  
    34  	var metadata *pkg.DpkgArchiveEntry
    35  	var licenses []string
    36  	var unknownErr error
    37  	for {
    38  		header, err := arReader.Next()
    39  		if err == io.EOF {
    40  			break
    41  		}
    42  		if err != nil {
    43  			return nil, nil, fmt.Errorf("failed to read ar header: %w", err)
    44  		}
    45  
    46  		switch {
    47  		case strings.HasPrefix(header.Name, "control.tar"):
    48  			// Decompress the control.tar.* file
    49  			dcReader, err := decompressionStream(ctx, arReader, header.Name)
    50  			if err != nil {
    51  				return nil, nil, unknown.New(reader.Location, fmt.Errorf("failed to decompress control.tar.* file: %w", err))
    52  			}
    53  			metadata, err = processControlTar(dcReader)
    54  			if err != nil {
    55  				return nil, nil, unknown.New(reader.Location, fmt.Errorf("failed to process control.tar.* file: %w", err))
    56  			}
    57  		case strings.HasPrefix(header.Name, "data.tar"):
    58  			// Decompress the data.tar.* file
    59  			dcReader, err := decompressionStream(ctx, arReader, header.Name)
    60  			if err != nil {
    61  				return nil, nil, unknown.New(reader.Location, fmt.Errorf("failed to decompress data.tar.* file: %w", err))
    62  			}
    63  			licenses, err = processDataTar(dcReader)
    64  			if err != nil {
    65  				unknownErr = unknown.Append(unknownErr, reader.Location, fmt.Errorf("failed to process data.tar.* file: %w", err))
    66  			}
    67  		}
    68  	}
    69  
    70  	if metadata == nil {
    71  		return nil, nil, unknown.New(reader.Location, fmt.Errorf("no application found described in .dpkg archive"))
    72  	}
    73  
    74  	return []pkg.Package{
    75  		newDebArchivePackage(ctx, reader.Location, *metadata, licenses),
    76  	}, nil, nil
    77  }
    78  
    79  // this is the pattern you'd expect to see in a tar header for a debian package license file ()
    80  var archiveHeaderLicensePathPattern = regexp.MustCompile(`^\.?/usr/share/doc/[^/]+/copyright$`)
    81  
    82  func processDataTar(dcReader io.ReadCloser) ([]string, error) {
    83  	defer internal.CloseAndLogError(dcReader, "")
    84  	var licenses []string
    85  
    86  	tarReader := tar.NewReader(dcReader)
    87  	for {
    88  		header, err := tarReader.Next()
    89  		if err == io.EOF {
    90  			break
    91  		}
    92  		if err != nil {
    93  			return licenses, err
    94  		}
    95  
    96  		// look for /usr/share/docs/*/copyright files, parse each one for license claims
    97  		// TODO: in the future we can add archive sub indexes to the locations to see where within
    98  		// the dpkg archive the license was found
    99  		if archiveHeaderLicensePathPattern.MatchString(header.Name) {
   100  			licenses = append(licenses, parseLicensesFromCopyright(tarReader)...)
   101  		}
   102  	}
   103  
   104  	return licenses, nil
   105  }
   106  
   107  func processControlTar(dcReader io.ReadCloser) (*pkg.DpkgArchiveEntry, error) {
   108  	defer internal.CloseAndLogError(dcReader, "")
   109  
   110  	// Extract control, md5sums, and conffiles files from control.tar
   111  	tarReader := tar.NewReader(dcReader)
   112  	controlFileContent, md5Content, confContent, err := readControlFiles(tarReader)
   113  	if err != nil {
   114  		return nil, fmt.Errorf("failed to read control files: %w", err)
   115  	}
   116  
   117  	if controlFileContent == nil {
   118  		return nil, fmt.Errorf("control file not found in archive")
   119  	}
   120  
   121  	metadata, err := newDpkgArchiveMetadata(controlFileContent, md5Content, confContent)
   122  	if err != nil {
   123  		return nil, fmt.Errorf("failed to create package metadata: %w", err)
   124  	}
   125  
   126  	return &metadata, nil
   127  }
   128  
   129  func newDpkgArchiveMetadata(controlFile, md5sums, confFiles []byte) (pkg.DpkgArchiveEntry, error) {
   130  	// parse the control file to get package metadata
   131  	metadata, err := parseControlFile(string(controlFile))
   132  	if err != nil {
   133  		return pkg.DpkgArchiveEntry{}, fmt.Errorf("failed to parse control file: %w", err)
   134  	}
   135  
   136  	// parse MD5 sums to get file records
   137  	var files []pkg.DpkgFileRecord
   138  	if len(md5sums) > 0 {
   139  		files = parseDpkgMD5Info(bytes.NewReader(md5sums))
   140  	}
   141  
   142  	// mark config files
   143  	if len(confFiles) > 0 {
   144  		markConfigFiles(confFiles, files)
   145  	}
   146  
   147  	metadata.Files = files
   148  	return metadata, nil
   149  }
   150  
   151  func decompressionStream(ctx context.Context, r io.Reader, filePath string) (io.ReadCloser, error) {
   152  	format, stream, err := archives.Identify(ctx, filePath, r)
   153  	if err != nil {
   154  		return nil, fmt.Errorf("failed to identify compression format: %w", err)
   155  	}
   156  
   157  	decompressor, ok := format.(archives.Decompressor)
   158  	if !ok {
   159  		return nil, fmt.Errorf("file format does not support decompression: %s", filePath)
   160  	}
   161  
   162  	rc, err := decompressor.OpenReader(stream)
   163  	if err != nil {
   164  		return nil, fmt.Errorf("failed to create decompression reader: %w", err)
   165  	}
   166  
   167  	return rc, nil
   168  }
   169  
   170  // readControlFiles extracts important files from the control.tar archive
   171  func readControlFiles(tarReader *tar.Reader) (controlFile, md5sums, conffiles []byte, err error) {
   172  	for {
   173  		header, err := tarReader.Next()
   174  		if err == io.EOF {
   175  			break
   176  		}
   177  		if err != nil {
   178  			return nil, nil, nil, err
   179  		}
   180  
   181  		switch filepath.Base(header.Name) {
   182  		case "control":
   183  			controlFile, err = io.ReadAll(tarReader)
   184  			if err != nil {
   185  				return nil, nil, nil, err
   186  			}
   187  		case "md5sums":
   188  			md5sums, err = io.ReadAll(tarReader)
   189  			if err != nil {
   190  				return nil, nil, nil, err
   191  			}
   192  		case "conffiles":
   193  			conffiles, err = io.ReadAll(tarReader)
   194  			if err != nil {
   195  				return nil, nil, nil, err
   196  			}
   197  		}
   198  	}
   199  
   200  	return controlFile, md5sums, conffiles, nil
   201  }
   202  
   203  // parseControlFile parses the content of a debian control file into package metadata
   204  func parseControlFile(controlFileContent string) (pkg.DpkgArchiveEntry, error) {
   205  	// Reuse the existing dpkg status file parsing logic
   206  	reader := strings.NewReader(controlFileContent)
   207  
   208  	entries, err := parseDpkgStatus(reader)
   209  	if err != nil {
   210  		return pkg.DpkgArchiveEntry{}, fmt.Errorf("failed to parse control file: %w", err)
   211  	}
   212  
   213  	if len(entries) == 0 {
   214  		return pkg.DpkgArchiveEntry{}, fmt.Errorf("no package entries found in control file")
   215  	}
   216  
   217  	// We expect only one entry from a .deb control file
   218  	return pkg.DpkgArchiveEntry(entries[0]), nil
   219  }
   220  
   221  // markConfigFiles marks files that are listed in conffiles as configuration files
   222  func markConfigFiles(conffilesContent []byte, files []pkg.DpkgFileRecord) {
   223  	// Parse the conffiles content into DpkgFileRecord entries
   224  	confFiles := parseDpkgConffileInfo(bytes.NewReader(conffilesContent))
   225  
   226  	// Create a map for quick lookup of config files by path
   227  	configPathMap := make(map[string]struct{})
   228  	for _, confFile := range confFiles {
   229  		configPathMap[confFile.Path] = struct{}{}
   230  	}
   231  
   232  	// Mark files as config files if they're in the conffiles list
   233  	for i := range files {
   234  		if _, exists := configPathMap[files[i].Path]; exists {
   235  			files[i].IsConfigFile = true
   236  		}
   237  	}
   238  }