github.com/anchore/syft@v1.38.2/syft/pkg/cataloger/debian/parse_deb_archive.go (about) 1 package debian 2 3 import ( 4 "archive/tar" 5 "bytes" 6 "context" 7 "fmt" 8 "io" 9 "path/filepath" 10 "regexp" 11 "strings" 12 13 "github.com/blakesmith/ar" 14 "github.com/mholt/archives" 15 16 "github.com/anchore/syft/internal" 17 "github.com/anchore/syft/internal/unknown" 18 "github.com/anchore/syft/syft/artifact" 19 "github.com/anchore/syft/syft/file" 20 "github.com/anchore/syft/syft/pkg" 21 "github.com/anchore/syft/syft/pkg/cataloger/generic" 22 ) 23 24 // parseDebArchive parses a Debian package archive (.deb) file and returns the packages it contains. 25 // A .deb file is an ar archive containing three main files: 26 // - debian-binary: Version of the .deb format (usually "2.0") 27 // - control.tar.gz/xz/zst: Contains package metadata (control file, md5sums, conffiles) 28 // - data.tar.gz/xz/zst: Contains the actual files to be installed (not processed by this cataloger) 29 // 30 // This function extracts and processes the control information to create package metadata. 31 func parseDebArchive(ctx context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { 32 arReader := ar.NewReader(reader) 33 34 var metadata *pkg.DpkgArchiveEntry 35 var licenses []string 36 var unknownErr error 37 for { 38 header, err := arReader.Next() 39 if err == io.EOF { 40 break 41 } 42 if err != nil { 43 return nil, nil, fmt.Errorf("failed to read ar header: %w", err) 44 } 45 46 switch { 47 case strings.HasPrefix(header.Name, "control.tar"): 48 // Decompress the control.tar.* file 49 dcReader, err := decompressionStream(ctx, arReader, header.Name) 50 if err != nil { 51 return nil, nil, unknown.New(reader.Location, fmt.Errorf("failed to decompress control.tar.* file: %w", err)) 52 } 53 metadata, err = processControlTar(dcReader) 54 if err != nil { 55 return nil, nil, unknown.New(reader.Location, fmt.Errorf("failed to process control.tar.* file: %w", err)) 56 } 57 case strings.HasPrefix(header.Name, "data.tar"): 58 // Decompress the data.tar.* file 59 dcReader, err := decompressionStream(ctx, arReader, header.Name) 60 if err != nil { 61 return nil, nil, unknown.New(reader.Location, fmt.Errorf("failed to decompress data.tar.* file: %w", err)) 62 } 63 licenses, err = processDataTar(dcReader) 64 if err != nil { 65 unknownErr = unknown.Append(unknownErr, reader.Location, fmt.Errorf("failed to process data.tar.* file: %w", err)) 66 } 67 } 68 } 69 70 if metadata == nil { 71 return nil, nil, unknown.New(reader.Location, fmt.Errorf("no application found described in .dpkg archive")) 72 } 73 74 return []pkg.Package{ 75 newDebArchivePackage(ctx, reader.Location, *metadata, licenses), 76 }, nil, nil 77 } 78 79 // this is the pattern you'd expect to see in a tar header for a debian package license file () 80 var archiveHeaderLicensePathPattern = regexp.MustCompile(`^\.?/usr/share/doc/[^/]+/copyright$`) 81 82 func processDataTar(dcReader io.ReadCloser) ([]string, error) { 83 defer internal.CloseAndLogError(dcReader, "") 84 var licenses []string 85 86 tarReader := tar.NewReader(dcReader) 87 for { 88 header, err := tarReader.Next() 89 if err == io.EOF { 90 break 91 } 92 if err != nil { 93 return licenses, err 94 } 95 96 // look for /usr/share/docs/*/copyright files, parse each one for license claims 97 // TODO: in the future we can add archive sub indexes to the locations to see where within 98 // the dpkg archive the license was found 99 if archiveHeaderLicensePathPattern.MatchString(header.Name) { 100 licenses = append(licenses, parseLicensesFromCopyright(tarReader)...) 101 } 102 } 103 104 return licenses, nil 105 } 106 107 func processControlTar(dcReader io.ReadCloser) (*pkg.DpkgArchiveEntry, error) { 108 defer internal.CloseAndLogError(dcReader, "") 109 110 // Extract control, md5sums, and conffiles files from control.tar 111 tarReader := tar.NewReader(dcReader) 112 controlFileContent, md5Content, confContent, err := readControlFiles(tarReader) 113 if err != nil { 114 return nil, fmt.Errorf("failed to read control files: %w", err) 115 } 116 117 if controlFileContent == nil { 118 return nil, fmt.Errorf("control file not found in archive") 119 } 120 121 metadata, err := newDpkgArchiveMetadata(controlFileContent, md5Content, confContent) 122 if err != nil { 123 return nil, fmt.Errorf("failed to create package metadata: %w", err) 124 } 125 126 return &metadata, nil 127 } 128 129 func newDpkgArchiveMetadata(controlFile, md5sums, confFiles []byte) (pkg.DpkgArchiveEntry, error) { 130 // parse the control file to get package metadata 131 metadata, err := parseControlFile(string(controlFile)) 132 if err != nil { 133 return pkg.DpkgArchiveEntry{}, fmt.Errorf("failed to parse control file: %w", err) 134 } 135 136 // parse MD5 sums to get file records 137 var files []pkg.DpkgFileRecord 138 if len(md5sums) > 0 { 139 files = parseDpkgMD5Info(bytes.NewReader(md5sums)) 140 } 141 142 // mark config files 143 if len(confFiles) > 0 { 144 markConfigFiles(confFiles, files) 145 } 146 147 metadata.Files = files 148 return metadata, nil 149 } 150 151 func decompressionStream(ctx context.Context, r io.Reader, filePath string) (io.ReadCloser, error) { 152 format, stream, err := archives.Identify(ctx, filePath, r) 153 if err != nil { 154 return nil, fmt.Errorf("failed to identify compression format: %w", err) 155 } 156 157 decompressor, ok := format.(archives.Decompressor) 158 if !ok { 159 return nil, fmt.Errorf("file format does not support decompression: %s", filePath) 160 } 161 162 rc, err := decompressor.OpenReader(stream) 163 if err != nil { 164 return nil, fmt.Errorf("failed to create decompression reader: %w", err) 165 } 166 167 return rc, nil 168 } 169 170 // readControlFiles extracts important files from the control.tar archive 171 func readControlFiles(tarReader *tar.Reader) (controlFile, md5sums, conffiles []byte, err error) { 172 for { 173 header, err := tarReader.Next() 174 if err == io.EOF { 175 break 176 } 177 if err != nil { 178 return nil, nil, nil, err 179 } 180 181 switch filepath.Base(header.Name) { 182 case "control": 183 controlFile, err = io.ReadAll(tarReader) 184 if err != nil { 185 return nil, nil, nil, err 186 } 187 case "md5sums": 188 md5sums, err = io.ReadAll(tarReader) 189 if err != nil { 190 return nil, nil, nil, err 191 } 192 case "conffiles": 193 conffiles, err = io.ReadAll(tarReader) 194 if err != nil { 195 return nil, nil, nil, err 196 } 197 } 198 } 199 200 return controlFile, md5sums, conffiles, nil 201 } 202 203 // parseControlFile parses the content of a debian control file into package metadata 204 func parseControlFile(controlFileContent string) (pkg.DpkgArchiveEntry, error) { 205 // Reuse the existing dpkg status file parsing logic 206 reader := strings.NewReader(controlFileContent) 207 208 entries, err := parseDpkgStatus(reader) 209 if err != nil { 210 return pkg.DpkgArchiveEntry{}, fmt.Errorf("failed to parse control file: %w", err) 211 } 212 213 if len(entries) == 0 { 214 return pkg.DpkgArchiveEntry{}, fmt.Errorf("no package entries found in control file") 215 } 216 217 // We expect only one entry from a .deb control file 218 return pkg.DpkgArchiveEntry(entries[0]), nil 219 } 220 221 // markConfigFiles marks files that are listed in conffiles as configuration files 222 func markConfigFiles(conffilesContent []byte, files []pkg.DpkgFileRecord) { 223 // Parse the conffiles content into DpkgFileRecord entries 224 confFiles := parseDpkgConffileInfo(bytes.NewReader(conffilesContent)) 225 226 // Create a map for quick lookup of config files by path 227 configPathMap := make(map[string]struct{}) 228 for _, confFile := range confFiles { 229 configPathMap[confFile.Path] = struct{}{} 230 } 231 232 // Mark files as config files if they're in the conffiles list 233 for i := range files { 234 if _, exists := configPathMap[files[i].Path]; exists { 235 files[i].IsConfigFile = true 236 } 237 } 238 }