github.com/quay/claircore@v1.5.28/python/packagescanner.go (about)

     1  // Package python contains components for interrogating python packages in
     2  // container layers.
     3  package python
     4  
     5  import (
     6  	"bufio"
     7  	"bytes"
     8  	"context"
     9  	"fmt"
    10  	"io/fs"
    11  	"net/textproto"
    12  	"path"
    13  	"path/filepath"
    14  	"runtime/trace"
    15  	"strings"
    16  
    17  	"github.com/quay/zlog"
    18  
    19  	"github.com/quay/claircore"
    20  	"github.com/quay/claircore/indexer"
    21  	"github.com/quay/claircore/pkg/pep440"
    22  )
    23  
    24  var (
    25  	_ indexer.VersionedScanner = (*Scanner)(nil)
    26  	_ indexer.PackageScanner   = (*Scanner)(nil)
    27  
    28  	Repository = claircore.Repository{
    29  		Name: "pypi",
    30  		URI:  "https://pypi.org/simple",
    31  	}
    32  )
    33  
    34  // Scanner implements the scanner.PackageScanner interface.
    35  //
    36  // It looks for directories that seem like wheels or eggs, and looks at the
    37  // metadata recorded there. This type attempts to follow the specs documented by
    38  // the [PyPA], with the newer PEPs being preferred.
    39  //
    40  // The zero value is ready to use.
    41  //
    42  // [PyPA]: https://packaging.python.org/en/latest/specifications/recording-installed-packages/
    43  type Scanner struct{}
    44  
    45  // Name implements scanner.VersionedScanner.
    46  func (*Scanner) Name() string { return "python" }
    47  
    48  // Version implements scanner.VersionedScanner.
    49  func (*Scanner) Version() string { return "4" }
    50  
    51  // Kind implements scanner.VersionedScanner.
    52  func (*Scanner) Kind() string { return "package" }
    53  
    54  // Scan attempts to find wheel or egg info directories and record the package
    55  // information there.
    56  //
    57  // A return of (nil, nil) is expected if there's nothing found.
    58  func (ps *Scanner) Scan(ctx context.Context, layer *claircore.Layer) ([]*claircore.Package, error) {
    59  	defer trace.StartRegion(ctx, "Scanner.Scan").End()
    60  	trace.Log(ctx, "layer", layer.Hash.String())
    61  	ctx = zlog.ContextWithValues(ctx,
    62  		"component", "python/Scanner.Scan",
    63  		"version", ps.Version(),
    64  		"layer", layer.Hash.String())
    65  	zlog.Debug(ctx).Msg("start")
    66  	defer zlog.Debug(ctx).Msg("done")
    67  	if err := ctx.Err(); err != nil {
    68  		return nil, err
    69  	}
    70  
    71  	sys, err := layer.FS()
    72  	if err != nil {
    73  		return nil, fmt.Errorf("python: unable to open layer: %w", err)
    74  	}
    75  
    76  	ms, err := findDeliciousEgg(ctx, sys)
    77  	if err != nil {
    78  		return nil, fmt.Errorf("python: failed to find delicious egg: %w", err)
    79  	}
    80  	var ret []*claircore.Package
    81  	for _, n := range ms {
    82  		b, err := fs.ReadFile(sys, n)
    83  		if err != nil {
    84  			return nil, fmt.Errorf("python: unable to read file: %w", err)
    85  		}
    86  		// The two files we read are in RFC8288 (email message) format, and the
    87  		// keys we care about are shared.
    88  		rd := textproto.NewReader(bufio.NewReader(bytes.NewReader(b)))
    89  		hdr, err := rd.ReadMIMEHeader()
    90  		if err != nil && hdr == nil {
    91  			zlog.Warn(ctx).
    92  				Err(err).
    93  				Str("path", n).
    94  				Msg("unable to read metadata, skipping")
    95  			continue
    96  		}
    97  		v, err := pep440.Parse(hdr.Get("Version"))
    98  		if err != nil {
    99  			zlog.Warn(ctx).
   100  				Err(err).
   101  				Str("path", n).
   102  				Msg("couldn't parse the version, skipping")
   103  			continue
   104  		}
   105  		pkgDB := filepath.Join(n, "..", "..")
   106  		// If the package is .egg-info format
   107  		// with just the .egg-info file,
   108  		// only go up one level.
   109  		if strings.HasSuffix(n, `.egg-info`) {
   110  			pkgDB = filepath.Join(n, "..")
   111  		}
   112  		ret = append(ret, &claircore.Package{
   113  			Name:              strings.ToLower(hdr.Get("Name")),
   114  			Version:           v.String(),
   115  			PackageDB:         "python:" + pkgDB,
   116  			Filepath:          n,
   117  			Kind:              claircore.BINARY,
   118  			NormalizedVersion: v.Version(),
   119  			// TODO Is there some way to pick up on where a wheel or egg was
   120  			// found?
   121  			RepositoryHint: "https://pypi.org/simple",
   122  		})
   123  	}
   124  	return ret, nil
   125  }
   126  
   127  // DefaultRepository implements [indexer.DefaultRepoScanner]
   128  func (Scanner) DefaultRepository(ctx context.Context) *claircore.Repository {
   129  	return &Repository
   130  }
   131  
   132  // findDeliciousEgg finds eggs and wheels.
   133  //
   134  // Three formats are supported at this time:
   135  //
   136  // * .egg      - only when .egg is a directory. .egg as a zipfile is not supported at this time.
   137  // * .egg-info - both as a standalone file and a directory which contains PKG-INFO.
   138  // * wheel     - only .dist-info/METADATA is supported.
   139  //
   140  // See https://setuptools.pypa.io/en/latest/deprecated/python_eggs.html for more information about Python Eggs
   141  // and https://peps.python.org/pep-0427/ for more information about Wheel.
   142  func findDeliciousEgg(ctx context.Context, sys fs.FS) (out []string, err error) {
   143  	// Is this layer an rpm layer?
   144  	//
   145  	// If so, files in the disto-managed directory can be skipped.
   146  	var rpm bool
   147  	for _, p := range []string{
   148  		"var/lib/rpm/Packages",
   149  		"var/lib/rpm/rpmdb.sqlite",
   150  		"var/lib/rpm/Packages.db",
   151  	} {
   152  		if fi, err := fs.Stat(sys, p); err == nil && fi.Mode().IsRegular() {
   153  			rpm = true
   154  			break
   155  		}
   156  	}
   157  	// Is this layer a dpkg layer?
   158  	var dpkg bool
   159  	if fi, err := fs.Stat(sys, `var/lib/dpkg/status`); err == nil && fi.Mode().IsRegular() {
   160  		dpkg = true
   161  	}
   162  
   163  	return out, fs.WalkDir(sys, ".", func(p string, d fs.DirEntry, err error) error {
   164  		ev := zlog.Debug(ctx).
   165  			Str("file", p)
   166  		var success bool
   167  		defer func() {
   168  			if !success {
   169  				ev.Discard().Send()
   170  			}
   171  		}()
   172  		switch {
   173  		case err != nil:
   174  			return err
   175  		case (rpm || dpkg) && d.Type().IsDir():
   176  			// Skip one level up from the "packages" directory so the walk also
   177  			// skips the standard library.
   178  			var pat string
   179  			switch {
   180  			case rpm:
   181  				pat = `usr/lib*/python[23].*`
   182  				ev = ev.Bool("rpm_dir", true)
   183  			case dpkg:
   184  				pat = `usr/lib*/python[23]`
   185  				ev = ev.Bool("dpkg_dir", true)
   186  			default:
   187  				panic("programmer error: unreachable")
   188  			}
   189  			if m, _ := path.Match(pat, p); m {
   190  				ev.Msg("skipping directory")
   191  				return fs.SkipDir
   192  			}
   193  			fallthrough
   194  		case !d.Type().IsRegular():
   195  			// Should we chase symlinks with the correct name?
   196  			return nil
   197  		case strings.HasPrefix(filepath.Base(p), ".wh."):
   198  			return nil
   199  		case strings.HasSuffix(p, `.egg/EGG-INFO/PKG-INFO`):
   200  			ev = ev.Str("kind", ".egg")
   201  		case strings.HasSuffix(p, `.egg-info`):
   202  			fallthrough
   203  		case strings.HasSuffix(p, `.egg-info/PKG-INFO`):
   204  			ev = ev.Str("kind", ".egg-info")
   205  		case strings.HasSuffix(p, `.dist-info/METADATA`):
   206  			ev = ev.Str("kind", "wheel")
   207  			// See if we can discern the installer.
   208  			var installer string
   209  			ip := path.Join(path.Dir(p), `INSTALLER`)
   210  			if ic, err := fs.ReadFile(sys, ip); err == nil {
   211  				installer = string(bytes.TrimSpace(ic))
   212  				ev = ev.Str("installer", installer)
   213  			}
   214  			if _, ok := blocklist[installer]; ok {
   215  				ev.Msg("skipping package")
   216  				return nil
   217  			}
   218  		default:
   219  			return nil
   220  		}
   221  		ev.Msg("found package")
   222  		success = true
   223  		out = append(out, p)
   224  		return nil
   225  	})
   226  }
   227  
   228  // Blocklist of installers to ignore.
   229  //
   230  // Currently, rpm is the only known package manager that actually populates this
   231  // information.
   232  var blocklist = map[string]struct{}{
   233  	"rpm":  {},
   234  	"dpkg": {},
   235  	"apk":  {},
   236  }