github.com/quay/claircore@v1.5.28/java/packagescanner.go (about)

     1  // Package java contains components for interrogating java packages in
     2  // container layers.
     3  package java
     4  
     5  import (
     6  	"archive/zip"
     7  	"bytes"
     8  	"context"
     9  	"crypto/sha1"
    10  	"encoding/json"
    11  	"errors"
    12  	"fmt"
    13  	"io"
    14  	"net/http"
    15  	"net/url"
    16  	"runtime/trace"
    17  	"sort"
    18  	"strconv"
    19  	"strings"
    20  	"time"
    21  
    22  	"github.com/quay/zlog"
    23  
    24  	"github.com/quay/claircore"
    25  	"github.com/quay/claircore/indexer"
    26  	"github.com/quay/claircore/java/jar"
    27  )
    28  
var (
	// Compile-time assertions that *Scanner satisfies each indexer
	// interface it is meant to implement.
	_ indexer.VersionedScanner   = (*Scanner)(nil)
	_ indexer.PackageScanner     = (*Scanner)(nil)
	_ indexer.RPCScanner         = (*Scanner)(nil)
	_ indexer.DefaultRepoScanner = (*Scanner)(nil)

	// Repository is the maven repository that Scanner.DefaultRepository
	// returns a pointer to.
	Repository = claircore.Repository{
		Name: "maven",
		URI:  "https://repo1.maven.apache.org/maven2",
	}
)
    40  
// DefaultSearchAPI is a maven-like REST API that may be used to do
// reverse lookups based on an archive's sha1 sum.
//
//doc:url indexer
const DefaultSearchAPI = `https://search.maven.org/solrsearch/select`

// DefaultRequestTimeout is the timeout applied to each search API request
// when ScannerConfig.APIRequestTimeout is left at zero.
const DefaultRequestTimeout = 2 * time.Second
    47  
// ScannerConfig is the struct used to configure a Scanner.
//
// It is populated by the deserializer passed to Scanner.Configure.
type ScannerConfig struct {
	// DisableAPI disables the use of the API.
	DisableAPI bool `yaml:"disable_api" json:"disable_api"`
	// API is a URL endpoint to a maven-like REST API.
	// The default is DefaultSearchAPI.
	API               string        `yaml:"api" json:"api"`
	APIRequestTimeout time.Duration `yaml:"api_request_timeout" json:"api_request_timeout"` // Timeout for each API request; zero selects DefaultRequestTimeout.
}
    57  
// Scanner implements the indexer.PackageScanner interface.
//
// It looks for files that seem like jar, war or ear, and looks at the
// metadata recorded there.
//
// The zero value is ready to use. Without a call to Configure no search
// endpoint is set, so Scan relies solely on the metadata found in the
// archives.
type Scanner struct {
	client             *http.Client  // HTTP client handed to Configure; used for search requests.
	root               *url.URL      // Parsed search API endpoint; nil means Scan performs no searches.
	rootRequestTimeout time.Duration // Timeout applied to each individual search request.
}
    69  
    70  // Name implements scanner.VersionedScanner.
    71  func (*Scanner) Name() string { return "java" }
    72  
    73  // Version implements scanner.VersionedScanner.
    74  func (*Scanner) Version() string { return "6" }
    75  
    76  // Kind implements scanner.VersionedScanner.
    77  func (*Scanner) Kind() string { return "package" }
    78  
    79  // Configure implements indexer.RPCScanner.
    80  func (s *Scanner) Configure(ctx context.Context, f indexer.ConfigDeserializer, c *http.Client) error {
    81  	ctx = zlog.ContextWithValues(ctx,
    82  		"component", "java/Scanner.Configure",
    83  		"version", s.Version())
    84  	var cfg ScannerConfig
    85  	s.client = c
    86  	if err := f(&cfg); err != nil {
    87  		return err
    88  	}
    89  
    90  	if cfg.DisableAPI {
    91  		zlog.Debug(ctx).Msg("search API disabled")
    92  	} else {
    93  		api := DefaultSearchAPI
    94  		if cfg.API != "" {
    95  			api = cfg.API
    96  		}
    97  		requestTimeout := DefaultRequestTimeout
    98  		if cfg.APIRequestTimeout != 0 {
    99  			requestTimeout = cfg.APIRequestTimeout
   100  		}
   101  		s.rootRequestTimeout = requestTimeout
   102  		zlog.Debug(ctx).
   103  			Str("api", api).
   104  			Float64("requestTimeout", requestTimeout.Seconds()).
   105  			Msg("configured search API URL")
   106  		u, err := url.Parse(api)
   107  		if err != nil {
   108  			return err
   109  		}
   110  		s.root = u
   111  	}
   112  
   113  	return nil
   114  }
   115  
   116  // Scan attempts to find jar, war or ear files and record the package
   117  // information there.
   118  //
   119  // A return of (nil, nil) is expected if there's nothing found.
   120  func (s *Scanner) Scan(ctx context.Context, layer *claircore.Layer) ([]*claircore.Package, error) {
   121  	defer trace.StartRegion(ctx, "Scanner.Scan").End()
   122  	trace.Log(ctx, "layer", layer.Hash.String())
   123  	ctx = zlog.ContextWithValues(ctx,
   124  		"component", "java/Scanner.Scan",
   125  		"version", s.Version(),
   126  		"layer", layer.Hash.String())
   127  	zlog.Debug(ctx).Msg("start")
   128  	defer zlog.Debug(ctx).Msg("done")
   129  	if err := ctx.Err(); err != nil {
   130  		return nil, err
   131  	}
   132  	sys, err := layer.FS()
   133  	if err != nil {
   134  		return nil, fmt.Errorf("java: unable to open layer: %w", err)
   135  	}
   136  
   137  	ars, err := archives(ctx, sys)
   138  	if err != nil {
   139  		return nil, err
   140  	}
   141  	// All used in the loop below.
   142  	var ret []*claircore.Package
   143  	buf := getBuf()
   144  	sh := sha1.New()
   145  	ck := make([]byte, sha1.Size)
   146  	doSearch := s.root != nil
   147  	defer putBuf(buf)
   148  	for _, n := range ars {
   149  		ctx := zlog.ContextWithValues(ctx, "file", n)
   150  		sh.Reset()
   151  		buf.Reset()
   152  		// Calculate the SHA1 as it's buffered, since it may be needed for
   153  		// searching later.
   154  		f, err := sys.Open(n)
   155  		if err != nil {
   156  			return nil, err
   157  		}
   158  		fStat, err := f.Stat()
   159  		if err == nil {
   160  			buf.Grow(int(fStat.Size()))
   161  		}
   162  		sz, err := buf.ReadFrom(io.TeeReader(f, sh))
   163  		f.Close()
   164  		if err != nil {
   165  			return nil, err
   166  		}
   167  		zb := buf.Bytes()
   168  		if !bytes.Equal(zb[:4], jar.Header) {
   169  			// Has a reasonable size and name, but isn't really a zip.
   170  			zlog.Debug(ctx).Msg("not actually a jar: bad header")
   171  			continue
   172  		}
   173  		z, err := zip.NewReader(bytes.NewReader(zb), sz)
   174  		switch {
   175  		case errors.Is(err, nil):
   176  		case errors.Is(err, zip.ErrFormat):
   177  			zlog.Info(ctx).
   178  				Err(err).
   179  				Msg("not actually a jar: invalid zip")
   180  			continue
   181  		default:
   182  			return nil, err
   183  		}
   184  
   185  		infos, err := jar.Parse(ctx, n, z)
   186  		switch {
   187  		case err == nil:
   188  		case errors.Is(err, jar.ErrUnidentified) || errors.Is(err, jar.ErrNotAJar):
   189  			// If there's an error that's one of the "known" reasons (e.g. not a
   190  			// read error or a malformed file), just log it and continue on.
   191  			zlog.Info(ctx).
   192  				AnErr("reason", err).
   193  				Msg("skipping jar")
   194  			continue
   195  		default:
   196  			return nil, err
   197  		}
   198  		sh.Sum(ck[:0])
   199  		ps := make([]*claircore.Package, len(infos))
   200  		for j := range infos {
   201  			i := &infos[j]
   202  			// If we discovered a pom file, don't bother talking to the network.
   203  			// If not, talk to the network if configured to do so.
   204  			if !strings.HasSuffix(i.Source, "pom.properties") && doSearch {
   205  				switch err := s.search(ctx, i, ck); {
   206  				case errors.Is(err, nil): // OK
   207  				case errors.Is(err, errRPC):
   208  				// BUG(hank) There's no way for a scanner that makes RPC calls
   209  				// to signal "the call failed, these are best-effort results,
   210  				// and please retry."
   211  				default:
   212  					return nil, err
   213  				}
   214  			}
   215  
   216  			var pkg claircore.Package
   217  			pkg.Name = i.Name
   218  			pkg.Version = i.Version
   219  			pkg.Kind = claircore.BINARY
   220  			pkg.Filepath = n
   221  			b := ck
   222  			if len(i.SHA) != 0 {
   223  				b = i.SHA
   224  			}
   225  			pkg.RepositoryHint = fmt.Sprintf(`sha1:%40x`, b)
   226  			// BUG(hank) There's probably some bugs lurking in the jar.Info →
   227  			// claircore.Package mapping code around embedded jars. There's a
   228  			// testcase to be written, there.
   229  
   230  			// Only examine the last element of the source list:
   231  			js := strings.Split(i.Source, ":")
   232  			switch l := js[len(js)-1]; {
   233  			case strings.HasSuffix(l, "pom.properties"):
   234  				fallthrough
   235  			case s.root != nil && i.Source == s.root.String():
   236  				// Populate as a maven artifact.
   237  				pkg.PackageDB = `maven:` + n
   238  			case l == "META-INF/MANIFEST.MF":
   239  				// information pulled from a manifest file
   240  				pkg.PackageDB = `jar:` + n
   241  			case l == ".":
   242  				// Name guess.
   243  				pkg.PackageDB = `file:` + n
   244  			default:
   245  				return nil, fmt.Errorf("java: martian Info: %+v", i)
   246  			}
   247  			ps[j] = &pkg
   248  		}
   249  		ret = append(ret, ps...)
   250  	}
   251  	return ret, nil
   252  }
   253  
   254  // DefaultRepository implements [indexer.DefaultRepoScanner].
   255  func (Scanner) DefaultRepository(ctx context.Context) *claircore.Repository {
   256  	return &Repository
   257  }
   258  
   259  // Search attempts to search with the configured client and API endpoint.
   260  //
   261  // This function modifies the passed Info in-place if successful. The passed
   262  // byte slice should be a SHA1 sum of the jar. It is used if the "SHA" member of
   263  // the Info is not populated.
   264  //
   265  // ErrRPC is reported if anything went wrong making the request or reading the
   266  // response.
   267  func (s *Scanner) search(ctx context.Context, i *jar.Info, ck []byte) error {
   268  	if i.SHA != nil {
   269  		ck = i.SHA
   270  	}
   271  	success := false
   272  	defer func() {
   273  		searchCounter.WithLabelValues(strconv.FormatBool(success)).Inc()
   274  	}()
   275  	tctx, done := context.WithTimeout(ctx, s.rootRequestTimeout)
   276  	defer done()
   277  	req, err := http.NewRequestWithContext(tctx, http.MethodGet, s.root.String(), nil)
   278  	if err != nil {
   279  		zlog.Warn(ctx).
   280  			Err(err).
   281  			Msg("unable to construct request")
   282  		return errRPC
   283  	}
   284  	v := req.URL.Query()
   285  	// 40 == 2 * sha1.Size. I don't there's a good way to keep it as
   286  	// a constant.
   287  	v.Set("q", fmt.Sprintf(`1:"%40x"`, ck))
   288  	v.Set("wt", "json")
   289  	req.URL.RawQuery = v.Encode()
   290  	res, err := s.client.Do(req)
   291  	if err != nil {
   292  		zlog.Warn(ctx).
   293  			Err(err).
   294  			Msg("error making request")
   295  		return errRPC
   296  	}
   297  	if res.StatusCode != http.StatusOK {
   298  		res.Body.Close()
   299  		zlog.Warn(ctx).
   300  			Str("status", res.Status).
   301  			Msg("unexpected response status")
   302  		return errRPC
   303  	}
   304  	var sr searchResponse
   305  	err = json.NewDecoder(res.Body).Decode(&sr)
   306  	res.Body.Close()
   307  	if err != nil {
   308  		zlog.Warn(ctx).
   309  			Err(err).
   310  			Msg("error decoding json")
   311  		return errRPC
   312  	}
   313  	success = true
   314  	if len(sr.Response.Doc) == 0 {
   315  		zlog.Debug(ctx).Msg("no matching artifacts found")
   316  		return nil
   317  	}
   318  	// Sort and then take the first one, because apparently the same
   319  	// artifact is uploaded under different names sometimes?
   320  	sort.SliceStable(sr.Response.Doc, func(i, j int) bool {
   321  		return sr.Response.Doc[i].ID < sr.Response.Doc[j].ID
   322  	})
   323  	i.Source = s.root.String()
   324  	d := &sr.Response.Doc[0]
   325  	i.Version = d.Version
   326  	i.Name = d.Group + ":" + d.Artifact
   327  	return nil
   328  }
   329  
// errRPC is the sentinel error search returns when the request cannot be
// built or sent, the response status is not 200 OK, or the response body
// fails to decode. Scan tolerates it and keeps the locally discovered data.
var errRPC = errors.New("search rpc failed")
   331  
// searchResponse is the response from maven.
//
// Created by eyeballing the response from
// https://search.maven.org/solrsearch/select?q=1:%2235379fb6526fd019f331542b4e9ae2e566c57933%22&wt=json
//
// Only the keys named in the struct tags are decoded. The search method reads
// ID, Group, Artifact and Version.
//
// NOTE(review): the "p" key mapped to Classifier looks like the artifact's
// packaging in the upstream API rather than a classifier — confirm.
type searchResponse struct {
	Response struct {
		Doc []struct {
			ID         string `json:"id"`
			Group      string `json:"g"`
			Artifact   string `json:"a"`
			Version    string `json:"v"`
			Classifier string `json:"p"`
		} `json:"docs"`
	} `json:"response"`
}