github.com/quay/claircore@v1.5.28/java/jar/jar.go (about)

     1  // Package jar implements a scanner on Java archive (jar) files.
     2  //
     3  // In addition to bog standard archives, this package attempts to handle more
     4  // esoteric uses, also.
     5  //
     6  // Throughout the code and comments, "jar" should be understood to mean "any
     7  // kind of JVM archive." A brief primer on the different kinds:
     8  //
     9  //   - jar:
    10  //     Java Archive. It's a zip with a manifest file, some compiled class files,
    11  //     and other assets.
    12  //
    13  //   - fatjar/onejar:
    14  //     Some jars unpacked, merged, then repacked. I gather this isn't in favor in
    15  //     the java scene.
    16  //
    17  //   - war:
    18  //     Webapp Archive. These are consumed by application servers like Tomcat, and
    19  //     are an all-in-one of code, dependencies, and metadata for configuring the
    20  //     server.
    21  //
    22  //   - ear:
    23  //     Enterprise Archive. These are bundles of wars, with hook points for
    24  //     configuration. They're only used on JEE servers, so they're comparatively
    25  //     rare in the real world.
    26  package jar
    27  
    28  import (
    29  	"archive/zip"
    30  	"bufio"
    31  	"bytes"
    32  	"context"
    33  	"crypto/sha1"
    34  	"encoding/hex"
    35  	"errors"
    36  	"fmt"
    37  	"io"
    38  	"io/fs"
    39  	"net/textproto"
    40  	"path"
    41  	"path/filepath"
    42  	"regexp"
    43  	"strings"
    44  
    45  	"github.com/quay/zlog"
    46  )
    47  
    48  // Header is the magic bytes at the beginning of a jar.
    49  //
    50  // JAR files are documented as only using the "standard" zip magic number.
    51  // There are two other magic numbers (ending in "\x05\x06" and "\x07\x08"
    52  // respectively) for zips, but they should not be used.
    53  var Header = []byte{'P', 'K', 0x03, 0x04}
    54  
    55  // MinSize is the absolute minimum size for a jar.
    56  //
    57  // This is the size of an empty zip. Files smaller than this cannot be jars.
    58  const MinSize = 22
    59  
    60  // Parse returns Info structs describing all of the discovered "artifacts" in
    61  // the jar.
    62  //
    63  // POM properties are a preferred source of information, falling back to
    64  // examining the jar manifest and then looking at the name. Anything that looks
    65  // like a jar bundled into the archive is also examined.
    66  //
    67  // The provided name is expected to be the full path within the layer to the jar
    68  // file being provided as "z".
    69  func Parse(ctx context.Context, name string, z *zip.Reader) ([]Info, error) {
    70  	ctx = zlog.ContextWithValues(ctx,
    71  		"component", "java/jar/Parse",
    72  		"jar", name)
    73  	return parse(ctx, srcPath{name}, z)
    74  }
    75  
    76  // SrcPath is a helper for tracking where an archive member is.
    77  // The [Push] and [Pop] methods are not concurrency-safe.
    78  type srcPath []string
    79  
    80  func (p srcPath) String() string {
    81  	return strings.Join(p, ":")
    82  }
    83  
    84  func (p srcPath) Cur() string {
    85  	return p[len(p)-1]
    86  }
    87  
    88  func (p *srcPath) Push(n string) {
    89  	*p = append(*p, n)
    90  }
    91  
    92  func (p *srcPath) Pop() string {
    93  	r := (*p)[len(*p)-1]
    94  	*p = (*p)[:len(*p)-1]
    95  	return r
    96  }
    97  
    98  // Parse is the inner function that uses a srcPath to keep track of recursions.
    99  func parse(ctx context.Context, name srcPath, z *zip.Reader) ([]Info, error) {
   100  	ctx = zlog.ContextWithValues(ctx,
   101  		"component", "java/jar/Parse",
   102  		"name", name.String())
   103  
   104  	// This uses an admittedly non-idiomatic, C-like goto construction. We want
   105  	// to attempt a few heuristics and keep the results of the first one that
   106  	// looks good. This does mean that there are restrictions on declarations in
   107  	// the following block.
   108  
   109  	var ret []Info
   110  	var i Info
   111  	var err error
   112  	base := filepath.Base(name.Cur())
   113  	// Try the pom.properties files first. Fatjars hopefully have the multiple
   114  	// properties files preserved.
   115  	ret, err = extractProperties(ctx, name, z)
   116  	switch {
   117  	case errors.Is(err, nil):
   118  		zlog.Debug(ctx).
   119  			Msg("using discovered properties file(s)")
   120  		goto Finish
   121  	case errors.Is(err, errUnpopulated):
   122  	case strings.HasPrefix(base, "javax") && errors.Is(err, ErrNotAJar):
   123  	default:
   124  		return nil, archiveErr(name, err)
   125  	}
   126  	// Look at the jar manifest if that fails.
   127  	i, err = extractManifest(ctx, name, z)
   128  	switch {
   129  	case errors.Is(err, nil):
   130  		zlog.Debug(ctx).
   131  			Msg("using discovered manifest")
   132  		ret = append(ret, i)
   133  		goto Finish
   134  	case errors.Is(err, errUnpopulated) || errors.Is(err, errInsaneManifest):
   135  	case strings.HasPrefix(base, "javax") && errors.Is(err, ErrNotAJar):
   136  	default:
   137  		return nil, archiveErr(name, err)
   138  	}
   139  	// As a last resort, just look at the name of the jar.
   140  	i, err = checkName(ctx, name.Cur())
   141  	switch {
   142  	case errors.Is(err, nil):
   143  		zlog.Debug(ctx).
   144  			Msg("using name mangling")
   145  		ret = append(ret, i)
   146  		goto Finish
   147  	case errors.Is(err, errUnpopulated):
   148  	default:
   149  		return nil, archiveErr(name, err)
   150  	}
   151  	// If we haven't jumped past this point, this is almost certainly not a jar,
   152  	// so return an error.
   153  	return nil, mkErr("", unidentified(base))
   154  
   155  Finish:
   156  	// Now, we need to examine any jars bundled in this jar.
   157  	inner, err := extractInner(ctx, name, z)
   158  	if err != nil {
   159  		return nil, archiveErr(name, err)
   160  	}
   161  	if ct := len(inner); ct != 0 {
   162  		zlog.Debug(ctx).
   163  			Int("count", ct).
   164  			Msg("found embedded jars")
   165  	}
   166  	ret = append(ret, inner...)
   167  
   168  	return ret, nil
   169  }
   170  
   171  // ExtractManifest attempts to open the manifest file at the well-known path.
   172  //
   173  // Reports NotAJar if the file doesn't exist.
   174  func extractManifest(ctx context.Context, name srcPath, z *zip.Reader) (Info, error) {
   175  	const manifestPath = `META-INF/MANIFEST.MF`
   176  	mf, err := z.Open(manifestPath)
   177  	switch {
   178  	case errors.Is(err, nil):
   179  	case errors.Is(err, fs.ErrNotExist), errors.Is(err, zip.ErrFormat):
   180  		err = notAJar(name, err)
   181  		fallthrough
   182  	default:
   183  		return Info{}, mkErr("opening manifest", err)
   184  	}
   185  	defer mf.Close()
   186  	var i Info
   187  	err = i.parseManifest(ctx, mf)
   188  	if err != nil {
   189  		return Info{}, mkErr("parsing manifest", err)
   190  	}
   191  	name.Push(manifestPath)
   192  	i.Source = name.String()
   193  	return i, nil
   194  }
   195  
   196  // ExtractProperties pulls pom.properties files out of the provided zip.
   197  func extractProperties(ctx context.Context, name srcPath, z *zip.Reader) ([]Info, error) {
   198  	const filename = "pom.properties"
   199  	mf, err := z.Open(`META-INF`)
   200  	switch {
   201  	case errors.Is(err, nil):
   202  	case errors.Is(err, fs.ErrNotExist),
   203  		errors.Is(err, zip.ErrFormat),
   204  		errors.Is(err, zip.ErrChecksum):
   205  		return nil, mkErr("properties", notAJar(name, err))
   206  	default:
   207  		return nil, mkErr("properties", err)
   208  	}
   209  	mf.Close()
   210  	var pf []string
   211  	// Go through the zip looking for properties files.
   212  	// We should end up with one info for every properties file.
   213  	for _, f := range z.File {
   214  		// Normalize the path to handle any attempted traversals
   215  		// encoded in the file names.
   216  		p := normName(f.Name)
   217  		if path.Base(p) == filename {
   218  			zlog.Debug(ctx).
   219  				Str("path", p).
   220  				Msg("found properties file")
   221  			pf = append(pf, p)
   222  		}
   223  	}
   224  	if len(pf) == 0 {
   225  		zlog.Debug(ctx).Msg("properties not found")
   226  		return nil, errUnpopulated
   227  	}
   228  	ret := make([]Info, len(pf))
   229  	for i, p := range pf {
   230  		f, err := z.Open(p)
   231  		switch {
   232  		case errors.Is(err, nil):
   233  		case errors.Is(err, zip.ErrFormat), errors.Is(err, zip.ErrChecksum):
   234  			return nil, mkErr("properties", notAJar(name, err))
   235  		default:
   236  			return nil, mkErr("failed opening properties", err)
   237  		}
   238  		err = ret[i].parseProperties(ctx, f)
   239  		f.Close()
   240  		if err != nil {
   241  			return nil, mkErr("failed parsing properties", err)
   242  		}
   243  		name.Push(p)
   244  		ret[i].Source = name.String()
   245  		name.Pop()
   246  	}
   247  	return ret, nil
   248  }
   249  
   250  // ExtractInner recurses into anything that looks like a jar in "z".
   251  func extractInner(ctx context.Context, p srcPath, z *zip.Reader) ([]Info, error) {
   252  	ctx = zlog.ContextWithValues(ctx, "parent", p.String())
   253  	var ret []Info
   254  	// Zips need random access, so allocate a buffer for any we find.
   255  	var buf bytes.Buffer
   256  	h := sha1.New()
   257  	checkFile := func(ctx context.Context, f *zip.File) error {
   258  		name := normName(f.Name)
   259  		// Check name.
   260  		if !ValidExt(name) {
   261  			return nil
   262  		}
   263  		fi := f.FileInfo()
   264  		// Check size.
   265  		if fi.Size() < MinSize {
   266  			zlog.Debug(ctx).Str("member", name).Msg("not actually a jar: too small")
   267  			return nil
   268  		}
   269  		rc, err := f.Open()
   270  		if err != nil {
   271  			return mkErr("failed opening file", err)
   272  		}
   273  		defer rc.Close()
   274  		buf.Reset()
   275  		buf.Grow(int(fi.Size()))
   276  		h.Reset()
   277  		sz, err := buf.ReadFrom(io.TeeReader(rc, h))
   278  		if err != nil {
   279  			return mkErr("failed buffering file", err)
   280  		}
   281  		bs := buf.Bytes()
   282  		// Check header.
   283  		if !bytes.Equal(bs[:4], Header) {
   284  			zlog.Debug(ctx).Str("member", name).Msg("not actually a jar: bad header")
   285  			return nil
   286  		}
   287  		// Okay, now reasonably certain this is a jar.
   288  		zr, err := zip.NewReader(bytes.NewReader(bs), sz)
   289  		switch {
   290  		case errors.Is(err, nil):
   291  		case errors.Is(err, io.EOF):
   292  			// BUG(go1.21) Older versions of the stdlib can report io.EOF when
   293  			// opening malformed zips.
   294  			fallthrough
   295  		case errors.Is(err, zip.ErrFormat) || errors.Is(err, io.EOF):
   296  			zlog.Debug(ctx).
   297  				Str("member", name).
   298  				Err(err).
   299  				Msg("not actually a jar: invalid zip")
   300  			return nil
   301  		default:
   302  			return mkErr("failed opening inner zip", err)
   303  		}
   304  
   305  		p.Push(name)
   306  		defer p.Pop()
   307  		ps, err := parse(ctx, p, zr)
   308  		switch {
   309  		case errors.Is(err, nil):
   310  		case errors.Is(err, ErrNotAJar) ||
   311  			errors.Is(err, ErrUnidentified) ||
   312  			errors.Is(err, errInsaneManifest):
   313  			zlog.Debug(ctx).
   314  				Str("member", name).
   315  				Err(err).
   316  				Msg("not actually a jar")
   317  			return nil
   318  		default:
   319  			return mkErr("parse error", err)
   320  		}
   321  		c := make([]byte, sha1.Size)
   322  		h.Sum(c[:0])
   323  		for i := range ps {
   324  			ps[i].SHA = c
   325  		}
   326  		ret = append(ret, ps...)
   327  		return nil
   328  	}
   329  
   330  	for _, f := range z.File {
   331  		if err := checkFile(ctx, f); err != nil {
   332  			return nil, fmt.Errorf("walking %s: %s: %w", p, f.Name, err)
   333  		}
   334  	}
   335  
   336  	if len(ret) == 0 {
   337  		zlog.Debug(ctx).
   338  			Msg("found no bundled jars")
   339  	}
   340  	return ret, nil
   341  }
   342  
   343  // NormName normalizes a name from a raw zip file header.
   344  //
   345  // This should be used in all cases that pull the name out of the zip header.
   346  func normName(p string) string {
   347  	return path.Join("/", p)[1:]
   348  }
   349  
   350  // NameRegexp is used to attempt to pull a name and version out of a jar's
   351  // filename.
   352  var nameRegexp = regexp.MustCompile(`([[:graph:]]+)-([[:digit:]][\-.[:alnum:]]*(?:-SNAPSHOT)?)\.jar`)
   353  
   354  // CheckName tries to populate the Info just from the above regexp.
   355  func checkName(ctx context.Context, name string) (Info, error) {
   356  	m := nameRegexp.FindStringSubmatch(filepath.Base(name))
   357  	if m == nil {
   358  		zlog.Debug(ctx).
   359  			Msg("name not useful")
   360  		return Info{}, errUnpopulated
   361  	}
   362  	return Info{
   363  		Name:    m[1],
   364  		Version: m[2],
   365  		Source:  ".",
   366  	}, nil
   367  }
   368  
   369  // Info reports the discovered information for a jar file.
   370  //
   371  // Any given jar may actually contain multiple jars or recombined classes.
   372  type Info struct {
   373  	// Name is the machine name found.
   374  	//
   375  	// Metadata that contains a "presentation" name isn't used to populate this
   376  	// field.
   377  	Name string
   378  	// Version is the version.
   379  	Version string
   380  	// Source is the archive member used to populate the information. If the
   381  	// name of the archive was used, this will be ".".
   382  	Source string
   383  	// SHA is populated with the SHA1 of the file if this entry was discovered
   384  	// inside another archive.
   385  	SHA []byte
   386  }
   387  
   388  func (i *Info) String() string {
   389  	var b strings.Builder
   390  	b.WriteString(i.Name)
   391  	b.WriteByte('/')
   392  	b.WriteString(i.Version)
   393  	if len(i.SHA) != 0 {
   394  		b.WriteString("(sha1:")
   395  		hex.NewEncoder(&b).Write(i.SHA)
   396  		b.WriteByte(')')
   397  	}
   398  	b.WriteString(" [")
   399  	b.WriteString(i.Source)
   400  	b.WriteByte(']')
   401  	return b.String()
   402  }
   403  
   404  // ErrUnpopulated is returned by the parse* methods when they didn't populate
   405  // the Info struct.
   406  var errUnpopulated = errors.New("unpopulated")
   407  
   408  // ErrInsaneManifest is returned by the parse* method when it the expected sanity
   409  // checks fail.
   410  var errInsaneManifest = errors.New("jar manifest does not pass sanity checks")
   411  
   412  // ParseManifest does what it says on the tin.
   413  //
   414  // This extracts "Main Attributes", as defined at
   415  // https://docs.oracle.com/javase/8/docs/technotes/guides/jar/jar.html.
   416  //
   417  // This also examines "Bundle" metadata, aka OSGI metadata, as described in the
   418  // spec: https://github.com/osgi/osgi/wiki/Release:-Bundle-Hook-Service-Specification-1.1
   419  func (i *Info) parseManifest(ctx context.Context, r io.Reader) error {
   420  	tp := textproto.NewReader(bufio.NewReader(newMainSectionReader(r)))
   421  	hdr, err := tp.ReadMIMEHeader()
   422  	if err != nil {
   423  		zlog.Debug(ctx).
   424  			Err(err).
   425  			Msg("unable to read manifest")
   426  		return errInsaneManifest
   427  	}
   428  	// Sanity checks:
   429  	switch {
   430  	case len(hdr) == 0:
   431  		zlog.Debug(ctx).
   432  			Msg("no headers found")
   433  		return errInsaneManifest
   434  	case !manifestVer.MatchString(hdr.Get("Manifest-Version")):
   435  		v := hdr.Get("Manifest-Version")
   436  		zlog.Debug(ctx).
   437  			Str("manifest_version", v).
   438  			Msg("invalid manifest version")
   439  		return errInsaneManifest
   440  	case hdr.Get("Name") != "":
   441  		zlog.Debug(ctx).
   442  			Msg("martian manifest")
   443  		// This shouldn't be happening in the Main section.
   444  		return errInsaneManifest
   445  	}
   446  
   447  	var name, version string
   448  	var groupID, artifactID string
   449  
   450  	for _, key := range []string{
   451  		"Group-Id",
   452  		"Bundle-SymbolicName",
   453  		"Implementation-Vendor-Id",
   454  		"Implementation-Vendor",
   455  		"Specification-Vendor",
   456  	} {
   457  		value := hdr.Get(key)
   458  		if key == "Bundle-SymbolicName" {
   459  			if i := strings.IndexByte(value, ';'); i != -1 {
   460  				value = value[:i]
   461  			}
   462  		}
   463  		if value != "" && !strings.Contains(value, " ") {
   464  			groupID = value
   465  			break
   466  		}
   467  	}
   468  
   469  	for _, key := range []string{
   470  		"Implementation-Title",
   471  		"Specification-Title",
   472  		"Bundle-Name",
   473  		"Extension-Name",
   474  		"Short-Name",
   475  	} {
   476  		value := hdr.Get(key)
   477  		if value != "" && !strings.Contains(value, " ") {
   478  			artifactID = value
   479  			break
   480  		}
   481  	}
   482  
   483  	if artifactID == groupID {
   484  		artifactID = ""
   485  	}
   486  
   487  	// Trim to account for empty components.
   488  	name = strings.Trim(groupID+":"+artifactID, ":")
   489  
   490  	for _, key := range []string{
   491  		"Bundle-Version",
   492  		"Implementation-Version",
   493  		"Plugin-Version",
   494  		"Specification-Version",
   495  	} {
   496  		if v := hdr.Get(key); v != "" {
   497  			version = v
   498  			break
   499  		}
   500  	}
   501  
   502  	if name == "" || version == "" {
   503  		zlog.Debug(ctx).
   504  			Strs("attrs", []string{name, version}).
   505  			Msg("manifest not useful")
   506  		return errUnpopulated
   507  	}
   508  	i.Name = name
   509  	i.Version = version
   510  	return nil
   511  }
   512  
   513  // NewMainSectionReader returns a reader wrapping "r" that reads until the main
   514  // section of the manifest ends, or EOF. It appends newlines as needed to make
   515  // the manifest parse like MIME headers.
   516  //
   517  // To quote from the spec:
   518  //
   519  //	A JAR file manifest consists of a main section followed by a list of
   520  //	sections for individual JAR file entries, each separated by a newline. Both
   521  //	the main section and individual sections follow the section syntax specified
   522  //	above. They each have their own specific restrictions and rules.
   523  //
   524  //	The main section contains security and configuration information about the
   525  //	JAR file itself, as well as the application or extension that this JAR file
   526  //	is a part of. It also defines main attributes that apply to every individual
   527  //	manifest entry.  No attribute in this section can have its name equal to
   528  //	"Name". This section is terminated by an empty line.
   529  //
   530  //	The individual sections define various attributes for packages or files
   531  //	contained in this JAR file. Not all files in the JAR file need to be listed
   532  //	in the manifest as entries, but all files which are to be signed must be
   533  //	listed. The manifest file itself must not be listed.  Each section must
   534  //	start with an attribute with the name as "Name", and the value must be
   535  //	relative path to the file, or an absolute URL referencing data outside the
   536  //	archive.
   537  //
   538  // This is contradicted by the example given and manifests seen in the wild, so
   539  // don't trust that the newline exists between sections.
   540  func newMainSectionReader(r io.Reader) io.Reader {
   541  	return &mainSectionReader{
   542  		Reader: r,
   543  	}
   544  }
   545  
   546  type mainSectionReader struct {
   547  	io.Reader
   548  	ended bool
   549  }
   550  
   551  var _ io.Reader = (*mainSectionReader)(nil)
   552  
   553  // Read implements io.Reader.
   554  func (m *mainSectionReader) Read(b []byte) (int, error) {
   555  	switch {
   556  	case len(b) == 0:
   557  		return 0, nil
   558  	case len(b) < 6: // Minimum size to detect the "Name" header.
   559  		return 0, io.ErrShortBuffer
   560  	case m.Reader == nil && m.ended:
   561  		return 0, io.EOF
   562  	case m.Reader == nil && !m.ended:
   563  		b[0] = '\r'
   564  		b[1] = '\n'
   565  		b[2] = '\r'
   566  		b[3] = '\n'
   567  		m.ended = true
   568  		return 4, io.EOF
   569  	}
   570  
   571  	n, err := m.Reader.Read(b)
   572  	peek := b[:n]
   573  	// Check for EOF conditions:
   574  	hPos := bytes.Index(peek, nameHeader)
   575  	switch {
   576  	case hPos != -1:
   577  		m.Reader = nil
   578  		// Skip the newline that's at hPos
   579  		b[hPos+1] = '\r'
   580  		b[hPos+2] = '\n'
   581  		n = hPos + 3
   582  		m.ended = true
   583  		return n, io.EOF
   584  	case errors.Is(err, io.EOF) && n == 0:
   585  		m.Reader = nil
   586  	case errors.Is(err, io.EOF):
   587  		m.Reader = nil
   588  		m.ended = true
   589  		slack := cap(b) - n
   590  		switch {
   591  		case bytes.HasSuffix(peek, []byte("\r\n\r\n")):
   592  		case bytes.HasSuffix(peek, []byte("\r\n")) && slack >= 2:
   593  			// add in extra line-end.
   594  			b[n+0] = '\r'
   595  			b[n+1] = '\n'
   596  			n += 2
   597  		case slack >= 4:
   598  			b[n+0] = '\r'
   599  			b[n+1] = '\n'
   600  			b[n+2] = '\r'
   601  			b[n+3] = '\n'
   602  			n += 4
   603  		default:
   604  			m.ended = false
   605  			// no slack space
   606  			return n, nil
   607  		}
   608  	}
   609  	return n, err
   610  }
   611  
   612  // NameHeader is the header that marks the end of the main section.
   613  var nameHeader = []byte("\nName:")
   614  
// ManifestVer is a regexp describing a manifest version string.
//
// Our code doesn't need or prefer a certain manifest version, but every example
// seems to be "1.0"?
//
//	% find testdata/manifest -type f -exec awk '/Manifest-Version/{print}' '{}' +|sort|uniq
//	Manifest-Version: 1.0
//
// NOTE(review): the pattern is not anchored, so a MatchString call succeeds for
// any value that contains at least one digit. Confirm that this leniency is
// intended.
var manifestVer = regexp.MustCompile(`[[:digit:]]+(\.[[:digit:]]+)*`)
   623  
   624  // ParseProperties parses the pom properties file.
   625  //
   626  // This is the best-case scenario.
   627  func (i *Info) parseProperties(ctx context.Context, r io.Reader) error {
   628  	var group, artifact, version string
   629  	s := bufio.NewScanner(r)
   630  	for s.Scan() && (group == "" || artifact == "" || version == "") {
   631  		line := strings.TrimSpace(s.Text())
   632  		key, value, found := strings.Cut(line, "=")
   633  		if !found {
   634  			continue
   635  		}
   636  		switch key {
   637  		case "groupId":
   638  			group = value
   639  		case "artifactId":
   640  			artifact = value
   641  		case "version":
   642  			version = value
   643  		}
   644  	}
   645  	if err := s.Err(); err != nil {
   646  		return mkErr("properties scanner", err)
   647  	}
   648  	if group == "" || artifact == "" || version == "" {
   649  		zlog.Debug(ctx).
   650  			Strs("attrs", []string{group, artifact, version}).
   651  			Msg("properties not useful")
   652  		return errUnpopulated
   653  	}
   654  
   655  	i.Name = group + ":" + artifact
   656  	i.Version = version
   657  	return nil
   658  }