// Package jar implements a scanner on Java archive (jar) files.
//
// In addition to bog standard archives, this package also attempts to handle
// more esoteric uses.
//
// Throughout the code and comments, "jar" should be understood to mean "any
// kind of JVM archive." A brief primer on the different kinds:
//
//   - jar:
//     Java Archive. It's a zip with a manifest file, some compiled class files,
//     and other assets.
//
//   - fatjar/onejar:
//     Some jars unpacked, merged, then repacked. I gather this isn't in favor in
//     the java scene.
//
//   - war:
//     Webapp Archive. These are consumed by application servers like Tomcat, and
//     are an all-in-one of code, dependencies, and metadata for configuring the
//     server.
//
//   - ear:
//     Enterprise Archive. These are bundles of wars, with hook points for
//     configuration. They're only used on JEE servers, so they're comparatively
//     rare in the real world.
package jar

import (
	"archive/zip"
	"bufio"
	"bytes"
	"context"
	"crypto/sha1"
	"encoding/hex"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"net/textproto"
	"path"
	"path/filepath"
	"regexp"
	"strings"

	"github.com/quay/zlog"
)

// Header is the magic bytes at the beginning of a jar.
//
// JAR files are documented as only using the "standard" zip magic number.
// There are two other magic numbers (ending in "\x05\x06" and "\x07\x08"
// respectively) for zips, but they should not be used.
var Header = []byte{'P', 'K', 0x03, 0x04}

// MinSize is the absolute minimum size, in bytes, for a jar.
//
// This is the size of an empty zip. Files smaller than this cannot be jars.
const MinSize = 22

// Parse returns Info structs describing all of the discovered "artifacts" in
// the jar.
62 // 63 // POM properties are a preferred source of information, falling back to 64 // examining the jar manifest and then looking at the name. Anything that looks 65 // like a jar bundled into the archive is also examined. 66 // 67 // The provided name is expected to be the full path within the layer to the jar 68 // file being provided as "z". 69 func Parse(ctx context.Context, name string, z *zip.Reader) ([]Info, error) { 70 ctx = zlog.ContextWithValues(ctx, 71 "component", "java/jar/Parse", 72 "jar", name) 73 return parse(ctx, srcPath{name}, z) 74 } 75 76 // SrcPath is a helper for tracking where an archive member is. 77 // The [Push] and [Pop] methods are not concurrency-safe. 78 type srcPath []string 79 80 func (p srcPath) String() string { 81 return strings.Join(p, ":") 82 } 83 84 func (p srcPath) Cur() string { 85 return p[len(p)-1] 86 } 87 88 func (p *srcPath) Push(n string) { 89 *p = append(*p, n) 90 } 91 92 func (p *srcPath) Pop() string { 93 r := (*p)[len(*p)-1] 94 *p = (*p)[:len(*p)-1] 95 return r 96 } 97 98 // Parse is the inner function that uses a srcPath to keep track of recursions. 99 func parse(ctx context.Context, name srcPath, z *zip.Reader) ([]Info, error) { 100 ctx = zlog.ContextWithValues(ctx, 101 "component", "java/jar/Parse", 102 "name", name.String()) 103 104 // This uses an admittedly non-idiomatic, C-like goto construction. We want 105 // to attempt a few heuristics and keep the results of the first one that 106 // looks good. This does mean that there are restrictions on declarations in 107 // the following block. 108 109 var ret []Info 110 var i Info 111 var err error 112 base := filepath.Base(name.Cur()) 113 // Try the pom.properties files first. Fatjars hopefully have the multiple 114 // properties files preserved. 115 ret, err = extractProperties(ctx, name, z) 116 switch { 117 case errors.Is(err, nil): 118 zlog.Debug(ctx). 
119 Msg("using discovered properties file(s)") 120 goto Finish 121 case errors.Is(err, errUnpopulated): 122 case strings.HasPrefix(base, "javax") && errors.Is(err, ErrNotAJar): 123 default: 124 return nil, archiveErr(name, err) 125 } 126 // Look at the jar manifest if that fails. 127 i, err = extractManifest(ctx, name, z) 128 switch { 129 case errors.Is(err, nil): 130 zlog.Debug(ctx). 131 Msg("using discovered manifest") 132 ret = append(ret, i) 133 goto Finish 134 case errors.Is(err, errUnpopulated) || errors.Is(err, errInsaneManifest): 135 case strings.HasPrefix(base, "javax") && errors.Is(err, ErrNotAJar): 136 default: 137 return nil, archiveErr(name, err) 138 } 139 // As a last resort, just look at the name of the jar. 140 i, err = checkName(ctx, name.Cur()) 141 switch { 142 case errors.Is(err, nil): 143 zlog.Debug(ctx). 144 Msg("using name mangling") 145 ret = append(ret, i) 146 goto Finish 147 case errors.Is(err, errUnpopulated): 148 default: 149 return nil, archiveErr(name, err) 150 } 151 // If we haven't jumped past this point, this is almost certainly not a jar, 152 // so return an error. 153 return nil, mkErr("", unidentified(base)) 154 155 Finish: 156 // Now, we need to examine any jars bundled in this jar. 157 inner, err := extractInner(ctx, name, z) 158 if err != nil { 159 return nil, archiveErr(name, err) 160 } 161 if ct := len(inner); ct != 0 { 162 zlog.Debug(ctx). 163 Int("count", ct). 164 Msg("found embedded jars") 165 } 166 ret = append(ret, inner...) 167 168 return ret, nil 169 } 170 171 // ExtractManifest attempts to open the manifest file at the well-known path. 172 // 173 // Reports NotAJar if the file doesn't exist. 
174 func extractManifest(ctx context.Context, name srcPath, z *zip.Reader) (Info, error) { 175 const manifestPath = `META-INF/MANIFEST.MF` 176 mf, err := z.Open(manifestPath) 177 switch { 178 case errors.Is(err, nil): 179 case errors.Is(err, fs.ErrNotExist), errors.Is(err, zip.ErrFormat): 180 err = notAJar(name, err) 181 fallthrough 182 default: 183 return Info{}, mkErr("opening manifest", err) 184 } 185 defer mf.Close() 186 var i Info 187 err = i.parseManifest(ctx, mf) 188 if err != nil { 189 return Info{}, mkErr("parsing manifest", err) 190 } 191 name.Push(manifestPath) 192 i.Source = name.String() 193 return i, nil 194 } 195 196 // ExtractProperties pulls pom.properties files out of the provided zip. 197 func extractProperties(ctx context.Context, name srcPath, z *zip.Reader) ([]Info, error) { 198 const filename = "pom.properties" 199 mf, err := z.Open(`META-INF`) 200 switch { 201 case errors.Is(err, nil): 202 case errors.Is(err, fs.ErrNotExist), 203 errors.Is(err, zip.ErrFormat), 204 errors.Is(err, zip.ErrChecksum): 205 return nil, mkErr("properties", notAJar(name, err)) 206 default: 207 return nil, mkErr("properties", err) 208 } 209 mf.Close() 210 var pf []string 211 // Go through the zip looking for properties files. 212 // We should end up with one info for every properties file. 213 for _, f := range z.File { 214 // Normalize the path to handle any attempted traversals 215 // encoded in the file names. 216 p := normName(f.Name) 217 if path.Base(p) == filename { 218 zlog.Debug(ctx). 219 Str("path", p). 
220 Msg("found properties file") 221 pf = append(pf, p) 222 } 223 } 224 if len(pf) == 0 { 225 zlog.Debug(ctx).Msg("properties not found") 226 return nil, errUnpopulated 227 } 228 ret := make([]Info, len(pf)) 229 for i, p := range pf { 230 f, err := z.Open(p) 231 switch { 232 case errors.Is(err, nil): 233 case errors.Is(err, zip.ErrFormat), errors.Is(err, zip.ErrChecksum): 234 return nil, mkErr("properties", notAJar(name, err)) 235 default: 236 return nil, mkErr("failed opening properties", err) 237 } 238 err = ret[i].parseProperties(ctx, f) 239 f.Close() 240 if err != nil { 241 return nil, mkErr("failed parsing properties", err) 242 } 243 name.Push(p) 244 ret[i].Source = name.String() 245 name.Pop() 246 } 247 return ret, nil 248 } 249 250 // ExtractInner recurses into anything that looks like a jar in "z". 251 func extractInner(ctx context.Context, p srcPath, z *zip.Reader) ([]Info, error) { 252 ctx = zlog.ContextWithValues(ctx, "parent", p.String()) 253 var ret []Info 254 // Zips need random access, so allocate a buffer for any we find. 255 var buf bytes.Buffer 256 h := sha1.New() 257 checkFile := func(ctx context.Context, f *zip.File) error { 258 name := normName(f.Name) 259 // Check name. 260 if !ValidExt(name) { 261 return nil 262 } 263 fi := f.FileInfo() 264 // Check size. 265 if fi.Size() < MinSize { 266 zlog.Debug(ctx).Str("member", name).Msg("not actually a jar: too small") 267 return nil 268 } 269 rc, err := f.Open() 270 if err != nil { 271 return mkErr("failed opening file", err) 272 } 273 defer rc.Close() 274 buf.Reset() 275 buf.Grow(int(fi.Size())) 276 h.Reset() 277 sz, err := buf.ReadFrom(io.TeeReader(rc, h)) 278 if err != nil { 279 return mkErr("failed buffering file", err) 280 } 281 bs := buf.Bytes() 282 // Check header. 283 if !bytes.Equal(bs[:4], Header) { 284 zlog.Debug(ctx).Str("member", name).Msg("not actually a jar: bad header") 285 return nil 286 } 287 // Okay, now reasonably certain this is a jar. 
288 zr, err := zip.NewReader(bytes.NewReader(bs), sz) 289 switch { 290 case errors.Is(err, nil): 291 case errors.Is(err, io.EOF): 292 // BUG(go1.21) Older versions of the stdlib can report io.EOF when 293 // opening malformed zips. 294 fallthrough 295 case errors.Is(err, zip.ErrFormat) || errors.Is(err, io.EOF): 296 zlog.Debug(ctx). 297 Str("member", name). 298 Err(err). 299 Msg("not actually a jar: invalid zip") 300 return nil 301 default: 302 return mkErr("failed opening inner zip", err) 303 } 304 305 p.Push(name) 306 defer p.Pop() 307 ps, err := parse(ctx, p, zr) 308 switch { 309 case errors.Is(err, nil): 310 case errors.Is(err, ErrNotAJar) || 311 errors.Is(err, ErrUnidentified) || 312 errors.Is(err, errInsaneManifest): 313 zlog.Debug(ctx). 314 Str("member", name). 315 Err(err). 316 Msg("not actually a jar") 317 return nil 318 default: 319 return mkErr("parse error", err) 320 } 321 c := make([]byte, sha1.Size) 322 h.Sum(c[:0]) 323 for i := range ps { 324 ps[i].SHA = c 325 } 326 ret = append(ret, ps...) 327 return nil 328 } 329 330 for _, f := range z.File { 331 if err := checkFile(ctx, f); err != nil { 332 return nil, fmt.Errorf("walking %s: %s: %w", p, f.Name, err) 333 } 334 } 335 336 if len(ret) == 0 { 337 zlog.Debug(ctx). 338 Msg("found no bundled jars") 339 } 340 return ret, nil 341 } 342 343 // NormName normalizes a name from a raw zip file header. 344 // 345 // This should be used in all cases that pull the name out of the zip header. 346 func normName(p string) string { 347 return path.Join("/", p)[1:] 348 } 349 350 // NameRegexp is used to attempt to pull a name and version out of a jar's 351 // filename. 352 var nameRegexp = regexp.MustCompile(`([[:graph:]]+)-([[:digit:]][\-.[:alnum:]]*(?:-SNAPSHOT)?)\.jar`) 353 354 // CheckName tries to populate the Info just from the above regexp. 355 func checkName(ctx context.Context, name string) (Info, error) { 356 m := nameRegexp.FindStringSubmatch(filepath.Base(name)) 357 if m == nil { 358 zlog.Debug(ctx). 
359 Msg("name not useful") 360 return Info{}, errUnpopulated 361 } 362 return Info{ 363 Name: m[1], 364 Version: m[2], 365 Source: ".", 366 }, nil 367 } 368 369 // Info reports the discovered information for a jar file. 370 // 371 // Any given jar may actually contain multiple jars or recombined classes. 372 type Info struct { 373 // Name is the machine name found. 374 // 375 // Metadata that contains a "presentation" name isn't used to populate this 376 // field. 377 Name string 378 // Version is the version. 379 Version string 380 // Source is the archive member used to populate the information. If the 381 // name of the archive was used, this will be ".". 382 Source string 383 // SHA is populated with the SHA1 of the file if this entry was discovered 384 // inside another archive. 385 SHA []byte 386 } 387 388 func (i *Info) String() string { 389 var b strings.Builder 390 b.WriteString(i.Name) 391 b.WriteByte('/') 392 b.WriteString(i.Version) 393 if len(i.SHA) != 0 { 394 b.WriteString("(sha1:") 395 hex.NewEncoder(&b).Write(i.SHA) 396 b.WriteByte(')') 397 } 398 b.WriteString(" [") 399 b.WriteString(i.Source) 400 b.WriteByte(']') 401 return b.String() 402 } 403 404 // ErrUnpopulated is returned by the parse* methods when they didn't populate 405 // the Info struct. 406 var errUnpopulated = errors.New("unpopulated") 407 408 // ErrInsaneManifest is returned by the parse* method when it the expected sanity 409 // checks fail. 410 var errInsaneManifest = errors.New("jar manifest does not pass sanity checks") 411 412 // ParseManifest does what it says on the tin. 413 // 414 // This extracts "Main Attributes", as defined at 415 // https://docs.oracle.com/javase/8/docs/technotes/guides/jar/jar.html. 
416 // 417 // This also examines "Bundle" metadata, aka OSGI metadata, as described in the 418 // spec: https://github.com/osgi/osgi/wiki/Release:-Bundle-Hook-Service-Specification-1.1 419 func (i *Info) parseManifest(ctx context.Context, r io.Reader) error { 420 tp := textproto.NewReader(bufio.NewReader(newMainSectionReader(r))) 421 hdr, err := tp.ReadMIMEHeader() 422 if err != nil { 423 zlog.Debug(ctx). 424 Err(err). 425 Msg("unable to read manifest") 426 return errInsaneManifest 427 } 428 // Sanity checks: 429 switch { 430 case len(hdr) == 0: 431 zlog.Debug(ctx). 432 Msg("no headers found") 433 return errInsaneManifest 434 case !manifestVer.MatchString(hdr.Get("Manifest-Version")): 435 v := hdr.Get("Manifest-Version") 436 zlog.Debug(ctx). 437 Str("manifest_version", v). 438 Msg("invalid manifest version") 439 return errInsaneManifest 440 case hdr.Get("Name") != "": 441 zlog.Debug(ctx). 442 Msg("martian manifest") 443 // This shouldn't be happening in the Main section. 444 return errInsaneManifest 445 } 446 447 var name, version string 448 var groupID, artifactID string 449 450 for _, key := range []string{ 451 "Group-Id", 452 "Bundle-SymbolicName", 453 "Implementation-Vendor-Id", 454 "Implementation-Vendor", 455 "Specification-Vendor", 456 } { 457 value := hdr.Get(key) 458 if key == "Bundle-SymbolicName" { 459 if i := strings.IndexByte(value, ';'); i != -1 { 460 value = value[:i] 461 } 462 } 463 if value != "" && !strings.Contains(value, " ") { 464 groupID = value 465 break 466 } 467 } 468 469 for _, key := range []string{ 470 "Implementation-Title", 471 "Specification-Title", 472 "Bundle-Name", 473 "Extension-Name", 474 "Short-Name", 475 } { 476 value := hdr.Get(key) 477 if value != "" && !strings.Contains(value, " ") { 478 artifactID = value 479 break 480 } 481 } 482 483 if artifactID == groupID { 484 artifactID = "" 485 } 486 487 // Trim to account for empty components. 
488 name = strings.Trim(groupID+":"+artifactID, ":") 489 490 for _, key := range []string{ 491 "Bundle-Version", 492 "Implementation-Version", 493 "Plugin-Version", 494 "Specification-Version", 495 } { 496 if v := hdr.Get(key); v != "" { 497 version = v 498 break 499 } 500 } 501 502 if name == "" || version == "" { 503 zlog.Debug(ctx). 504 Strs("attrs", []string{name, version}). 505 Msg("manifest not useful") 506 return errUnpopulated 507 } 508 i.Name = name 509 i.Version = version 510 return nil 511 } 512 513 // NewMainSectionReader returns a reader wrapping "r" that reads until the main 514 // section of the manifest ends, or EOF. It appends newlines as needed to make 515 // the manifest parse like MIME headers. 516 // 517 // To quote from the spec: 518 // 519 // A JAR file manifest consists of a main section followed by a list of 520 // sections for individual JAR file entries, each separated by a newline. Both 521 // the main section and individual sections follow the section syntax specified 522 // above. They each have their own specific restrictions and rules. 523 // 524 // The main section contains security and configuration information about the 525 // JAR file itself, as well as the application or extension that this JAR file 526 // is a part of. It also defines main attributes that apply to every individual 527 // manifest entry. No attribute in this section can have its name equal to 528 // "Name". This section is terminated by an empty line. 529 // 530 // The individual sections define various attributes for packages or files 531 // contained in this JAR file. Not all files in the JAR file need to be listed 532 // in the manifest as entries, but all files which are to be signed must be 533 // listed. The manifest file itself must not be listed. Each section must 534 // start with an attribute with the name as "Name", and the value must be 535 // relative path to the file, or an absolute URL referencing data outside the 536 // archive. 
//
// This is contradicted by the example given and manifests seen in the wild, so
// don't trust that the newline exists between sections.
func newMainSectionReader(r io.Reader) io.Reader {
	return &mainSectionReader{
		Reader: r,
	}
}

// MainSectionReader is the io.Reader returned by newMainSectionReader.
//
// The embedded Reader is the source being wrapped; it's set to nil once no
// more data will be taken from it. The "ended" field records that the
// terminating blank line has been delivered (or was already present).
type mainSectionReader struct {
	io.Reader
	ended bool
}

var _ io.Reader = (*mainSectionReader)(nil)

// Read implements io.Reader.
//
// Data is passed through from the wrapped reader until either the first
// "Name" header or the end of the input. In both cases the output is finished
// with a blank line, and io.EOF is reported.
//
// Buffers of fewer than 6 bytes (other than empty ones) are rejected with
// io.ErrShortBuffer. A "Name" header is only detected when it falls entirely
// within a single read from the wrapped reader.
func (m *mainSectionReader) Read(b []byte) (int, error) {
	// Terminator is the blank line that ends a block of MIME headers.
	const terminator = "\r\n\r\n"

	switch {
	case len(b) == 0:
		return 0, nil
	case len(b) < 6: // Minimum size to detect the "Name" header.
		return 0, io.ErrShortBuffer
	case m.Reader == nil && m.ended:
		return 0, io.EOF
	case m.Reader == nil && !m.ended:
		// The previous call hit EOF without room for the terminator, so
		// deliver it now. The length check above guarantees it fits.
		m.ended = true
		return copy(b, terminator), io.EOF
	}

	n, err := m.Reader.Read(b)
	peek := b[:n]
	// Check for EOF conditions:
	hPos := bytes.Index(peek, nameHeader)
	switch {
	case hPos != -1:
		// Found the start of the per-entry sections: keep everything up to
		// and including the newline at hPos, then overwrite the start of the
		// "Name" header with one more line ending so the output finishes
		// with a blank line.
		m.Reader = nil
		b[hPos+1] = '\r'
		b[hPos+2] = '\n'
		n = hPos + 3
		m.ended = true
		return n, io.EOF
	case errors.Is(err, io.EOF) && n == 0:
		// The source reported EOF on its own. Emit the terminator right
		// away rather than counting on the consumer to call Read again
		// after seeing io.EOF. An extra blank line is harmless if the data
		// already ended with one.
		m.Reader = nil
		m.ended = true
		return copy(b, terminator), io.EOF
	case errors.Is(err, io.EOF):
		// The source reported EOF along with the final data. Top the data
		// up to a blank line if the caller's buffer has room. Only len(b)
		// may be used here: writing into spare capacity past len(b) is out
		// of bounds.
		m.Reader = nil
		m.ended = true
		slack := len(b) - n
		switch {
		case bytes.HasSuffix(peek, []byte(terminator)):
			// Already properly terminated.
		case bytes.HasSuffix(peek, []byte("\r\n")) && slack >= 2:
			// add in extra line-end.
			n += copy(b[n:], terminator[:2])
		case slack >= 4:
			n += copy(b[n:], terminator)
		default:
			// no slack space: hold the EOF back and deliver the terminator
			// on the next call.
			m.ended = false
			return n, nil
		}
	}
	return n, err
}

// NameHeader is the header that marks the end of the main section.
var nameHeader = []byte("\nName:")

// ManifestVer is a regexp describing a manifest version string.
616 // 617 // Our code doesn't need or prefer a certain manifest version, but every example 618 // seems to be "1.0"? 619 // 620 // % find testdata/manifest -type f -exec awk '/Manifest-Version/{print}' '{}' +|sort|uniq 621 // Manifest-Version: 1.0 622 var manifestVer = regexp.MustCompile(`[[:digit:]]+(\.[[:digit:]]+)*`) 623 624 // ParseProperties parses the pom properties file. 625 // 626 // This is the best-case scenario. 627 func (i *Info) parseProperties(ctx context.Context, r io.Reader) error { 628 var group, artifact, version string 629 s := bufio.NewScanner(r) 630 for s.Scan() && (group == "" || artifact == "" || version == "") { 631 line := strings.TrimSpace(s.Text()) 632 key, value, found := strings.Cut(line, "=") 633 if !found { 634 continue 635 } 636 switch key { 637 case "groupId": 638 group = value 639 case "artifactId": 640 artifact = value 641 case "version": 642 version = value 643 } 644 } 645 if err := s.Err(); err != nil { 646 return mkErr("properties scanner", err) 647 } 648 if group == "" || artifact == "" || version == "" { 649 zlog.Debug(ctx). 650 Strs("attrs", []string{group, artifact, version}). 651 Msg("properties not useful") 652 return errUnpopulated 653 } 654 655 i.Name = group + ":" + artifact 656 i.Version = version 657 return nil 658 }