github.com/rsc/go@v0.0.0-20150416155037-e040fd465409/src/compress/flate/inflate.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:generate go run gen.go -output fixedhuff.go 6 7 // Package flate implements the DEFLATE compressed data format, described in 8 // RFC 1951. The gzip and zlib packages implement access to DEFLATE-based file 9 // formats. 10 package flate 11 12 import ( 13 "bufio" 14 "io" 15 "strconv" 16 ) 17 18 const ( 19 maxCodeLen = 16 // max length of Huffman code 20 maxHist = 32768 // max history required 21 // The next three numbers come from the RFC, section 3.2.7. 22 maxLit = 286 23 maxDist = 32 24 numCodes = 19 // number of codes in Huffman meta-code 25 ) 26 27 // A CorruptInputError reports the presence of corrupt input at a given offset. 28 type CorruptInputError int64 29 30 func (e CorruptInputError) Error() string { 31 return "flate: corrupt input before offset " + strconv.FormatInt(int64(e), 10) 32 } 33 34 // An InternalError reports an error in the flate code itself. 35 type InternalError string 36 37 func (e InternalError) Error() string { return "flate: internal error: " + string(e) } 38 39 // A ReadError reports an error encountered while reading input. 40 type ReadError struct { 41 Offset int64 // byte offset where error occurred 42 Err error // error returned by underlying Read 43 } 44 45 func (e *ReadError) Error() string { 46 return "flate: read error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error() 47 } 48 49 // A WriteError reports an error encountered while writing output. 
type WriteError struct {
	Offset int64 // byte offset where error occurred
	Err    error // error returned by underlying Write
}

// Error reports the offset followed by the message of the underlying error.
// Err must be non-nil.
func (e *WriteError) Error() string {
	offset := strconv.FormatInt(e.Offset, 10)
	return "flate: write error at offset " + offset + ": " + e.Err.Error()
}

// Resetter resets a ReadCloser returned by NewReader or NewReaderDict
// to switch to a new underlying Reader. This permits reusing a ReadCloser
// instead of allocating a new one.
type Resetter interface {
	// Reset discards any buffered data and resets the Resetter as if it was
	// newly initialized with the given reader.
	Reset(r io.Reader, dict []byte) error
}

// Note that much of the implementation of huffmanDecoder is also copied
// into gen.go (in package main) for the purpose of precomputing the
// fixed huffman tables so they can be included statically.

// The data structure for decoding Huffman tables is based on that of
// zlib. There is a lookup table of a fixed bit width (huffmanChunkBits).
// For codes smaller than the table width, there are multiple entries
// (each combination of trailing bits has the same value). For codes
// larger than the table width, the table contains a link to an overflow
// table. The width of each entry in the link table is the maximum code
// size minus the chunk width.

// Note that you can do a lookup in the table even without all bits
// filled. Since the extra bits are zero, and the DEFLATE Huffman codes
// have the property that shorter codes come before longer ones, the
// bit length estimate in the result is a lower bound on the actual
// number of bits.
85 86 // chunk & 15 is number of bits 87 // chunk >> 4 is value, including table link 88 89 const ( 90 huffmanChunkBits = 9 91 huffmanNumChunks = 1 << huffmanChunkBits 92 huffmanCountMask = 15 93 huffmanValueShift = 4 94 ) 95 96 type huffmanDecoder struct { 97 min int // the minimum code length 98 chunks [huffmanNumChunks]uint32 // chunks as described above 99 links [][]uint32 // overflow links 100 linkMask uint32 // mask the width of the link table 101 } 102 103 // Initialize Huffman decoding tables from array of code lengths. 104 func (h *huffmanDecoder) init(bits []int) bool { 105 // Sanity enables additional runtime tests during Huffman 106 // table construction. It's intended to be used during 107 // development to supplement the currently ad-hoc unit tests. 108 const sanity = false 109 110 if h.min != 0 { 111 *h = huffmanDecoder{} 112 } 113 114 // Count number of codes of each length, 115 // compute min and max length. 116 var count [maxCodeLen]int 117 var min, max int 118 for _, n := range bits { 119 if n == 0 { 120 continue 121 } 122 if min == 0 || n < min { 123 min = n 124 } 125 if n > max { 126 max = n 127 } 128 count[n]++ 129 } 130 if max == 0 { 131 return false 132 } 133 134 code := 0 135 var nextcode [maxCodeLen]int 136 for i := min; i <= max; i++ { 137 code <<= 1 138 nextcode[i] = code 139 code += count[i] 140 } 141 142 // Check that the coding is complete (i.e., that we've 143 // assigned all 2-to-the-max possible bit sequences). 144 // Exception: To be compatible with zlib, we also need to 145 // accept degenerate single-code codings. See also 146 // TestDegenerateHuffmanCoding. 
147 if code != 1<<uint(max) && !(code == 1 && max == 1) { 148 return false 149 } 150 151 h.min = min 152 if max > huffmanChunkBits { 153 numLinks := 1 << (uint(max) - huffmanChunkBits) 154 h.linkMask = uint32(numLinks - 1) 155 156 // create link tables 157 link := nextcode[huffmanChunkBits+1] >> 1 158 h.links = make([][]uint32, huffmanNumChunks-link) 159 for j := uint(link); j < huffmanNumChunks; j++ { 160 reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8 161 reverse >>= uint(16 - huffmanChunkBits) 162 off := j - uint(link) 163 if sanity && h.chunks[reverse] != 0 { 164 panic("impossible: overwriting existing chunk") 165 } 166 h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1)) 167 h.links[off] = make([]uint32, numLinks) 168 } 169 } 170 171 for i, n := range bits { 172 if n == 0 { 173 continue 174 } 175 code := nextcode[n] 176 nextcode[n]++ 177 chunk := uint32(i<<huffmanValueShift | n) 178 reverse := int(reverseByte[code>>8]) | int(reverseByte[code&0xff])<<8 179 reverse >>= uint(16 - n) 180 if n <= huffmanChunkBits { 181 for off := reverse; off < len(h.chunks); off += 1 << uint(n) { 182 // We should never need to overwrite 183 // an existing chunk. Also, 0 is 184 // never a valid chunk, because the 185 // lower 4 "count" bits should be 186 // between 1 and 15. 187 if sanity && h.chunks[off] != 0 { 188 panic("impossible: overwriting existing chunk") 189 } 190 h.chunks[off] = chunk 191 } 192 } else { 193 j := reverse & (huffmanNumChunks - 1) 194 if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 { 195 // Longer codes should have been 196 // associated with a link table above. 
197 panic("impossible: not an indirect chunk") 198 } 199 value := h.chunks[j] >> huffmanValueShift 200 linktab := h.links[value] 201 reverse >>= huffmanChunkBits 202 for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) { 203 if sanity && linktab[off] != 0 { 204 panic("impossible: overwriting existing chunk") 205 } 206 linktab[off] = chunk 207 } 208 } 209 } 210 211 if sanity { 212 // Above we've sanity checked that we never overwrote 213 // an existing entry. Here we additionally check that 214 // we filled the tables completely. 215 for i, chunk := range h.chunks { 216 if chunk == 0 { 217 // As an exception, in the degenerate 218 // single-code case, we allow odd 219 // chunks to be missing. 220 if code == 1 && i%2 == 1 { 221 continue 222 } 223 panic("impossible: missing chunk") 224 } 225 } 226 for _, linktab := range h.links { 227 for _, chunk := range linktab { 228 if chunk == 0 { 229 panic("impossible: missing chunk") 230 } 231 } 232 } 233 } 234 235 return true 236 } 237 238 // The actual read interface needed by NewReader. 239 // If the passed in io.Reader does not also have ReadByte, 240 // the NewReader will introduce its own buffering. 241 type Reader interface { 242 io.Reader 243 io.ByteReader 244 } 245 246 // Decompress state. 247 type decompressor struct { 248 // Input source. 249 r Reader 250 roffset int64 251 woffset int64 252 253 // Input bits, in top of b. 254 b uint32 255 nb uint 256 257 // Huffman decoders for literal/length, distance. 258 h1, h2 huffmanDecoder 259 260 // Length arrays used to define Huffman codes. 261 bits *[maxLit + maxDist]int 262 codebits *[numCodes]int 263 264 // Output history, buffer. 265 hist *[maxHist]byte 266 hp int // current output position in buffer 267 hw int // have written hist[0:hw] already 268 hfull bool // buffer has filled at least once 269 270 // Temporary buffer (avoids repeated allocation). 271 buf [4]byte 272 273 // Next step in the decompression, 274 // and decompression state. 
275 step func(*decompressor) 276 final bool 277 err error 278 toRead []byte 279 hl, hd *huffmanDecoder 280 copyLen int 281 copyDist int 282 } 283 284 func (f *decompressor) nextBlock() { 285 if f.final { 286 if f.hw != f.hp { 287 f.flush((*decompressor).nextBlock) 288 return 289 } 290 f.err = io.EOF 291 return 292 } 293 for f.nb < 1+2 { 294 if f.err = f.moreBits(); f.err != nil { 295 return 296 } 297 } 298 f.final = f.b&1 == 1 299 f.b >>= 1 300 typ := f.b & 3 301 f.b >>= 2 302 f.nb -= 1 + 2 303 switch typ { 304 case 0: 305 f.dataBlock() 306 case 1: 307 // compressed, fixed Huffman tables 308 f.hl = &fixedHuffmanDecoder 309 f.hd = nil 310 f.huffmanBlock() 311 case 2: 312 // compressed, dynamic Huffman tables 313 if f.err = f.readHuffman(); f.err != nil { 314 break 315 } 316 f.hl = &f.h1 317 f.hd = &f.h2 318 f.huffmanBlock() 319 default: 320 // 3 is reserved. 321 f.err = CorruptInputError(f.roffset) 322 } 323 } 324 325 func (f *decompressor) Read(b []byte) (int, error) { 326 for { 327 if len(f.toRead) > 0 { 328 n := copy(b, f.toRead) 329 f.toRead = f.toRead[n:] 330 return n, nil 331 } 332 if f.err != nil { 333 return 0, f.err 334 } 335 f.step(f) 336 } 337 } 338 339 func (f *decompressor) Close() error { 340 if f.err == io.EOF { 341 return nil 342 } 343 return f.err 344 } 345 346 // RFC 1951 section 3.2.7. 347 // Compression with dynamic Huffman codes 348 349 var codeOrder = [...]int{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15} 350 351 func (f *decompressor) readHuffman() error { 352 // HLIT[5], HDIST[5], HCLEN[4]. 353 for f.nb < 5+5+4 { 354 if err := f.moreBits(); err != nil { 355 return err 356 } 357 } 358 nlit := int(f.b&0x1F) + 257 359 if nlit > maxLit { 360 return CorruptInputError(f.roffset) 361 } 362 f.b >>= 5 363 ndist := int(f.b&0x1F) + 1 364 // maxDist is 32, so ndist is always valid. 365 f.b >>= 5 366 nclen := int(f.b&0xF) + 4 367 // numCodes is 19, so nclen is always valid. 
368 f.b >>= 4 369 f.nb -= 5 + 5 + 4 370 371 // (HCLEN+4)*3 bits: code lengths in the magic codeOrder order. 372 for i := 0; i < nclen; i++ { 373 for f.nb < 3 { 374 if err := f.moreBits(); err != nil { 375 return err 376 } 377 } 378 f.codebits[codeOrder[i]] = int(f.b & 0x7) 379 f.b >>= 3 380 f.nb -= 3 381 } 382 for i := nclen; i < len(codeOrder); i++ { 383 f.codebits[codeOrder[i]] = 0 384 } 385 if !f.h1.init(f.codebits[0:]) { 386 return CorruptInputError(f.roffset) 387 } 388 389 // HLIT + 257 code lengths, HDIST + 1 code lengths, 390 // using the code length Huffman code. 391 for i, n := 0, nlit+ndist; i < n; { 392 x, err := f.huffSym(&f.h1) 393 if err != nil { 394 return err 395 } 396 if x < 16 { 397 // Actual length. 398 f.bits[i] = x 399 i++ 400 continue 401 } 402 // Repeat previous length or zero. 403 var rep int 404 var nb uint 405 var b int 406 switch x { 407 default: 408 return InternalError("unexpected length code") 409 case 16: 410 rep = 3 411 nb = 2 412 if i == 0 { 413 return CorruptInputError(f.roffset) 414 } 415 b = f.bits[i-1] 416 case 17: 417 rep = 3 418 nb = 3 419 b = 0 420 case 18: 421 rep = 11 422 nb = 7 423 b = 0 424 } 425 for f.nb < nb { 426 if err := f.moreBits(); err != nil { 427 return err 428 } 429 } 430 rep += int(f.b & uint32(1<<nb-1)) 431 f.b >>= nb 432 f.nb -= nb 433 if i+rep > n { 434 return CorruptInputError(f.roffset) 435 } 436 for j := 0; j < rep; j++ { 437 f.bits[i] = b 438 i++ 439 } 440 } 441 442 if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) { 443 return CorruptInputError(f.roffset) 444 } 445 446 return nil 447 } 448 449 // Decode a single Huffman block from f. 450 // hl and hd are the Huffman states for the lit/length values 451 // and the distance values, respectively. If hd == nil, using the 452 // fixed distance encoding associated with fixed Huffman blocks. 
453 func (f *decompressor) huffmanBlock() { 454 for { 455 v, err := f.huffSym(f.hl) 456 if err != nil { 457 f.err = err 458 return 459 } 460 var n uint // number of bits extra 461 var length int 462 switch { 463 case v < 256: 464 f.hist[f.hp] = byte(v) 465 f.hp++ 466 if f.hp == len(f.hist) { 467 // After the flush, continue this loop. 468 f.flush((*decompressor).huffmanBlock) 469 return 470 } 471 continue 472 case v == 256: 473 // Done with huffman block; read next block. 474 f.step = (*decompressor).nextBlock 475 return 476 // otherwise, reference to older data 477 case v < 265: 478 length = v - (257 - 3) 479 n = 0 480 case v < 269: 481 length = v*2 - (265*2 - 11) 482 n = 1 483 case v < 273: 484 length = v*4 - (269*4 - 19) 485 n = 2 486 case v < 277: 487 length = v*8 - (273*8 - 35) 488 n = 3 489 case v < 281: 490 length = v*16 - (277*16 - 67) 491 n = 4 492 case v < 285: 493 length = v*32 - (281*32 - 131) 494 n = 5 495 default: 496 length = 258 497 n = 0 498 } 499 if n > 0 { 500 for f.nb < n { 501 if err = f.moreBits(); err != nil { 502 f.err = err 503 return 504 } 505 } 506 length += int(f.b & uint32(1<<n-1)) 507 f.b >>= n 508 f.nb -= n 509 } 510 511 var dist int 512 if f.hd == nil { 513 for f.nb < 5 { 514 if err = f.moreBits(); err != nil { 515 f.err = err 516 return 517 } 518 } 519 dist = int(reverseByte[(f.b&0x1F)<<3]) 520 f.b >>= 5 521 f.nb -= 5 522 } else { 523 if dist, err = f.huffSym(f.hd); err != nil { 524 f.err = err 525 return 526 } 527 } 528 529 switch { 530 case dist < 4: 531 dist++ 532 case dist >= 30: 533 f.err = CorruptInputError(f.roffset) 534 return 535 default: 536 nb := uint(dist-2) >> 1 537 // have 1 bit in bottom of dist, need nb more. 538 extra := (dist & 1) << nb 539 for f.nb < nb { 540 if err = f.moreBits(); err != nil { 541 f.err = err 542 return 543 } 544 } 545 extra |= int(f.b & uint32(1<<nb-1)) 546 f.b >>= nb 547 f.nb -= nb 548 dist = 1<<(nb+1) + 1 + extra 549 } 550 551 // Copy history[-dist:-dist+length] into output. 
552 if dist > len(f.hist) { 553 f.err = InternalError("bad history distance") 554 return 555 } 556 557 // No check on length; encoding can be prescient. 558 if !f.hfull && dist > f.hp { 559 f.err = CorruptInputError(f.roffset) 560 return 561 } 562 563 f.copyLen, f.copyDist = length, dist 564 if f.copyHist() { 565 return 566 } 567 } 568 } 569 570 // copyHist copies f.copyLen bytes from f.hist (f.copyDist bytes ago) to itself. 571 // It reports whether the f.hist buffer is full. 572 func (f *decompressor) copyHist() bool { 573 p := f.hp - f.copyDist 574 if p < 0 { 575 p += len(f.hist) 576 } 577 for f.copyLen > 0 { 578 n := f.copyLen 579 if x := len(f.hist) - f.hp; n > x { 580 n = x 581 } 582 if x := len(f.hist) - p; n > x { 583 n = x 584 } 585 forwardCopy(f.hist[:], f.hp, p, n) 586 p += n 587 f.hp += n 588 f.copyLen -= n 589 if f.hp == len(f.hist) { 590 // After flush continue copying out of history. 591 f.flush((*decompressor).copyHuff) 592 return true 593 } 594 if p == len(f.hist) { 595 p = 0 596 } 597 } 598 return false 599 } 600 601 func (f *decompressor) copyHuff() { 602 if f.copyHist() { 603 return 604 } 605 f.huffmanBlock() 606 } 607 608 // Copy a single uncompressed data block from input to output. 609 func (f *decompressor) dataBlock() { 610 // Uncompressed. 611 // Discard current half-byte. 612 f.nb = 0 613 f.b = 0 614 615 // Length then ones-complement of length. 616 nr, err := io.ReadFull(f.r, f.buf[0:4]) 617 f.roffset += int64(nr) 618 if err != nil { 619 f.err = &ReadError{f.roffset, err} 620 return 621 } 622 n := int(f.buf[0]) | int(f.buf[1])<<8 623 nn := int(f.buf[2]) | int(f.buf[3])<<8 624 if uint16(nn) != uint16(^n) { 625 f.err = CorruptInputError(f.roffset) 626 return 627 } 628 629 if n == 0 { 630 // 0-length block means sync 631 f.flush((*decompressor).nextBlock) 632 return 633 } 634 635 f.copyLen = n 636 f.copyData() 637 } 638 639 // copyData copies f.copyLen bytes from the underlying reader into f.hist. 
640 // It pauses for reads when f.hist is full. 641 func (f *decompressor) copyData() { 642 n := f.copyLen 643 for n > 0 { 644 m := len(f.hist) - f.hp 645 if m > n { 646 m = n 647 } 648 m, err := io.ReadFull(f.r, f.hist[f.hp:f.hp+m]) 649 f.roffset += int64(m) 650 if err != nil { 651 f.err = &ReadError{f.roffset, err} 652 return 653 } 654 n -= m 655 f.hp += m 656 if f.hp == len(f.hist) { 657 f.copyLen = n 658 f.flush((*decompressor).copyData) 659 return 660 } 661 } 662 f.step = (*decompressor).nextBlock 663 } 664 665 func (f *decompressor) setDict(dict []byte) { 666 if len(dict) > len(f.hist) { 667 // Will only remember the tail. 668 dict = dict[len(dict)-len(f.hist):] 669 } 670 671 f.hp = copy(f.hist[:], dict) 672 if f.hp == len(f.hist) { 673 f.hp = 0 674 f.hfull = true 675 } 676 f.hw = f.hp 677 } 678 679 func (f *decompressor) moreBits() error { 680 c, err := f.r.ReadByte() 681 if err != nil { 682 if err == io.EOF { 683 err = io.ErrUnexpectedEOF 684 } 685 return err 686 } 687 f.roffset++ 688 f.b |= uint32(c) << f.nb 689 f.nb += 8 690 return nil 691 } 692 693 // Read the next Huffman-encoded symbol from f according to h. 694 func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) { 695 n := uint(h.min) 696 for { 697 for f.nb < n { 698 if err := f.moreBits(); err != nil { 699 return 0, err 700 } 701 } 702 chunk := h.chunks[f.b&(huffmanNumChunks-1)] 703 n = uint(chunk & huffmanCountMask) 704 if n > huffmanChunkBits { 705 chunk = h.links[chunk>>huffmanValueShift][(f.b>>huffmanChunkBits)&h.linkMask] 706 n = uint(chunk & huffmanCountMask) 707 } 708 if n <= f.nb { 709 if n == 0 { 710 f.err = CorruptInputError(f.roffset) 711 return 0, f.err 712 } 713 f.b >>= n 714 f.nb -= n 715 return int(chunk >> huffmanValueShift), nil 716 } 717 } 718 } 719 720 // Flush any buffered output to the underlying writer. 
721 func (f *decompressor) flush(step func(*decompressor)) { 722 f.toRead = f.hist[f.hw:f.hp] 723 f.woffset += int64(f.hp - f.hw) 724 f.hw = f.hp 725 if f.hp == len(f.hist) { 726 f.hp = 0 727 f.hw = 0 728 f.hfull = true 729 } 730 f.step = step 731 } 732 733 func makeReader(r io.Reader) Reader { 734 if rr, ok := r.(Reader); ok { 735 return rr 736 } 737 return bufio.NewReader(r) 738 } 739 740 func (f *decompressor) Reset(r io.Reader, dict []byte) error { 741 *f = decompressor{ 742 r: makeReader(r), 743 bits: f.bits, 744 codebits: f.codebits, 745 hist: f.hist, 746 step: (*decompressor).nextBlock, 747 } 748 if dict != nil { 749 f.setDict(dict) 750 } 751 return nil 752 } 753 754 // NewReader returns a new ReadCloser that can be used 755 // to read the uncompressed version of r. 756 // If r does not also implement io.ByteReader, 757 // the decompressor may read more data than necessary from r. 758 // It is the caller's responsibility to call Close on the ReadCloser 759 // when finished reading. 760 // 761 // The ReadCloser returned by NewReader also implements Resetter. 762 func NewReader(r io.Reader) io.ReadCloser { 763 var f decompressor 764 f.bits = new([maxLit + maxDist]int) 765 f.codebits = new([numCodes]int) 766 f.r = makeReader(r) 767 f.hist = new([maxHist]byte) 768 f.step = (*decompressor).nextBlock 769 return &f 770 } 771 772 // NewReaderDict is like NewReader but initializes the reader 773 // with a preset dictionary. The returned Reader behaves as if 774 // the uncompressed data stream started with the given dictionary, 775 // which has already been read. NewReaderDict is typically used 776 // to read data compressed by NewWriterDict. 777 // 778 // The ReadCloser returned by NewReader also implements Resetter. 
779 func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser { 780 var f decompressor 781 f.r = makeReader(r) 782 f.hist = new([maxHist]byte) 783 f.bits = new([maxLit + maxDist]int) 784 f.codebits = new([numCodes]int) 785 f.step = (*decompressor).nextBlock 786 f.setDict(dict) 787 return &f 788 }