github.com/ader1990/go@v0.0.0-20140630135419-8c24447fa791/src/pkg/compress/flate/inflate.go (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package flate implements the DEFLATE compressed data format, described in
     6  // RFC 1951.  The gzip and zlib packages implement access to DEFLATE-based file
     7  // formats.
     8  package flate
     9  
    10  import (
    11  	"bufio"
    12  	"io"
    13  	"strconv"
    14  )
    15  
// Size limits shared by the decoder.
const (
	// maxCodeLen sizes the per-length count and nextcode arrays in
	// huffmanDecoder.init, so every code length must be below it.
	maxCodeLen = 16    // max length of Huffman code
	// maxHist is the size of the decompressor's output history buffer.
	maxHist    = 32768 // max history required
	// The next three numbers come from the RFC, section 3.2.7.
	// maxLit bounds the number of literal/length code lengths, maxDist the
	// number of distance code lengths; together they size decompressor.bits.
	maxLit   = 286
	maxDist  = 32
	// numCodes sizes decompressor.codebits.
	numCodes = 19 // number of codes in Huffman meta-code
)
    24  
    25  // A CorruptInputError reports the presence of corrupt input at a given offset.
    26  type CorruptInputError int64
    27  
    28  func (e CorruptInputError) Error() string {
    29  	return "flate: corrupt input before offset " + strconv.FormatInt(int64(e), 10)
    30  }
    31  
    32  // An InternalError reports an error in the flate code itself.
    33  type InternalError string
    34  
    35  func (e InternalError) Error() string { return "flate: internal error: " + string(e) }
    36  
    37  // A ReadError reports an error encountered while reading input.
    38  type ReadError struct {
    39  	Offset int64 // byte offset where error occurred
    40  	Err    error // error returned by underlying Read
    41  }
    42  
    43  func (e *ReadError) Error() string {
    44  	return "flate: read error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error()
    45  }
    46  
    47  // A WriteError reports an error encountered while writing output.
    48  type WriteError struct {
    49  	Offset int64 // byte offset where error occurred
    50  	Err    error // error returned by underlying Write
    51  }
    52  
    53  func (e *WriteError) Error() string {
    54  	return "flate: write error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error()
    55  }
    56  
    57  // Note that much of the implementation of huffmanDecoder is also copied
    58  // into gen.go (in package main) for the purpose of precomputing the
    59  // fixed huffman tables so they can be included statically.
    60  
    61  // The data structure for decoding Huffman tables is based on that of
// zlib. There is a lookup table of a fixed bit width (huffmanChunkBits).
    63  // For codes smaller than the table width, there are multiple entries
    64  // (each combination of trailing bits has the same value). For codes
    65  // larger than the table width, the table contains a link to an overflow
    66  // table. The width of each entry in the link table is the maximum code
    67  // size minus the chunk width.
    68  
    69  // Note that you can do a lookup in the table even without all bits
    70  // filled. Since the extra bits are zero, and the DEFLATE Huffman codes
    71  // have the property that shorter codes come before longer ones, the
    72  // bit length estimate in the result is a lower bound on the actual
    73  // number of bits.
    74  
    75  // chunk & 15 is number of bits
    76  // chunk >> 4 is value, including table link
    77  
// Layout of the Huffman lookup tables and of the uint32 chunks stored in them.
const (
	// huffmanChunkBits is the number of input bits used to index the
	// primary lookup table; longer codes go through a link table.
	huffmanChunkBits  = 9
	// huffmanNumChunks is the number of entries in the primary table.
	huffmanNumChunks  = 1 << huffmanChunkBits
	// huffmanCountMask extracts the code length in bits from a chunk.
	huffmanCountMask  = 15
	// huffmanValueShift is the right shift that extracts the value
	// (a symbol, or a link table index) from a chunk.
	huffmanValueShift = 4
)
    84  
// huffmanDecoder holds the lookup tables for one Huffman code. It is filled
// in by init and consulted by decompressor.huffSym. A min of zero is what
// init uses to recognize a decoder that has not been set up yet.
type huffmanDecoder struct {
	min      int                      // the minimum code length
	chunks   [huffmanNumChunks]uint32 // chunks as described above
	links    [][]uint32               // overflow links
	linkMask uint32                   // mask the width of the link table
}
    91  
    92  // Initialize Huffman decoding tables from array of code lengths.
    93  func (h *huffmanDecoder) init(bits []int) bool {
    94  	if h.min != 0 {
    95  		*h = huffmanDecoder{}
    96  	}
    97  
    98  	// Count number of codes of each length,
    99  	// compute min and max length.
   100  	var count [maxCodeLen]int
   101  	var min, max int
   102  	for _, n := range bits {
   103  		if n == 0 {
   104  			continue
   105  		}
   106  		if min == 0 || n < min {
   107  			min = n
   108  		}
   109  		if n > max {
   110  			max = n
   111  		}
   112  		count[n]++
   113  	}
   114  	if max == 0 {
   115  		return false
   116  	}
   117  
   118  	h.min = min
   119  	var linkBits uint
   120  	var numLinks int
   121  	if max > huffmanChunkBits {
   122  		linkBits = uint(max) - huffmanChunkBits
   123  		numLinks = 1 << linkBits
   124  		h.linkMask = uint32(numLinks - 1)
   125  	}
   126  	code := 0
   127  	var nextcode [maxCodeLen]int
   128  	for i := min; i <= max; i++ {
   129  		if i == huffmanChunkBits+1 {
   130  			// create link tables
   131  			link := code >> 1
   132  			if huffmanNumChunks < link {
   133  				return false
   134  			}
   135  			h.links = make([][]uint32, huffmanNumChunks-link)
   136  			for j := uint(link); j < huffmanNumChunks; j++ {
   137  				reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8
   138  				reverse >>= uint(16 - huffmanChunkBits)
   139  				off := j - uint(link)
   140  				h.chunks[reverse] = uint32(off<<huffmanValueShift + uint(i))
   141  				h.links[off] = make([]uint32, 1<<linkBits)
   142  			}
   143  		}
   144  		n := count[i]
   145  		nextcode[i] = code
   146  		code += n
   147  		code <<= 1
   148  	}
   149  
   150  	for i, n := range bits {
   151  		if n == 0 {
   152  			continue
   153  		}
   154  		code := nextcode[n]
   155  		nextcode[n]++
   156  		chunk := uint32(i<<huffmanValueShift | n)
   157  		reverse := int(reverseByte[code>>8]) | int(reverseByte[code&0xff])<<8
   158  		reverse >>= uint(16 - n)
   159  		if n <= huffmanChunkBits {
   160  			for off := reverse; off < huffmanNumChunks; off += 1 << uint(n) {
   161  				h.chunks[off] = chunk
   162  			}
   163  		} else {
   164  			value := h.chunks[reverse&(huffmanNumChunks-1)] >> huffmanValueShift
   165  			if value >= uint32(len(h.links)) {
   166  				return false
   167  			}
   168  			linktab := h.links[value]
   169  			reverse >>= huffmanChunkBits
   170  			for off := reverse; off < numLinks; off += 1 << uint(n-huffmanChunkBits) {
   171  				linktab[off] = chunk
   172  			}
   173  		}
   174  	}
   175  	return true
   176  }
   177  
// Reader is the actual read interface needed by NewReader: block reads
// through io.Reader plus single-byte reads through io.ByteReader.
// If the passed in io.Reader does not also have ReadByte,
// the NewReader will introduce its own buffering.
type Reader interface {
	io.Reader
	io.ByteReader
}
   185  
// decompressor holds the whole state of one DEFLATE stream being decoded.
// Decoding is driven by Read, which repeatedly calls the step function until
// output is available in toRead or err is set.
type decompressor struct {
	// Input source.
	// roffset counts the input bytes consumed so far and is the offset
	// reported in errors; woffset counts the output bytes passed to flush.
	r       Reader
	roffset int64
	woffset int64

	// Input bits, in top of b.
	// moreBits shifts each new byte in above the nb bits already held;
	// consumers take bits from the low end.
	b  uint32
	nb uint

	// Huffman decoders for literal/length, distance.
	// h1 also temporarily holds the code length code in readHuffman.
	h1, h2 huffmanDecoder

	// Length arrays used to define Huffman codes.
	bits     *[maxLit + maxDist]int
	codebits *[numCodes]int

	// Output history, buffer.
	hist  *[maxHist]byte
	hp    int  // current output position in buffer
	hw    int  // have written hist[0:hw] already
	hfull bool // buffer has filled at least once

	// Temporary buffer (avoids repeated allocation).
	buf [4]byte

	// Next step in the decompression,
	// and decompression state.
	// final records that the current block has its final bit set; toRead
	// is the output waiting to be returned by Read; hl and hd are the
	// decoders of the current Huffman block (hd is nil for fixed blocks);
	// copyLen and copyDist describe a copy that may be resumed after a flush.
	step     func(*decompressor)
	final    bool
	err      error
	toRead   []byte
	hl, hd   *huffmanDecoder
	copyLen  int
	copyDist int
}
   223  
   224  func (f *decompressor) nextBlock() {
   225  	if f.final {
   226  		if f.hw != f.hp {
   227  			f.flush((*decompressor).nextBlock)
   228  			return
   229  		}
   230  		f.err = io.EOF
   231  		return
   232  	}
   233  	for f.nb < 1+2 {
   234  		if f.err = f.moreBits(); f.err != nil {
   235  			return
   236  		}
   237  	}
   238  	f.final = f.b&1 == 1
   239  	f.b >>= 1
   240  	typ := f.b & 3
   241  	f.b >>= 2
   242  	f.nb -= 1 + 2
   243  	switch typ {
   244  	case 0:
   245  		f.dataBlock()
   246  	case 1:
   247  		// compressed, fixed Huffman tables
   248  		f.hl = &fixedHuffmanDecoder
   249  		f.hd = nil
   250  		f.huffmanBlock()
   251  	case 2:
   252  		// compressed, dynamic Huffman tables
   253  		if f.err = f.readHuffman(); f.err != nil {
   254  			break
   255  		}
   256  		f.hl = &f.h1
   257  		f.hd = &f.h2
   258  		f.huffmanBlock()
   259  	default:
   260  		// 3 is reserved.
   261  		f.err = CorruptInputError(f.roffset)
   262  	}
   263  }
   264  
   265  func (f *decompressor) Read(b []byte) (int, error) {
   266  	for {
   267  		if len(f.toRead) > 0 {
   268  			n := copy(b, f.toRead)
   269  			f.toRead = f.toRead[n:]
   270  			return n, nil
   271  		}
   272  		if f.err != nil {
   273  			return 0, f.err
   274  		}
   275  		f.step(f)
   276  	}
   277  }
   278  
   279  func (f *decompressor) Close() error {
   280  	if f.err == io.EOF {
   281  		return nil
   282  	}
   283  	return f.err
   284  }
   285  
// RFC 1951 section 3.2.7.
// Compression with dynamic Huffman codes

// codeOrder lists the symbols of the code length code in the order in
// which their 3-bit lengths are read by readHuffman.
var codeOrder = [...]int{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
   290  
// readHuffman reads the header of a dynamic Huffman block and sets up
// f.h1 (literal/length) and f.h2 (distance) from it. While parsing, f.h1
// temporarily holds the code length code. It returns a CorruptInputError
// for malformed headers and passes on errors from moreBits.
func (f *decompressor) readHuffman() error {
	// HLIT[5], HDIST[5], HCLEN[4].
	for f.nb < 5+5+4 {
		if err := f.moreBits(); err != nil {
			return err
		}
	}
	nlit := int(f.b&0x1F) + 257
	if nlit > maxLit {
		return CorruptInputError(f.roffset)
	}
	f.b >>= 5
	ndist := int(f.b&0x1F) + 1
	// maxDist is 32, so ndist is always valid.
	f.b >>= 5
	nclen := int(f.b&0xF) + 4
	// numCodes is 19, so nclen is always valid.
	f.b >>= 4
	f.nb -= 5 + 5 + 4

	// (HCLEN+4)*3 bits: code lengths in the magic codeOrder order.
	for i := 0; i < nclen; i++ {
		for f.nb < 3 {
			if err := f.moreBits(); err != nil {
				return err
			}
		}
		f.codebits[codeOrder[i]] = int(f.b & 0x7)
		f.b >>= 3
		f.nb -= 3
	}
	// Lengths that were not transmitted are zero; codebits may still hold
	// values from an earlier block.
	for i := nclen; i < len(codeOrder); i++ {
		f.codebits[codeOrder[i]] = 0
	}
	if !f.h1.init(f.codebits[0:]) {
		return CorruptInputError(f.roffset)
	}

	// HLIT + 257 code lengths, HDIST + 1 code lengths,
	// using the code length Huffman code.
	// Both sets are decoded as one run of n lengths into f.bits, so a
	// repeat may cross from the literal/length set into the distance set.
	for i, n := 0, nlit+ndist; i < n; {
		x, err := f.huffSym(&f.h1)
		if err != nil {
			return err
		}
		if x < 16 {
			// Actual length.
			f.bits[i] = x
			i++
			continue
		}
		// Repeat previous length or zero.
		// rep is the minimum repeat count, nb the number of extra bits
		// added to it, and b the length value being repeated.
		var rep int
		var nb uint
		var b int
		switch x {
		default:
			return InternalError("unexpected length code")
		case 16:
			rep = 3
			nb = 2
			if i == 0 {
				// There is no previous length to repeat.
				return CorruptInputError(f.roffset)
			}
			b = f.bits[i-1]
		case 17:
			rep = 3
			nb = 3
			b = 0
		case 18:
			rep = 11
			nb = 7
			b = 0
		}
		for f.nb < nb {
			if err := f.moreBits(); err != nil {
				return err
			}
		}
		rep += int(f.b & uint32(1<<nb-1))
		f.b >>= nb
		f.nb -= nb
		if i+rep > n {
			// The repeat would run past the announced number of lengths.
			return CorruptInputError(f.roffset)
		}
		for j := 0; j < rep; j++ {
			f.bits[i] = b
			i++
		}
	}

	// Replace the code length code in h1 by the literal/length code and
	// build the distance code in h2.
	if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) {
		return CorruptInputError(f.roffset)
	}

	return nil
}
   388  
   389  // Decode a single Huffman block from f.
   390  // hl and hd are the Huffman states for the lit/length values
   391  // and the distance values, respectively.  If hd == nil, using the
   392  // fixed distance encoding associated with fixed Huffman blocks.
   393  func (f *decompressor) huffmanBlock() {
   394  	for {
   395  		v, err := f.huffSym(f.hl)
   396  		if err != nil {
   397  			f.err = err
   398  			return
   399  		}
   400  		var n uint // number of bits extra
   401  		var length int
   402  		switch {
   403  		case v < 256:
   404  			f.hist[f.hp] = byte(v)
   405  			f.hp++
   406  			if f.hp == len(f.hist) {
   407  				// After the flush, continue this loop.
   408  				f.flush((*decompressor).huffmanBlock)
   409  				return
   410  			}
   411  			continue
   412  		case v == 256:
   413  			// Done with huffman block; read next block.
   414  			f.step = (*decompressor).nextBlock
   415  			return
   416  		// otherwise, reference to older data
   417  		case v < 265:
   418  			length = v - (257 - 3)
   419  			n = 0
   420  		case v < 269:
   421  			length = v*2 - (265*2 - 11)
   422  			n = 1
   423  		case v < 273:
   424  			length = v*4 - (269*4 - 19)
   425  			n = 2
   426  		case v < 277:
   427  			length = v*8 - (273*8 - 35)
   428  			n = 3
   429  		case v < 281:
   430  			length = v*16 - (277*16 - 67)
   431  			n = 4
   432  		case v < 285:
   433  			length = v*32 - (281*32 - 131)
   434  			n = 5
   435  		default:
   436  			length = 258
   437  			n = 0
   438  		}
   439  		if n > 0 {
   440  			for f.nb < n {
   441  				if err = f.moreBits(); err != nil {
   442  					f.err = err
   443  					return
   444  				}
   445  			}
   446  			length += int(f.b & uint32(1<<n-1))
   447  			f.b >>= n
   448  			f.nb -= n
   449  		}
   450  
   451  		var dist int
   452  		if f.hd == nil {
   453  			for f.nb < 5 {
   454  				if err = f.moreBits(); err != nil {
   455  					f.err = err
   456  					return
   457  				}
   458  			}
   459  			dist = int(reverseByte[(f.b&0x1F)<<3])
   460  			f.b >>= 5
   461  			f.nb -= 5
   462  		} else {
   463  			if dist, err = f.huffSym(f.hd); err != nil {
   464  				f.err = err
   465  				return
   466  			}
   467  		}
   468  
   469  		switch {
   470  		case dist < 4:
   471  			dist++
   472  		case dist >= 30:
   473  			f.err = CorruptInputError(f.roffset)
   474  			return
   475  		default:
   476  			nb := uint(dist-2) >> 1
   477  			// have 1 bit in bottom of dist, need nb more.
   478  			extra := (dist & 1) << nb
   479  			for f.nb < nb {
   480  				if err = f.moreBits(); err != nil {
   481  					f.err = err
   482  					return
   483  				}
   484  			}
   485  			extra |= int(f.b & uint32(1<<nb-1))
   486  			f.b >>= nb
   487  			f.nb -= nb
   488  			dist = 1<<(nb+1) + 1 + extra
   489  		}
   490  
   491  		// Copy history[-dist:-dist+length] into output.
   492  		if dist > len(f.hist) {
   493  			f.err = InternalError("bad history distance")
   494  			return
   495  		}
   496  
   497  		// No check on length; encoding can be prescient.
   498  		if !f.hfull && dist > f.hp {
   499  			f.err = CorruptInputError(f.roffset)
   500  			return
   501  		}
   502  
   503  		f.copyLen, f.copyDist = length, dist
   504  		if f.copyHist() {
   505  			return
   506  		}
   507  	}
   508  }
   509  
// copyHist copies f.copyLen bytes from f.hist (f.copyDist bytes ago) to itself.
// It reports whether the f.hist buffer is full.
// When it returns true it has already called flush with copyHuff as the next
// step, and f.copyLen holds the number of bytes still to be copied.
func (f *decompressor) copyHist() bool {
	// p is the source position; wrap around when it lies before the
	// start of the buffer.
	p := f.hp - f.copyDist
	if p < 0 {
		p += len(f.hist)
	}
	for f.copyLen > 0 {
		// Copy at most up to the end of the buffer on both the
		// destination and the source side.
		n := f.copyLen
		if x := len(f.hist) - f.hp; n > x {
			n = x
		}
		if x := len(f.hist) - p; n > x {
			n = x
		}
		// forwardCopy is defined elsewhere; presumably it copies n bytes
		// from position p to position f.hp front to back, so overlapping
		// ranges repeat data — TODO confirm.
		forwardCopy(f.hist[:], f.hp, p, n)
		p += n
		f.hp += n
		f.copyLen -= n
		if f.hp == len(f.hist) {
			// After flush continue copying out of history.
			f.flush((*decompressor).copyHuff)
			return true
		}
		if p == len(f.hist) {
			p = 0
		}
	}
	return false
}
   540  
   541  func (f *decompressor) copyHuff() {
   542  	if f.copyHist() {
   543  		return
   544  	}
   545  	f.huffmanBlock()
   546  }
   547  
   548  // Copy a single uncompressed data block from input to output.
   549  func (f *decompressor) dataBlock() {
   550  	// Uncompressed.
   551  	// Discard current half-byte.
   552  	f.nb = 0
   553  	f.b = 0
   554  
   555  	// Length then ones-complement of length.
   556  	nr, err := io.ReadFull(f.r, f.buf[0:4])
   557  	f.roffset += int64(nr)
   558  	if err != nil {
   559  		f.err = &ReadError{f.roffset, err}
   560  		return
   561  	}
   562  	n := int(f.buf[0]) | int(f.buf[1])<<8
   563  	nn := int(f.buf[2]) | int(f.buf[3])<<8
   564  	if uint16(nn) != uint16(^n) {
   565  		f.err = CorruptInputError(f.roffset)
   566  		return
   567  	}
   568  
   569  	if n == 0 {
   570  		// 0-length block means sync
   571  		f.flush((*decompressor).nextBlock)
   572  		return
   573  	}
   574  
   575  	f.copyLen = n
   576  	f.copyData()
   577  }
   578  
   579  // copyData copies f.copyLen bytes from the underlying reader into f.hist.
   580  // It pauses for reads when f.hist is full.
   581  func (f *decompressor) copyData() {
   582  	n := f.copyLen
   583  	for n > 0 {
   584  		m := len(f.hist) - f.hp
   585  		if m > n {
   586  			m = n
   587  		}
   588  		m, err := io.ReadFull(f.r, f.hist[f.hp:f.hp+m])
   589  		f.roffset += int64(m)
   590  		if err != nil {
   591  			f.err = &ReadError{f.roffset, err}
   592  			return
   593  		}
   594  		n -= m
   595  		f.hp += m
   596  		if f.hp == len(f.hist) {
   597  			f.copyLen = n
   598  			f.flush((*decompressor).copyData)
   599  			return
   600  		}
   601  	}
   602  	f.step = (*decompressor).nextBlock
   603  }
   604  
   605  func (f *decompressor) setDict(dict []byte) {
   606  	if len(dict) > len(f.hist) {
   607  		// Will only remember the tail.
   608  		dict = dict[len(dict)-len(f.hist):]
   609  	}
   610  
   611  	f.hp = copy(f.hist[:], dict)
   612  	if f.hp == len(f.hist) {
   613  		f.hp = 0
   614  		f.hfull = true
   615  	}
   616  	f.hw = f.hp
   617  }
   618  
   619  func (f *decompressor) moreBits() error {
   620  	c, err := f.r.ReadByte()
   621  	if err != nil {
   622  		if err == io.EOF {
   623  			err = io.ErrUnexpectedEOF
   624  		}
   625  		return err
   626  	}
   627  	f.roffset++
   628  	f.b |= uint32(c) << f.nb
   629  	f.nb += 8
   630  	return nil
   631  }
   632  
   633  // Read the next Huffman-encoded symbol from f according to h.
   634  func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
   635  	n := uint(h.min)
   636  	for {
   637  		for f.nb < n {
   638  			if err := f.moreBits(); err != nil {
   639  				return 0, err
   640  			}
   641  		}
   642  		chunk := h.chunks[f.b&(huffmanNumChunks-1)]
   643  		n = uint(chunk & huffmanCountMask)
   644  		if n > huffmanChunkBits {
   645  			chunk = h.links[chunk>>huffmanValueShift][(f.b>>huffmanChunkBits)&h.linkMask]
   646  			n = uint(chunk & huffmanCountMask)
   647  			if n == 0 {
   648  				f.err = CorruptInputError(f.roffset)
   649  				return 0, f.err
   650  			}
   651  		}
   652  		if n <= f.nb {
   653  			f.b >>= n
   654  			f.nb -= n
   655  			return int(chunk >> huffmanValueShift), nil
   656  		}
   657  	}
   658  }
   659  
   660  // Flush any buffered output to the underlying writer.
   661  func (f *decompressor) flush(step func(*decompressor)) {
   662  	f.toRead = f.hist[f.hw:f.hp]
   663  	f.woffset += int64(f.hp - f.hw)
   664  	f.hw = f.hp
   665  	if f.hp == len(f.hist) {
   666  		f.hp = 0
   667  		f.hw = 0
   668  		f.hfull = true
   669  	}
   670  	f.step = step
   671  }
   672  
   673  func makeReader(r io.Reader) Reader {
   674  	if rr, ok := r.(Reader); ok {
   675  		return rr
   676  	}
   677  	return bufio.NewReader(r)
   678  }
   679  
   680  // NewReader returns a new ReadCloser that can be used
   681  // to read the uncompressed version of r.  It is the caller's
   682  // responsibility to call Close on the ReadCloser when
   683  // finished reading.
   684  func NewReader(r io.Reader) io.ReadCloser {
   685  	var f decompressor
   686  	f.bits = new([maxLit + maxDist]int)
   687  	f.codebits = new([numCodes]int)
   688  	f.r = makeReader(r)
   689  	f.hist = new([maxHist]byte)
   690  	f.step = (*decompressor).nextBlock
   691  	return &f
   692  }
   693  
   694  // NewReaderDict is like NewReader but initializes the reader
   695  // with a preset dictionary.  The returned Reader behaves as if
   696  // the uncompressed data stream started with the given dictionary,
   697  // which has already been read.  NewReaderDict is typically used
   698  // to read data compressed by NewWriterDict.
   699  func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser {
   700  	var f decompressor
   701  	f.r = makeReader(r)
   702  	f.hist = new([maxHist]byte)
   703  	f.bits = new([maxLit + maxDist]int)
   704  	f.codebits = new([numCodes]int)
   705  	f.step = (*decompressor).nextBlock
   706  	f.setDict(dict)
   707  	return &f
   708  }