github.com/peggyl/go@v0.0.0-20151008231540-ae315999c2d5/src/compress/bzip2/bzip2.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package bzip2 implements bzip2 decompression.
     6  package bzip2
     7  
     8  import "io"
     9  
    10  // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
    11  // of guessing: http://en.wikipedia.org/wiki/Bzip2
    12  // The source code to pyflate was useful for debugging:
    13  // http://www.paul.sladen.org/projects/pyflate
    14  
    15  // A StructuralError is returned when the bzip2 data is found to be
    16  // syntactically invalid.
    17  type StructuralError string
    18  
    19  func (s StructuralError) Error() string {
    20  	return "bzip2 data invalid: " + string(s)
    21  }
    22  
    23  // A reader decompresses bzip2 compressed data.
    24  type reader struct {
    25  	br           bitReader
    26  	fileCRC      uint32
    27  	blockCRC     uint32
    28  	wantBlockCRC uint32
    29  	setupDone    bool // true if we have parsed the bzip2 header.
    30  	blockSize    int  // blockSize in bytes, i.e. 900 * 1024.
    31  	eof          bool
    32  	buf          []byte    // stores Burrows-Wheeler transformed data.
    33  	c            [256]uint // the `C' array for the inverse BWT.
    34  	tt           []uint32  // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
    35  	tPos         uint32    // Index of the next output byte in tt.
    36  
    37  	preRLE      []uint32 // contains the RLE data still to be processed.
    38  	preRLEUsed  int      // number of entries of preRLE used.
    39  	lastByte    int      // the last byte value seen.
    40  	byteRepeats uint     // the number of repeats of lastByte seen.
    41  	repeats     uint     // the number of copies of lastByte to output.
    42  }
    43  
    44  // NewReader returns an io.Reader which decompresses bzip2 data from r.
    45  // If r does not also implement io.ByteReader,
    46  // the decompressor may read more data than necessary from r.
    47  func NewReader(r io.Reader) io.Reader {
    48  	bz2 := new(reader)
    49  	bz2.br = newBitReader(r)
    50  	return bz2
    51  }
    52  
    53  const bzip2FileMagic = 0x425a // "BZ"
    54  const bzip2BlockMagic = 0x314159265359
    55  const bzip2FinalMagic = 0x177245385090
    56  
    57  // setup parses the bzip2 header.
    58  func (bz2 *reader) setup(needMagic bool) error {
    59  	br := &bz2.br
    60  
    61  	if needMagic {
    62  		magic := br.ReadBits(16)
    63  		if magic != bzip2FileMagic {
    64  			return StructuralError("bad magic value")
    65  		}
    66  	}
    67  
    68  	t := br.ReadBits(8)
    69  	if t != 'h' {
    70  		return StructuralError("non-Huffman entropy encoding")
    71  	}
    72  
    73  	level := br.ReadBits(8)
    74  	if level < '1' || level > '9' {
    75  		return StructuralError("invalid compression level")
    76  	}
    77  
    78  	bz2.fileCRC = 0
    79  	bz2.blockSize = 100 * 1024 * (int(level) - '0')
    80  	if bz2.blockSize > len(bz2.tt) {
    81  		bz2.tt = make([]uint32, bz2.blockSize)
    82  	}
    83  	return nil
    84  }
    85  
    86  func (bz2 *reader) Read(buf []byte) (n int, err error) {
    87  	if bz2.eof {
    88  		return 0, io.EOF
    89  	}
    90  
    91  	if !bz2.setupDone {
    92  		err = bz2.setup(true)
    93  		brErr := bz2.br.Err()
    94  		if brErr != nil {
    95  			err = brErr
    96  		}
    97  		if err != nil {
    98  			return 0, err
    99  		}
   100  		bz2.setupDone = true
   101  	}
   102  
   103  	n, err = bz2.read(buf)
   104  	brErr := bz2.br.Err()
   105  	if brErr != nil {
   106  		err = brErr
   107  	}
   108  	return
   109  }
   110  
   111  func (bz2 *reader) readFromBlock(buf []byte) int {
   112  	// bzip2 is a block based compressor, except that it has a run-length
   113  	// preprocessing step. The block based nature means that we can
   114  	// preallocate fixed-size buffers and reuse them. However, the RLE
   115  	// preprocessing would require allocating huge buffers to store the
   116  	// maximum expansion. Thus we process blocks all at once, except for
   117  	// the RLE which we decompress as required.
   118  	n := 0
   119  	for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
   120  		// We have RLE data pending.
   121  
   122  		// The run-length encoding works like this:
   123  		// Any sequence of four equal bytes is followed by a length
   124  		// byte which contains the number of repeats of that byte to
   125  		// include. (The number of repeats can be zero.) Because we are
   126  		// decompressing on-demand our state is kept in the reader
   127  		// object.
   128  
   129  		if bz2.repeats > 0 {
   130  			buf[n] = byte(bz2.lastByte)
   131  			n++
   132  			bz2.repeats--
   133  			if bz2.repeats == 0 {
   134  				bz2.lastByte = -1
   135  			}
   136  			continue
   137  		}
   138  
   139  		bz2.tPos = bz2.preRLE[bz2.tPos]
   140  		b := byte(bz2.tPos)
   141  		bz2.tPos >>= 8
   142  		bz2.preRLEUsed++
   143  
   144  		if bz2.byteRepeats == 3 {
   145  			bz2.repeats = uint(b)
   146  			bz2.byteRepeats = 0
   147  			continue
   148  		}
   149  
   150  		if bz2.lastByte == int(b) {
   151  			bz2.byteRepeats++
   152  		} else {
   153  			bz2.byteRepeats = 0
   154  		}
   155  		bz2.lastByte = int(b)
   156  
   157  		buf[n] = b
   158  		n++
   159  	}
   160  
   161  	return n
   162  }
   163  
   164  func (bz2 *reader) read(buf []byte) (int, error) {
   165  	for {
   166  		n := bz2.readFromBlock(buf)
   167  		if n > 0 {
   168  			bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
   169  			return n, nil
   170  		}
   171  
   172  		// End of block. Check CRC.
   173  		if bz2.blockCRC != bz2.wantBlockCRC {
   174  			bz2.br.err = StructuralError("block checksum mismatch")
   175  			return 0, bz2.br.err
   176  		}
   177  
   178  		// Find next block.
   179  		br := &bz2.br
   180  		switch br.ReadBits64(48) {
   181  		default:
   182  			return 0, StructuralError("bad magic value found")
   183  
   184  		case bzip2BlockMagic:
   185  			// Start of block.
   186  			err := bz2.readBlock()
   187  			if err != nil {
   188  				return 0, err
   189  			}
   190  
   191  		case bzip2FinalMagic:
   192  			// Check end-of-file CRC.
   193  			wantFileCRC := uint32(br.ReadBits64(32))
   194  			if br.err != nil {
   195  				return 0, br.err
   196  			}
   197  			if bz2.fileCRC != wantFileCRC {
   198  				br.err = StructuralError("file checksum mismatch")
   199  				return 0, br.err
   200  			}
   201  
   202  			// Skip ahead to byte boundary.
   203  			// Is there a file concatenated to this one?
   204  			// It would start with BZ.
   205  			if br.bits%8 != 0 {
   206  				br.ReadBits(br.bits % 8)
   207  			}
   208  			b, err := br.r.ReadByte()
   209  			if err == io.EOF {
   210  				br.err = io.EOF
   211  				bz2.eof = true
   212  				return 0, io.EOF
   213  			}
   214  			if err != nil {
   215  				br.err = err
   216  				return 0, err
   217  			}
   218  			z, err := br.r.ReadByte()
   219  			if err != nil {
   220  				if err == io.EOF {
   221  					err = io.ErrUnexpectedEOF
   222  				}
   223  				br.err = err
   224  				return 0, err
   225  			}
   226  			if b != 'B' || z != 'Z' {
   227  				return 0, StructuralError("bad magic value in continuation file")
   228  			}
   229  			if err := bz2.setup(false); err != nil {
   230  				return 0, err
   231  			}
   232  		}
   233  	}
   234  }
   235  
   236  // readBlock reads a bzip2 block. The magic number should already have been consumed.
   237  func (bz2 *reader) readBlock() (err error) {
   238  	br := &bz2.br
   239  	bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
   240  	bz2.blockCRC = 0
   241  	bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
   242  	randomized := br.ReadBits(1)
   243  	if randomized != 0 {
   244  		return StructuralError("deprecated randomized files")
   245  	}
   246  	origPtr := uint(br.ReadBits(24))
   247  
   248  	// If not every byte value is used in the block (i.e., it's text) then
   249  	// the symbol set is reduced. The symbols used are stored as a
   250  	// two-level, 16x16 bitmap.
   251  	symbolRangeUsedBitmap := br.ReadBits(16)
   252  	symbolPresent := make([]bool, 256)
   253  	numSymbols := 0
   254  	for symRange := uint(0); symRange < 16; symRange++ {
   255  		if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
   256  			bits := br.ReadBits(16)
   257  			for symbol := uint(0); symbol < 16; symbol++ {
   258  				if bits&(1<<(15-symbol)) != 0 {
   259  					symbolPresent[16*symRange+symbol] = true
   260  					numSymbols++
   261  				}
   262  			}
   263  		}
   264  	}
   265  
   266  	if numSymbols == 0 {
   267  		// There must be an EOF symbol.
   268  		return StructuralError("no symbols in input")
   269  	}
   270  
   271  	// A block uses between two and six different Huffman trees.
   272  	numHuffmanTrees := br.ReadBits(3)
   273  	if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
   274  		return StructuralError("invalid number of Huffman trees")
   275  	}
   276  
   277  	// The Huffman tree can switch every 50 symbols so there's a list of
   278  	// tree indexes telling us which tree to use for each 50 symbol block.
   279  	numSelectors := br.ReadBits(15)
   280  	treeIndexes := make([]uint8, numSelectors)
   281  
   282  	// The tree indexes are move-to-front transformed and stored as unary
   283  	// numbers.
   284  	mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
   285  	for i := range treeIndexes {
   286  		c := 0
   287  		for {
   288  			inc := br.ReadBits(1)
   289  			if inc == 0 {
   290  				break
   291  			}
   292  			c++
   293  		}
   294  		if c >= numHuffmanTrees {
   295  			return StructuralError("tree index too large")
   296  		}
   297  		treeIndexes[i] = uint8(mtfTreeDecoder.Decode(c))
   298  	}
   299  
   300  	// The list of symbols for the move-to-front transform is taken from
   301  	// the previously decoded symbol bitmap.
   302  	symbols := make([]byte, numSymbols)
   303  	nextSymbol := 0
   304  	for i := 0; i < 256; i++ {
   305  		if symbolPresent[i] {
   306  			symbols[nextSymbol] = byte(i)
   307  			nextSymbol++
   308  		}
   309  	}
   310  	mtf := newMTFDecoder(symbols)
   311  
   312  	numSymbols += 2 // to account for RUNA and RUNB symbols
   313  	huffmanTrees := make([]huffmanTree, numHuffmanTrees)
   314  
   315  	// Now we decode the arrays of code-lengths for each tree.
   316  	lengths := make([]uint8, numSymbols)
   317  	for i := range huffmanTrees {
   318  		// The code lengths are delta encoded from a 5-bit base value.
   319  		length := br.ReadBits(5)
   320  		for j := range lengths {
   321  			for {
   322  				if !br.ReadBit() {
   323  					break
   324  				}
   325  				if br.ReadBit() {
   326  					length--
   327  				} else {
   328  					length++
   329  				}
   330  			}
   331  			if length < 0 || length > 20 {
   332  				return StructuralError("Huffman length out of range")
   333  			}
   334  			lengths[j] = uint8(length)
   335  		}
   336  		huffmanTrees[i], err = newHuffmanTree(lengths)
   337  		if err != nil {
   338  			return err
   339  		}
   340  	}
   341  
   342  	selectorIndex := 1 // the next tree index to use
   343  	if len(treeIndexes) == 0 {
   344  		return StructuralError("no tree selectors given")
   345  	}
   346  	if int(treeIndexes[0]) >= len(huffmanTrees) {
   347  		return StructuralError("tree selector out of range")
   348  	}
   349  	currentHuffmanTree := huffmanTrees[treeIndexes[0]]
   350  	bufIndex := 0 // indexes bz2.buf, the output buffer.
   351  	// The output of the move-to-front transform is run-length encoded and
   352  	// we merge the decoding into the Huffman parsing loop. These two
   353  	// variables accumulate the repeat count. See the Wikipedia page for
   354  	// details.
   355  	repeat := 0
   356  	repeatPower := 0
   357  
   358  	// The `C' array (used by the inverse BWT) needs to be zero initialized.
   359  	for i := range bz2.c {
   360  		bz2.c[i] = 0
   361  	}
   362  
   363  	decoded := 0 // counts the number of symbols decoded by the current tree.
   364  	for {
   365  		if decoded == 50 {
   366  			if selectorIndex >= numSelectors {
   367  				return StructuralError("insufficient selector indices for number of symbols")
   368  			}
   369  			if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
   370  				return StructuralError("tree selector out of range")
   371  			}
   372  			currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
   373  			selectorIndex++
   374  			decoded = 0
   375  		}
   376  
   377  		v := currentHuffmanTree.Decode(br)
   378  		decoded++
   379  
   380  		if v < 2 {
   381  			// This is either the RUNA or RUNB symbol.
   382  			if repeat == 0 {
   383  				repeatPower = 1
   384  			}
   385  			repeat += repeatPower << v
   386  			repeatPower <<= 1
   387  
   388  			// This limit of 2 million comes from the bzip2 source
   389  			// code. It prevents repeat from overflowing.
   390  			if repeat > 2*1024*1024 {
   391  				return StructuralError("repeat count too large")
   392  			}
   393  			continue
   394  		}
   395  
   396  		if repeat > 0 {
   397  			// We have decoded a complete run-length so we need to
   398  			// replicate the last output symbol.
   399  			if repeat > bz2.blockSize-bufIndex {
   400  				return StructuralError("repeats past end of block")
   401  			}
   402  			for i := 0; i < repeat; i++ {
   403  				b := byte(mtf.First())
   404  				bz2.tt[bufIndex] = uint32(b)
   405  				bz2.c[b]++
   406  				bufIndex++
   407  			}
   408  			repeat = 0
   409  		}
   410  
   411  		if int(v) == numSymbols-1 {
   412  			// This is the EOF symbol. Because it's always at the
   413  			// end of the move-to-front list, and never gets moved
   414  			// to the front, it has this unique value.
   415  			break
   416  		}
   417  
   418  		// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
   419  		// one would expect |v-2| to be passed to the MTF decoder.
   420  		// However, the front of the MTF list is never referenced as 0,
   421  		// it's always referenced with a run-length of 1. Thus 0
   422  		// doesn't need to be encoded and we have |v-1| in the next
   423  		// line.
   424  		b := byte(mtf.Decode(int(v - 1)))
   425  		if bufIndex >= bz2.blockSize {
   426  			return StructuralError("data exceeds block size")
   427  		}
   428  		bz2.tt[bufIndex] = uint32(b)
   429  		bz2.c[b]++
   430  		bufIndex++
   431  	}
   432  
   433  	if origPtr >= uint(bufIndex) {
   434  		return StructuralError("origPtr out of bounds")
   435  	}
   436  
   437  	// We have completed the entropy decoding. Now we can perform the
   438  	// inverse BWT and setup the RLE buffer.
   439  	bz2.preRLE = bz2.tt[:bufIndex]
   440  	bz2.preRLEUsed = 0
   441  	bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
   442  	bz2.lastByte = -1
   443  	bz2.byteRepeats = 0
   444  	bz2.repeats = 0
   445  
   446  	return nil
   447  }
   448  
   449  // inverseBWT implements the inverse Burrows-Wheeler transform as described in
   450  // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
   451  // In that document, origPtr is called `I' and c is the `C' array after the
   452  // first pass over the data. It's an argument here because we merge the first
   453  // pass with the Huffman decoding.
   454  //
   455  // This also implements the `single array' method from the bzip2 source code
   456  // which leaves the output, still shuffled, in the bottom 8 bits of tt with the
   457  // index of the next byte in the top 24-bits. The index of the first byte is
   458  // returned.
   459  func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
   460  	sum := uint(0)
   461  	for i := 0; i < 256; i++ {
   462  		sum += c[i]
   463  		c[i] = sum - c[i]
   464  	}
   465  
   466  	for i := range tt {
   467  		b := tt[i] & 0xff
   468  		tt[c[b]] |= uint32(i) << 8
   469  		c[b]++
   470  	}
   471  
   472  	return tt[origPtr] >> 8
   473  }
   474  
   475  // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
   476  // causing the bits in the input to be processed in the reverse of the usual order.
   477  
   478  var crctab [256]uint32
   479  
   480  func init() {
   481  	const poly = 0x04C11DB7
   482  	for i := range crctab {
   483  		crc := uint32(i) << 24
   484  		for j := 0; j < 8; j++ {
   485  			if crc&0x80000000 != 0 {
   486  				crc = (crc << 1) ^ poly
   487  			} else {
   488  				crc <<= 1
   489  			}
   490  		}
   491  		crctab[i] = crc
   492  	}
   493  }
   494  
   495  // updateCRC updates the crc value to incorporate the data in b.
   496  // The initial value is 0.
   497  func updateCRC(val uint32, b []byte) uint32 {
   498  	crc := ^val
   499  	for _, v := range b {
   500  		crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
   501  	}
   502  	return ^crc
   503  }