github.com/xushiwei/go@v0.0.0-20130601165731-2b9d83f45bc9/src/pkg/compress/bzip2/bzip2.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package bzip2 implements bzip2 decompression.
     6  package bzip2
     7  
     8  import "io"
     9  
    10  // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
    11  // of guessing: http://en.wikipedia.org/wiki/Bzip2
    12  // The source code to pyflate was useful for debugging:
    13  // http://www.paul.sladen.org/projects/pyflate
    14  
    15  // A StructuralError is returned when the bzip2 data is found to be
    16  // syntactically invalid.
    17  type StructuralError string
    18  
    19  func (s StructuralError) Error() string {
    20  	return "bzip2 data invalid: " + string(s)
    21  }
    22  
    23  // A reader decompresses bzip2 compressed data.
    24  type reader struct {
    25  	br        bitReader
    26  	setupDone bool // true if we have parsed the bzip2 header.
    27  	blockSize int  // blockSize in bytes, i.e. 900 * 1024.
    28  	eof       bool
    29  	buf       []byte    // stores Burrows-Wheeler transformed data.
    30  	c         [256]uint // the `C' array for the inverse BWT.
    31  	tt        []uint32  // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
    32  	tPos      uint32    // Index of the next output byte in tt.
    33  
    34  	preRLE      []uint32 // contains the RLE data still to be processed.
    35  	preRLEUsed  int      // number of entries of preRLE used.
    36  	lastByte    int      // the last byte value seen.
    37  	byteRepeats uint     // the number of repeats of lastByte seen.
    38  	repeats     uint     // the number of copies of lastByte to output.
    39  }
    40  
    41  // NewReader returns an io.Reader which decompresses bzip2 data from r.
    42  func NewReader(r io.Reader) io.Reader {
    43  	bz2 := new(reader)
    44  	bz2.br = newBitReader(r)
    45  	return bz2
    46  }
    47  
    48  const bzip2FileMagic = 0x425a // "BZ"
    49  const bzip2BlockMagic = 0x314159265359
    50  const bzip2FinalMagic = 0x177245385090
    51  
    52  // setup parses the bzip2 header.
    53  func (bz2 *reader) setup() error {
    54  	br := &bz2.br
    55  
    56  	magic := br.ReadBits(16)
    57  	if magic != bzip2FileMagic {
    58  		return StructuralError("bad magic value")
    59  	}
    60  
    61  	t := br.ReadBits(8)
    62  	if t != 'h' {
    63  		return StructuralError("non-Huffman entropy encoding")
    64  	}
    65  
    66  	level := br.ReadBits(8)
    67  	if level < '1' || level > '9' {
    68  		return StructuralError("invalid compression level")
    69  	}
    70  
    71  	bz2.blockSize = 100 * 1024 * (int(level) - '0')
    72  	bz2.tt = make([]uint32, bz2.blockSize)
    73  	return nil
    74  }
    75  
    76  func (bz2 *reader) Read(buf []byte) (n int, err error) {
    77  	if bz2.eof {
    78  		return 0, io.EOF
    79  	}
    80  
    81  	if !bz2.setupDone {
    82  		err = bz2.setup()
    83  		brErr := bz2.br.Err()
    84  		if brErr != nil {
    85  			err = brErr
    86  		}
    87  		if err != nil {
    88  			return 0, err
    89  		}
    90  		bz2.setupDone = true
    91  	}
    92  
    93  	n, err = bz2.read(buf)
    94  	brErr := bz2.br.Err()
    95  	if brErr != nil {
    96  		err = brErr
    97  	}
    98  	return
    99  }
   100  
   101  func (bz2 *reader) read(buf []byte) (n int, err error) {
   102  	// bzip2 is a block based compressor, except that it has a run-length
   103  	// preprocessing step. The block based nature means that we can
   104  	// preallocate fixed-size buffers and reuse them. However, the RLE
   105  	// preprocessing would require allocating huge buffers to store the
   106  	// maximum expansion. Thus we process blocks all at once, except for
   107  	// the RLE which we decompress as required.
   108  
   109  	for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
   110  		// We have RLE data pending.
   111  
   112  		// The run-length encoding works like this:
   113  		// Any sequence of four equal bytes is followed by a length
   114  		// byte which contains the number of repeats of that byte to
   115  		// include. (The number of repeats can be zero.) Because we are
   116  		// decompressing on-demand our state is kept in the reader
   117  		// object.
   118  
   119  		if bz2.repeats > 0 {
   120  			buf[n] = byte(bz2.lastByte)
   121  			n++
   122  			bz2.repeats--
   123  			if bz2.repeats == 0 {
   124  				bz2.lastByte = -1
   125  			}
   126  			continue
   127  		}
   128  
   129  		bz2.tPos = bz2.preRLE[bz2.tPos]
   130  		b := byte(bz2.tPos)
   131  		bz2.tPos >>= 8
   132  		bz2.preRLEUsed++
   133  
   134  		if bz2.byteRepeats == 3 {
   135  			bz2.repeats = uint(b)
   136  			bz2.byteRepeats = 0
   137  			continue
   138  		}
   139  
   140  		if bz2.lastByte == int(b) {
   141  			bz2.byteRepeats++
   142  		} else {
   143  			bz2.byteRepeats = 0
   144  		}
   145  		bz2.lastByte = int(b)
   146  
   147  		buf[n] = b
   148  		n++
   149  	}
   150  
   151  	if n > 0 {
   152  		return
   153  	}
   154  
   155  	// No RLE data is pending so we need to read a block.
   156  
   157  	br := &bz2.br
   158  	magic := br.ReadBits64(48)
   159  	if magic == bzip2FinalMagic {
   160  		br.ReadBits64(32) // ignored CRC
   161  		bz2.eof = true
   162  		return 0, io.EOF
   163  	} else if magic != bzip2BlockMagic {
   164  		return 0, StructuralError("bad magic value found")
   165  	}
   166  
   167  	err = bz2.readBlock()
   168  	if err != nil {
   169  		return 0, err
   170  	}
   171  
   172  	return bz2.read(buf)
   173  }
   174  
   175  // readBlock reads a bzip2 block. The magic number should already have been consumed.
   176  func (bz2 *reader) readBlock() (err error) {
   177  	br := &bz2.br
   178  	br.ReadBits64(32) // skip checksum. TODO: check it if we can figure out what it is.
   179  	randomized := br.ReadBits(1)
   180  	if randomized != 0 {
   181  		return StructuralError("deprecated randomized files")
   182  	}
   183  	origPtr := uint(br.ReadBits(24))
   184  
   185  	// If not every byte value is used in the block (i.e., it's text) then
   186  	// the symbol set is reduced. The symbols used are stored as a
   187  	// two-level, 16x16 bitmap.
   188  	symbolRangeUsedBitmap := br.ReadBits(16)
   189  	symbolPresent := make([]bool, 256)
   190  	numSymbols := 0
   191  	for symRange := uint(0); symRange < 16; symRange++ {
   192  		if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
   193  			bits := br.ReadBits(16)
   194  			for symbol := uint(0); symbol < 16; symbol++ {
   195  				if bits&(1<<(15-symbol)) != 0 {
   196  					symbolPresent[16*symRange+symbol] = true
   197  					numSymbols++
   198  				}
   199  			}
   200  		}
   201  	}
   202  
   203  	// A block uses between two and six different Huffman trees.
   204  	numHuffmanTrees := br.ReadBits(3)
   205  	if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
   206  		return StructuralError("invalid number of Huffman trees")
   207  	}
   208  
   209  	// The Huffman tree can switch every 50 symbols so there's a list of
   210  	// tree indexes telling us which tree to use for each 50 symbol block.
   211  	numSelectors := br.ReadBits(15)
   212  	treeIndexes := make([]uint8, numSelectors)
   213  
   214  	// The tree indexes are move-to-front transformed and stored as unary
   215  	// numbers.
   216  	mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
   217  	for i := range treeIndexes {
   218  		c := 0
   219  		for {
   220  			inc := br.ReadBits(1)
   221  			if inc == 0 {
   222  				break
   223  			}
   224  			c++
   225  		}
   226  		if c >= numHuffmanTrees {
   227  			return StructuralError("tree index too large")
   228  		}
   229  		treeIndexes[i] = uint8(mtfTreeDecoder.Decode(c))
   230  	}
   231  
   232  	// The list of symbols for the move-to-front transform is taken from
   233  	// the previously decoded symbol bitmap.
   234  	symbols := make([]byte, numSymbols)
   235  	nextSymbol := 0
   236  	for i := 0; i < 256; i++ {
   237  		if symbolPresent[i] {
   238  			symbols[nextSymbol] = byte(i)
   239  			nextSymbol++
   240  		}
   241  	}
   242  	mtf := newMTFDecoder(symbols)
   243  
   244  	numSymbols += 2 // to account for RUNA and RUNB symbols
   245  	huffmanTrees := make([]huffmanTree, numHuffmanTrees)
   246  
   247  	// Now we decode the arrays of code-lengths for each tree.
   248  	lengths := make([]uint8, numSymbols)
   249  	for i := 0; i < numHuffmanTrees; i++ {
   250  		// The code lengths are delta encoded from a 5-bit base value.
   251  		length := br.ReadBits(5)
   252  		for j := 0; j < numSymbols; j++ {
   253  			for {
   254  				if !br.ReadBit() {
   255  					break
   256  				}
   257  				if br.ReadBit() {
   258  					length--
   259  				} else {
   260  					length++
   261  				}
   262  			}
   263  			if length < 0 || length > 20 {
   264  				return StructuralError("Huffman length out of range")
   265  			}
   266  			lengths[j] = uint8(length)
   267  		}
   268  		huffmanTrees[i], err = newHuffmanTree(lengths)
   269  		if err != nil {
   270  			return err
   271  		}
   272  	}
   273  
   274  	selectorIndex := 1 // the next tree index to use
   275  	currentHuffmanTree := huffmanTrees[treeIndexes[0]]
   276  	bufIndex := 0 // indexes bz2.buf, the output buffer.
   277  	// The output of the move-to-front transform is run-length encoded and
   278  	// we merge the decoding into the Huffman parsing loop. These two
   279  	// variables accumulate the repeat count. See the Wikipedia page for
   280  	// details.
   281  	repeat := 0
   282  	repeat_power := 0
   283  
   284  	// The `C' array (used by the inverse BWT) needs to be zero initialized.
   285  	for i := range bz2.c {
   286  		bz2.c[i] = 0
   287  	}
   288  
   289  	decoded := 0 // counts the number of symbols decoded by the current tree.
   290  	for {
   291  		if decoded == 50 {
   292  			currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
   293  			selectorIndex++
   294  			decoded = 0
   295  		}
   296  
   297  		v := currentHuffmanTree.Decode(br)
   298  		decoded++
   299  
   300  		if v < 2 {
   301  			// This is either the RUNA or RUNB symbol.
   302  			if repeat == 0 {
   303  				repeat_power = 1
   304  			}
   305  			repeat += repeat_power << v
   306  			repeat_power <<= 1
   307  
   308  			// This limit of 2 million comes from the bzip2 source
   309  			// code. It prevents repeat from overflowing.
   310  			if repeat > 2*1024*1024 {
   311  				return StructuralError("repeat count too large")
   312  			}
   313  			continue
   314  		}
   315  
   316  		if repeat > 0 {
   317  			// We have decoded a complete run-length so we need to
   318  			// replicate the last output symbol.
   319  			for i := 0; i < repeat; i++ {
   320  				b := byte(mtf.First())
   321  				bz2.tt[bufIndex] = uint32(b)
   322  				bz2.c[b]++
   323  				bufIndex++
   324  			}
   325  			repeat = 0
   326  		}
   327  
   328  		if int(v) == numSymbols-1 {
   329  			// This is the EOF symbol. Because it's always at the
   330  			// end of the move-to-front list, and never gets moved
   331  			// to the front, it has this unique value.
   332  			break
   333  		}
   334  
   335  		// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
   336  		// one would expect |v-2| to be passed to the MTF decoder.
   337  		// However, the front of the MTF list is never referenced as 0,
   338  		// it's always referenced with a run-length of 1. Thus 0
   339  		// doesn't need to be encoded and we have |v-1| in the next
   340  		// line.
   341  		b := byte(mtf.Decode(int(v - 1)))
   342  		bz2.tt[bufIndex] = uint32(b)
   343  		bz2.c[b]++
   344  		bufIndex++
   345  	}
   346  
   347  	if origPtr >= uint(bufIndex) {
   348  		return StructuralError("origPtr out of bounds")
   349  	}
   350  
   351  	// We have completed the entropy decoding. Now we can perform the
   352  	// inverse BWT and setup the RLE buffer.
   353  	bz2.preRLE = bz2.tt[:bufIndex]
   354  	bz2.preRLEUsed = 0
   355  	bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
   356  	bz2.lastByte = -1
   357  	bz2.byteRepeats = 0
   358  	bz2.repeats = 0
   359  
   360  	return nil
   361  }
   362  
   363  // inverseBWT implements the inverse Burrows-Wheeler transform as described in
   364  // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
   365  // In that document, origPtr is called `I' and c is the `C' array after the
   366  // first pass over the data. It's an argument here because we merge the first
   367  // pass with the Huffman decoding.
   368  //
   369  // This also implements the `single array' method from the bzip2 source code
   370  // which leaves the output, still shuffled, in the bottom 8 bits of tt with the
   371  // index of the next byte in the top 24-bits. The index of the first byte is
   372  // returned.
   373  func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
   374  	sum := uint(0)
   375  	for i := 0; i < 256; i++ {
   376  		sum += c[i]
   377  		c[i] = sum - c[i]
   378  	}
   379  
   380  	for i := range tt {
   381  		b := tt[i] & 0xff
   382  		tt[c[b]] |= uint32(i) << 8
   383  		c[b]++
   384  	}
   385  
   386  	return tt[origPtr] >> 8
   387  }