github.com/xushiwei/go@v0.0.0-20130601165731-2b9d83f45bc9/src/pkg/compress/bzip2/bzip2.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package bzip2 implements bzip2 decompression. 6 package bzip2 7 8 import "io" 9 10 // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot 11 // of guessing: http://en.wikipedia.org/wiki/Bzip2 12 // The source code to pyflate was useful for debugging: 13 // http://www.paul.sladen.org/projects/pyflate 14 15 // A StructuralError is returned when the bzip2 data is found to be 16 // syntactically invalid. 17 type StructuralError string 18 19 func (s StructuralError) Error() string { 20 return "bzip2 data invalid: " + string(s) 21 } 22 23 // A reader decompresses bzip2 compressed data. 24 type reader struct { 25 br bitReader 26 setupDone bool // true if we have parsed the bzip2 header. 27 blockSize int // blockSize in bytes, i.e. 900 * 1024. 28 eof bool 29 buf []byte // stores Burrows-Wheeler transformed data. 30 c [256]uint // the `C' array for the inverse BWT. 31 tt []uint32 // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits. 32 tPos uint32 // Index of the next output byte in tt. 33 34 preRLE []uint32 // contains the RLE data still to be processed. 35 preRLEUsed int // number of entries of preRLE used. 36 lastByte int // the last byte value seen. 37 byteRepeats uint // the number of repeats of lastByte seen. 38 repeats uint // the number of copies of lastByte to output. 39 } 40 41 // NewReader returns an io.Reader which decompresses bzip2 data from r. 42 func NewReader(r io.Reader) io.Reader { 43 bz2 := new(reader) 44 bz2.br = newBitReader(r) 45 return bz2 46 } 47 48 const bzip2FileMagic = 0x425a // "BZ" 49 const bzip2BlockMagic = 0x314159265359 50 const bzip2FinalMagic = 0x177245385090 51 52 // setup parses the bzip2 header. 
53 func (bz2 *reader) setup() error { 54 br := &bz2.br 55 56 magic := br.ReadBits(16) 57 if magic != bzip2FileMagic { 58 return StructuralError("bad magic value") 59 } 60 61 t := br.ReadBits(8) 62 if t != 'h' { 63 return StructuralError("non-Huffman entropy encoding") 64 } 65 66 level := br.ReadBits(8) 67 if level < '1' || level > '9' { 68 return StructuralError("invalid compression level") 69 } 70 71 bz2.blockSize = 100 * 1024 * (int(level) - '0') 72 bz2.tt = make([]uint32, bz2.blockSize) 73 return nil 74 } 75 76 func (bz2 *reader) Read(buf []byte) (n int, err error) { 77 if bz2.eof { 78 return 0, io.EOF 79 } 80 81 if !bz2.setupDone { 82 err = bz2.setup() 83 brErr := bz2.br.Err() 84 if brErr != nil { 85 err = brErr 86 } 87 if err != nil { 88 return 0, err 89 } 90 bz2.setupDone = true 91 } 92 93 n, err = bz2.read(buf) 94 brErr := bz2.br.Err() 95 if brErr != nil { 96 err = brErr 97 } 98 return 99 } 100 101 func (bz2 *reader) read(buf []byte) (n int, err error) { 102 // bzip2 is a block based compressor, except that it has a run-length 103 // preprocessing step. The block based nature means that we can 104 // preallocate fixed-size buffers and reuse them. However, the RLE 105 // preprocessing would require allocating huge buffers to store the 106 // maximum expansion. Thus we process blocks all at once, except for 107 // the RLE which we decompress as required. 108 109 for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) { 110 // We have RLE data pending. 111 112 // The run-length encoding works like this: 113 // Any sequence of four equal bytes is followed by a length 114 // byte which contains the number of repeats of that byte to 115 // include. (The number of repeats can be zero.) Because we are 116 // decompressing on-demand our state is kept in the reader 117 // object. 
118 119 if bz2.repeats > 0 { 120 buf[n] = byte(bz2.lastByte) 121 n++ 122 bz2.repeats-- 123 if bz2.repeats == 0 { 124 bz2.lastByte = -1 125 } 126 continue 127 } 128 129 bz2.tPos = bz2.preRLE[bz2.tPos] 130 b := byte(bz2.tPos) 131 bz2.tPos >>= 8 132 bz2.preRLEUsed++ 133 134 if bz2.byteRepeats == 3 { 135 bz2.repeats = uint(b) 136 bz2.byteRepeats = 0 137 continue 138 } 139 140 if bz2.lastByte == int(b) { 141 bz2.byteRepeats++ 142 } else { 143 bz2.byteRepeats = 0 144 } 145 bz2.lastByte = int(b) 146 147 buf[n] = b 148 n++ 149 } 150 151 if n > 0 { 152 return 153 } 154 155 // No RLE data is pending so we need to read a block. 156 157 br := &bz2.br 158 magic := br.ReadBits64(48) 159 if magic == bzip2FinalMagic { 160 br.ReadBits64(32) // ignored CRC 161 bz2.eof = true 162 return 0, io.EOF 163 } else if magic != bzip2BlockMagic { 164 return 0, StructuralError("bad magic value found") 165 } 166 167 err = bz2.readBlock() 168 if err != nil { 169 return 0, err 170 } 171 172 return bz2.read(buf) 173 } 174 175 // readBlock reads a bzip2 block. The magic number should already have been consumed. 176 func (bz2 *reader) readBlock() (err error) { 177 br := &bz2.br 178 br.ReadBits64(32) // skip checksum. TODO: check it if we can figure out what it is. 179 randomized := br.ReadBits(1) 180 if randomized != 0 { 181 return StructuralError("deprecated randomized files") 182 } 183 origPtr := uint(br.ReadBits(24)) 184 185 // If not every byte value is used in the block (i.e., it's text) then 186 // the symbol set is reduced. The symbols used are stored as a 187 // two-level, 16x16 bitmap. 
188 symbolRangeUsedBitmap := br.ReadBits(16) 189 symbolPresent := make([]bool, 256) 190 numSymbols := 0 191 for symRange := uint(0); symRange < 16; symRange++ { 192 if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 { 193 bits := br.ReadBits(16) 194 for symbol := uint(0); symbol < 16; symbol++ { 195 if bits&(1<<(15-symbol)) != 0 { 196 symbolPresent[16*symRange+symbol] = true 197 numSymbols++ 198 } 199 } 200 } 201 } 202 203 // A block uses between two and six different Huffman trees. 204 numHuffmanTrees := br.ReadBits(3) 205 if numHuffmanTrees < 2 || numHuffmanTrees > 6 { 206 return StructuralError("invalid number of Huffman trees") 207 } 208 209 // The Huffman tree can switch every 50 symbols so there's a list of 210 // tree indexes telling us which tree to use for each 50 symbol block. 211 numSelectors := br.ReadBits(15) 212 treeIndexes := make([]uint8, numSelectors) 213 214 // The tree indexes are move-to-front transformed and stored as unary 215 // numbers. 216 mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees) 217 for i := range treeIndexes { 218 c := 0 219 for { 220 inc := br.ReadBits(1) 221 if inc == 0 { 222 break 223 } 224 c++ 225 } 226 if c >= numHuffmanTrees { 227 return StructuralError("tree index too large") 228 } 229 treeIndexes[i] = uint8(mtfTreeDecoder.Decode(c)) 230 } 231 232 // The list of symbols for the move-to-front transform is taken from 233 // the previously decoded symbol bitmap. 234 symbols := make([]byte, numSymbols) 235 nextSymbol := 0 236 for i := 0; i < 256; i++ { 237 if symbolPresent[i] { 238 symbols[nextSymbol] = byte(i) 239 nextSymbol++ 240 } 241 } 242 mtf := newMTFDecoder(symbols) 243 244 numSymbols += 2 // to account for RUNA and RUNB symbols 245 huffmanTrees := make([]huffmanTree, numHuffmanTrees) 246 247 // Now we decode the arrays of code-lengths for each tree. 248 lengths := make([]uint8, numSymbols) 249 for i := 0; i < numHuffmanTrees; i++ { 250 // The code lengths are delta encoded from a 5-bit base value. 
251 length := br.ReadBits(5) 252 for j := 0; j < numSymbols; j++ { 253 for { 254 if !br.ReadBit() { 255 break 256 } 257 if br.ReadBit() { 258 length-- 259 } else { 260 length++ 261 } 262 } 263 if length < 0 || length > 20 { 264 return StructuralError("Huffman length out of range") 265 } 266 lengths[j] = uint8(length) 267 } 268 huffmanTrees[i], err = newHuffmanTree(lengths) 269 if err != nil { 270 return err 271 } 272 } 273 274 selectorIndex := 1 // the next tree index to use 275 currentHuffmanTree := huffmanTrees[treeIndexes[0]] 276 bufIndex := 0 // indexes bz2.buf, the output buffer. 277 // The output of the move-to-front transform is run-length encoded and 278 // we merge the decoding into the Huffman parsing loop. These two 279 // variables accumulate the repeat count. See the Wikipedia page for 280 // details. 281 repeat := 0 282 repeat_power := 0 283 284 // The `C' array (used by the inverse BWT) needs to be zero initialized. 285 for i := range bz2.c { 286 bz2.c[i] = 0 287 } 288 289 decoded := 0 // counts the number of symbols decoded by the current tree. 290 for { 291 if decoded == 50 { 292 currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]] 293 selectorIndex++ 294 decoded = 0 295 } 296 297 v := currentHuffmanTree.Decode(br) 298 decoded++ 299 300 if v < 2 { 301 // This is either the RUNA or RUNB symbol. 302 if repeat == 0 { 303 repeat_power = 1 304 } 305 repeat += repeat_power << v 306 repeat_power <<= 1 307 308 // This limit of 2 million comes from the bzip2 source 309 // code. It prevents repeat from overflowing. 310 if repeat > 2*1024*1024 { 311 return StructuralError("repeat count too large") 312 } 313 continue 314 } 315 316 if repeat > 0 { 317 // We have decoded a complete run-length so we need to 318 // replicate the last output symbol. 
319 for i := 0; i < repeat; i++ { 320 b := byte(mtf.First()) 321 bz2.tt[bufIndex] = uint32(b) 322 bz2.c[b]++ 323 bufIndex++ 324 } 325 repeat = 0 326 } 327 328 if int(v) == numSymbols-1 { 329 // This is the EOF symbol. Because it's always at the 330 // end of the move-to-front list, and never gets moved 331 // to the front, it has this unique value. 332 break 333 } 334 335 // Since two metasymbols (RUNA and RUNB) have values 0 and 1, 336 // one would expect |v-2| to be passed to the MTF decoder. 337 // However, the front of the MTF list is never referenced as 0, 338 // it's always referenced with a run-length of 1. Thus 0 339 // doesn't need to be encoded and we have |v-1| in the next 340 // line. 341 b := byte(mtf.Decode(int(v - 1))) 342 bz2.tt[bufIndex] = uint32(b) 343 bz2.c[b]++ 344 bufIndex++ 345 } 346 347 if origPtr >= uint(bufIndex) { 348 return StructuralError("origPtr out of bounds") 349 } 350 351 // We have completed the entropy decoding. Now we can perform the 352 // inverse BWT and setup the RLE buffer. 353 bz2.preRLE = bz2.tt[:bufIndex] 354 bz2.preRLEUsed = 0 355 bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:]) 356 bz2.lastByte = -1 357 bz2.byteRepeats = 0 358 bz2.repeats = 0 359 360 return nil 361 } 362 363 // inverseBWT implements the inverse Burrows-Wheeler transform as described in 364 // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2. 365 // In that document, origPtr is called `I' and c is the `C' array after the 366 // first pass over the data. It's an argument here because we merge the first 367 // pass with the Huffman decoding. 368 // 369 // This also implements the `single array' method from the bzip2 source code 370 // which leaves the output, still shuffled, in the bottom 8 bits of tt with the 371 // index of the next byte in the top 24-bits. The index of the first byte is 372 // returned. 
func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
	// On entry c[b] holds the number of occurrences of byte b in tt.
	// Replace each count with the running total of all smaller byte
	// values, i.e. the position at which b's run begins once the bytes
	// are sorted. c must have at least 256 entries.
	var offset uint
	for i := 0; i < 256; i++ {
		count := c[i]
		c[i] = offset
		offset += count
	}

	// For each input position i, take the next free slot in the sorted
	// run of its byte and record i in the upper 24 bits of that slot.
	// Only bits above the low 8 are modified, so the byte values that
	// later iterations read from tt are unaffected. On return c[b] has
	// been advanced past the end of b's run.
	for i := range tt {
		b := tt[i] & 0xff
		tt[c[b]] |= uint32(i) << 8
		c[b]++
	}

	// The link stored at origPtr is the index of the first output byte.
	// origPtr must be less than len(tt).
	return tt[origPtr] >> 8
}