github.com/flyinox/gosm@v0.0.0-20171117061539-16768cb62077/src/compress/bzip2/bzip2.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package bzip2 implements bzip2 decompression. 6 package bzip2 7 8 import "io" 9 10 // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot 11 // of guessing: http://en.wikipedia.org/wiki/Bzip2 12 // The source code to pyflate was useful for debugging: 13 // http://www.paul.sladen.org/projects/pyflate 14 15 // A StructuralError is returned when the bzip2 data is found to be 16 // syntactically invalid. 17 type StructuralError string 18 19 func (s StructuralError) Error() string { 20 return "bzip2 data invalid: " + string(s) 21 } 22 23 // A reader decompresses bzip2 compressed data. 24 type reader struct { 25 br bitReader 26 fileCRC uint32 27 blockCRC uint32 28 wantBlockCRC uint32 29 setupDone bool // true if we have parsed the bzip2 header. 30 blockSize int // blockSize in bytes, i.e. 900 * 1000. 31 eof bool 32 c [256]uint // the `C' array for the inverse BWT. 33 tt []uint32 // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits. 34 tPos uint32 // Index of the next output byte in tt. 35 36 preRLE []uint32 // contains the RLE data still to be processed. 37 preRLEUsed int // number of entries of preRLE used. 38 lastByte int // the last byte value seen. 39 byteRepeats uint // the number of repeats of lastByte seen. 40 repeats uint // the number of copies of lastByte to output. 41 } 42 43 // NewReader returns an io.Reader which decompresses bzip2 data from r. 44 // If r does not also implement io.ByteReader, 45 // the decompressor may read more data than necessary from r. 46 func NewReader(r io.Reader) io.Reader { 47 bz2 := new(reader) 48 bz2.br = newBitReader(r) 49 return bz2 50 } 51 52 const bzip2FileMagic = 0x425a // "BZ" 53 const bzip2BlockMagic = 0x314159265359 54 const bzip2FinalMagic = 0x177245385090 55 56 // setup parses the bzip2 header. 57 func (bz2 *reader) setup(needMagic bool) error { 58 br := &bz2.br 59 60 if needMagic { 61 magic := br.ReadBits(16) 62 if magic != bzip2FileMagic { 63 return StructuralError("bad magic value") 64 } 65 } 66 67 t := br.ReadBits(8) 68 if t != 'h' { 69 return StructuralError("non-Huffman entropy encoding") 70 } 71 72 level := br.ReadBits(8) 73 if level < '1' || level > '9' { 74 return StructuralError("invalid compression level") 75 } 76 77 bz2.fileCRC = 0 78 bz2.blockSize = 100 * 1000 * (level - '0') 79 if bz2.blockSize > len(bz2.tt) { 80 bz2.tt = make([]uint32, bz2.blockSize) 81 } 82 return nil 83 } 84 85 func (bz2 *reader) Read(buf []byte) (n int, err error) { 86 if bz2.eof { 87 return 0, io.EOF 88 } 89 90 if !bz2.setupDone { 91 err = bz2.setup(true) 92 brErr := bz2.br.Err() 93 if brErr != nil { 94 err = brErr 95 } 96 if err != nil { 97 return 0, err 98 } 99 bz2.setupDone = true 100 } 101 102 n, err = bz2.read(buf) 103 brErr := bz2.br.Err() 104 if brErr != nil { 105 err = brErr 106 } 107 return 108 } 109 110 func (bz2 *reader) readFromBlock(buf []byte) int { 111 // bzip2 is a block based compressor, except that it has a run-length 112 // preprocessing step. The block based nature means that we can 113 // preallocate fixed-size buffers and reuse them. However, the RLE 114 // preprocessing would require allocating huge buffers to store the 115 // maximum expansion. Thus we process blocks all at once, except for 116 // the RLE which we decompress as required. 117 n := 0 118 for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) { 119 // We have RLE data pending. 120 121 // The run-length encoding works like this: 122 // Any sequence of four equal bytes is followed by a length 123 // byte which contains the number of repeats of that byte to 124 // include. (The number of repeats can be zero.) Because we are 125 // decompressing on-demand our state is kept in the reader 126 // object. 127 128 if bz2.repeats > 0 { 129 buf[n] = byte(bz2.lastByte) 130 n++ 131 bz2.repeats-- 132 if bz2.repeats == 0 { 133 bz2.lastByte = -1 134 } 135 continue 136 } 137 138 bz2.tPos = bz2.preRLE[bz2.tPos] 139 b := byte(bz2.tPos) 140 bz2.tPos >>= 8 141 bz2.preRLEUsed++ 142 143 if bz2.byteRepeats == 3 { 144 bz2.repeats = uint(b) 145 bz2.byteRepeats = 0 146 continue 147 } 148 149 if bz2.lastByte == int(b) { 150 bz2.byteRepeats++ 151 } else { 152 bz2.byteRepeats = 0 153 } 154 bz2.lastByte = int(b) 155 156 buf[n] = b 157 n++ 158 } 159 160 return n 161 } 162 163 func (bz2 *reader) read(buf []byte) (int, error) { 164 for { 165 n := bz2.readFromBlock(buf) 166 if n > 0 { 167 bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n]) 168 return n, nil 169 } 170 171 // End of block. Check CRC. 172 if bz2.blockCRC != bz2.wantBlockCRC { 173 bz2.br.err = StructuralError("block checksum mismatch") 174 return 0, bz2.br.err 175 } 176 177 // Find next block. 178 br := &bz2.br 179 switch br.ReadBits64(48) { 180 default: 181 return 0, StructuralError("bad magic value found") 182 183 case bzip2BlockMagic: 184 // Start of block. 185 err := bz2.readBlock() 186 if err != nil { 187 return 0, err 188 } 189 190 case bzip2FinalMagic: 191 // Check end-of-file CRC. 192 wantFileCRC := uint32(br.ReadBits64(32)) 193 if br.err != nil { 194 return 0, br.err 195 } 196 if bz2.fileCRC != wantFileCRC { 197 br.err = StructuralError("file checksum mismatch") 198 return 0, br.err 199 } 200 201 // Skip ahead to byte boundary. 202 // Is there a file concatenated to this one? 203 // It would start with BZ. 204 if br.bits%8 != 0 { 205 br.ReadBits(br.bits % 8) 206 } 207 b, err := br.r.ReadByte() 208 if err == io.EOF { 209 br.err = io.EOF 210 bz2.eof = true 211 return 0, io.EOF 212 } 213 if err != nil { 214 br.err = err 215 return 0, err 216 } 217 z, err := br.r.ReadByte() 218 if err != nil { 219 if err == io.EOF { 220 err = io.ErrUnexpectedEOF 221 } 222 br.err = err 223 return 0, err 224 } 225 if b != 'B' || z != 'Z' { 226 return 0, StructuralError("bad magic value in continuation file") 227 } 228 if err := bz2.setup(false); err != nil { 229 return 0, err 230 } 231 } 232 } 233 } 234 235 // readBlock reads a bzip2 block. The magic number should already have been consumed. 236 func (bz2 *reader) readBlock() (err error) { 237 br := &bz2.br 238 bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is. 239 bz2.blockCRC = 0 240 bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC 241 randomized := br.ReadBits(1) 242 if randomized != 0 { 243 return StructuralError("deprecated randomized files") 244 } 245 origPtr := uint(br.ReadBits(24)) 246 247 // If not every byte value is used in the block (i.e., it's text) then 248 // the symbol set is reduced. The symbols used are stored as a 249 // two-level, 16x16 bitmap. 250 symbolRangeUsedBitmap := br.ReadBits(16) 251 symbolPresent := make([]bool, 256) 252 numSymbols := 0 253 for symRange := uint(0); symRange < 16; symRange++ { 254 if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 { 255 bits := br.ReadBits(16) 256 for symbol := uint(0); symbol < 16; symbol++ { 257 if bits&(1<<(15-symbol)) != 0 { 258 symbolPresent[16*symRange+symbol] = true 259 numSymbols++ 260 } 261 } 262 } 263 } 264 265 if numSymbols == 0 { 266 // There must be an EOF symbol. 267 return StructuralError("no symbols in input") 268 } 269 270 // A block uses between two and six different Huffman trees. 271 numHuffmanTrees := br.ReadBits(3) 272 if numHuffmanTrees < 2 || numHuffmanTrees > 6 { 273 return StructuralError("invalid number of Huffman trees") 274 } 275 276 // The Huffman tree can switch every 50 symbols so there's a list of 277 // tree indexes telling us which tree to use for each 50 symbol block. 278 numSelectors := br.ReadBits(15) 279 treeIndexes := make([]uint8, numSelectors) 280 281 // The tree indexes are move-to-front transformed and stored as unary 282 // numbers. 283 mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees) 284 for i := range treeIndexes { 285 c := 0 286 for { 287 inc := br.ReadBits(1) 288 if inc == 0 { 289 break 290 } 291 c++ 292 } 293 if c >= numHuffmanTrees { 294 return StructuralError("tree index too large") 295 } 296 treeIndexes[i] = mtfTreeDecoder.Decode(c) 297 } 298 299 // The list of symbols for the move-to-front transform is taken from 300 // the previously decoded symbol bitmap. 301 symbols := make([]byte, numSymbols) 302 nextSymbol := 0 303 for i := 0; i < 256; i++ { 304 if symbolPresent[i] { 305 symbols[nextSymbol] = byte(i) 306 nextSymbol++ 307 } 308 } 309 mtf := newMTFDecoder(symbols) 310 311 numSymbols += 2 // to account for RUNA and RUNB symbols 312 huffmanTrees := make([]huffmanTree, numHuffmanTrees) 313 314 // Now we decode the arrays of code-lengths for each tree. 315 lengths := make([]uint8, numSymbols) 316 for i := range huffmanTrees { 317 // The code lengths are delta encoded from a 5-bit base value. 318 length := br.ReadBits(5) 319 for j := range lengths { 320 for { 321 if length < 1 || length > 20 { 322 return StructuralError("Huffman length out of range") 323 } 324 if !br.ReadBit() { 325 break 326 } 327 if br.ReadBit() { 328 length-- 329 } else { 330 length++ 331 } 332 } 333 lengths[j] = uint8(length) 334 } 335 huffmanTrees[i], err = newHuffmanTree(lengths) 336 if err != nil { 337 return err 338 } 339 } 340 341 selectorIndex := 1 // the next tree index to use 342 if len(treeIndexes) == 0 { 343 return StructuralError("no tree selectors given") 344 } 345 if int(treeIndexes[0]) >= len(huffmanTrees) { 346 return StructuralError("tree selector out of range") 347 } 348 currentHuffmanTree := huffmanTrees[treeIndexes[0]] 349 bufIndex := 0 // indexes bz2.buf, the output buffer. 350 // The output of the move-to-front transform is run-length encoded and 351 // we merge the decoding into the Huffman parsing loop. These two 352 // variables accumulate the repeat count. See the Wikipedia page for 353 // details. 354 repeat := 0 355 repeatPower := 0 356 357 // The `C' array (used by the inverse BWT) needs to be zero initialized. 358 for i := range bz2.c { 359 bz2.c[i] = 0 360 } 361 362 decoded := 0 // counts the number of symbols decoded by the current tree. 363 for { 364 if decoded == 50 { 365 if selectorIndex >= numSelectors { 366 return StructuralError("insufficient selector indices for number of symbols") 367 } 368 if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) { 369 return StructuralError("tree selector out of range") 370 } 371 currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]] 372 selectorIndex++ 373 decoded = 0 374 } 375 376 v := currentHuffmanTree.Decode(br) 377 decoded++ 378 379 if v < 2 { 380 // This is either the RUNA or RUNB symbol. 381 if repeat == 0 { 382 repeatPower = 1 383 } 384 repeat += repeatPower << v 385 repeatPower <<= 1 386 387 // This limit of 2 million comes from the bzip2 source 388 // code. It prevents repeat from overflowing. 389 if repeat > 2*1024*1024 { 390 return StructuralError("repeat count too large") 391 } 392 continue 393 } 394 395 if repeat > 0 { 396 // We have decoded a complete run-length so we need to 397 // replicate the last output symbol. 398 if repeat > bz2.blockSize-bufIndex { 399 return StructuralError("repeats past end of block") 400 } 401 for i := 0; i < repeat; i++ { 402 b := mtf.First() 403 bz2.tt[bufIndex] = uint32(b) 404 bz2.c[b]++ 405 bufIndex++ 406 } 407 repeat = 0 408 } 409 410 if int(v) == numSymbols-1 { 411 // This is the EOF symbol. Because it's always at the 412 // end of the move-to-front list, and never gets moved 413 // to the front, it has this unique value. 414 break 415 } 416 417 // Since two metasymbols (RUNA and RUNB) have values 0 and 1, 418 // one would expect |v-2| to be passed to the MTF decoder. 419 // However, the front of the MTF list is never referenced as 0, 420 // it's always referenced with a run-length of 1. Thus 0 421 // doesn't need to be encoded and we have |v-1| in the next 422 // line. 423 b := mtf.Decode(int(v - 1)) 424 if bufIndex >= bz2.blockSize { 425 return StructuralError("data exceeds block size") 426 } 427 bz2.tt[bufIndex] = uint32(b) 428 bz2.c[b]++ 429 bufIndex++ 430 } 431 432 if origPtr >= uint(bufIndex) { 433 return StructuralError("origPtr out of bounds") 434 } 435 436 // We have completed the entropy decoding. Now we can perform the 437 // inverse BWT and setup the RLE buffer. 438 bz2.preRLE = bz2.tt[:bufIndex] 439 bz2.preRLEUsed = 0 440 bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:]) 441 bz2.lastByte = -1 442 bz2.byteRepeats = 0 443 bz2.repeats = 0 444 445 return nil 446 } 447 448 // inverseBWT implements the inverse Burrows-Wheeler transform as described in 449 // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2. 450 // In that document, origPtr is called `I' and c is the `C' array after the 451 // first pass over the data. It's an argument here because we merge the first 452 // pass with the Huffman decoding. 453 // 454 // This also implements the `single array' method from the bzip2 source code 455 // which leaves the output, still shuffled, in the bottom 8 bits of tt with the 456 // index of the next byte in the top 24-bits. The index of the first byte is 457 // returned. 458 func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 { 459 sum := uint(0) 460 for i := 0; i < 256; i++ { 461 sum += c[i] 462 c[i] = sum - c[i] 463 } 464 465 for i := range tt { 466 b := tt[i] & 0xff 467 tt[c[b]] |= uint32(i) << 8 468 c[b]++ 469 } 470 471 return tt[origPtr] >> 8 472 } 473 474 // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed, 475 // causing the bits in the input to be processed in the reverse of the usual order. 476 477 var crctab [256]uint32 478 479 func init() { 480 const poly = 0x04C11DB7 481 for i := range crctab { 482 crc := uint32(i) << 24 483 for j := 0; j < 8; j++ { 484 if crc&0x80000000 != 0 { 485 crc = (crc << 1) ^ poly 486 } else { 487 crc <<= 1 488 } 489 } 490 crctab[i] = crc 491 } 492 } 493 494 // updateCRC updates the crc value to incorporate the data in b. 495 // The initial value is 0. 496 func updateCRC(val uint32, b []byte) uint32 { 497 crc := ^val 498 for _, v := range b { 499 crc = crctab[byte(crc>>24)^v] ^ (crc << 8) 500 } 501 return ^crc 502 }