github.com/spotify/syslog-redirector-golang@v0.0.0-20140320174030-4859f03d829a/src/pkg/compress/bzip2/bzip2.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package bzip2 implements bzip2 decompression. 6 package bzip2 7 8 import "io" 9 10 // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot 11 // of guessing: http://en.wikipedia.org/wiki/Bzip2 12 // The source code to pyflate was useful for debugging: 13 // http://www.paul.sladen.org/projects/pyflate 14 15 // A StructuralError is returned when the bzip2 data is found to be 16 // syntactically invalid. 17 type StructuralError string 18 19 func (s StructuralError) Error() string { 20 return "bzip2 data invalid: " + string(s) 21 } 22 23 // A reader decompresses bzip2 compressed data. 24 type reader struct { 25 br bitReader 26 fileCRC uint32 27 blockCRC uint32 28 wantBlockCRC uint32 29 setupDone bool // true if we have parsed the bzip2 header. 30 blockSize int // blockSize in bytes, i.e. 900 * 1024. 31 eof bool 32 buf []byte // stores Burrows-Wheeler transformed data. 33 c [256]uint // the `C' array for the inverse BWT. 34 tt []uint32 // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits. 35 tPos uint32 // Index of the next output byte in tt. 36 37 preRLE []uint32 // contains the RLE data still to be processed. 38 preRLEUsed int // number of entries of preRLE used. 39 lastByte int // the last byte value seen. 40 byteRepeats uint // the number of repeats of lastByte seen. 41 repeats uint // the number of copies of lastByte to output. 42 } 43 44 // NewReader returns an io.Reader which decompresses bzip2 data from r. 
45 func NewReader(r io.Reader) io.Reader { 46 bz2 := new(reader) 47 bz2.br = newBitReader(r) 48 return bz2 49 } 50 51 const bzip2FileMagic = 0x425a // "BZ" 52 const bzip2BlockMagic = 0x314159265359 53 const bzip2FinalMagic = 0x177245385090 54 55 // setup parses the bzip2 header. 56 func (bz2 *reader) setup(needMagic bool) error { 57 br := &bz2.br 58 59 if needMagic { 60 magic := br.ReadBits(16) 61 if magic != bzip2FileMagic { 62 return StructuralError("bad magic value") 63 } 64 } 65 66 t := br.ReadBits(8) 67 if t != 'h' { 68 return StructuralError("non-Huffman entropy encoding") 69 } 70 71 level := br.ReadBits(8) 72 if level < '1' || level > '9' { 73 return StructuralError("invalid compression level") 74 } 75 76 bz2.fileCRC = 0 77 bz2.blockSize = 100 * 1024 * (int(level) - '0') 78 if bz2.blockSize > len(bz2.tt) { 79 bz2.tt = make([]uint32, bz2.blockSize) 80 } 81 return nil 82 } 83 84 func (bz2 *reader) Read(buf []byte) (n int, err error) { 85 if bz2.eof { 86 return 0, io.EOF 87 } 88 89 if !bz2.setupDone { 90 err = bz2.setup(true) 91 brErr := bz2.br.Err() 92 if brErr != nil { 93 err = brErr 94 } 95 if err != nil { 96 return 0, err 97 } 98 bz2.setupDone = true 99 } 100 101 n, err = bz2.read(buf) 102 brErr := bz2.br.Err() 103 if brErr != nil { 104 err = brErr 105 } 106 return 107 } 108 109 func (bz2 *reader) readFromBlock(buf []byte) int { 110 // bzip2 is a block based compressor, except that it has a run-length 111 // preprocessing step. The block based nature means that we can 112 // preallocate fixed-size buffers and reuse them. However, the RLE 113 // preprocessing would require allocating huge buffers to store the 114 // maximum expansion. Thus we process blocks all at once, except for 115 // the RLE which we decompress as required. 116 n := 0 117 for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) { 118 // We have RLE data pending. 
119 120 // The run-length encoding works like this: 121 // Any sequence of four equal bytes is followed by a length 122 // byte which contains the number of repeats of that byte to 123 // include. (The number of repeats can be zero.) Because we are 124 // decompressing on-demand our state is kept in the reader 125 // object. 126 127 if bz2.repeats > 0 { 128 buf[n] = byte(bz2.lastByte) 129 n++ 130 bz2.repeats-- 131 if bz2.repeats == 0 { 132 bz2.lastByte = -1 133 } 134 continue 135 } 136 137 bz2.tPos = bz2.preRLE[bz2.tPos] 138 b := byte(bz2.tPos) 139 bz2.tPos >>= 8 140 bz2.preRLEUsed++ 141 142 if bz2.byteRepeats == 3 { 143 bz2.repeats = uint(b) 144 bz2.byteRepeats = 0 145 continue 146 } 147 148 if bz2.lastByte == int(b) { 149 bz2.byteRepeats++ 150 } else { 151 bz2.byteRepeats = 0 152 } 153 bz2.lastByte = int(b) 154 155 buf[n] = b 156 n++ 157 } 158 159 return n 160 } 161 162 func (bz2 *reader) read(buf []byte) (int, error) { 163 for { 164 n := bz2.readFromBlock(buf) 165 if n > 0 { 166 bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n]) 167 return n, nil 168 } 169 170 // End of block. Check CRC. 171 if bz2.blockCRC != bz2.wantBlockCRC { 172 bz2.br.err = StructuralError("block checksum mismatch") 173 return 0, bz2.br.err 174 } 175 176 // Find next block. 177 br := &bz2.br 178 switch br.ReadBits64(48) { 179 default: 180 return 0, StructuralError("bad magic value found") 181 182 case bzip2BlockMagic: 183 // Start of block. 184 err := bz2.readBlock() 185 if err != nil { 186 return 0, err 187 } 188 189 case bzip2FinalMagic: 190 // Check end-of-file CRC. 191 wantFileCRC := uint32(br.ReadBits64(32)) 192 if br.err != nil { 193 return 0, br.err 194 } 195 if bz2.fileCRC != wantFileCRC { 196 br.err = StructuralError("file checksum mismatch") 197 return 0, br.err 198 } 199 200 // Skip ahead to byte boundary. 201 // Is there a file concatenated to this one? 202 // It would start with BZ. 
203 if br.bits%8 != 0 { 204 br.ReadBits(br.bits % 8) 205 } 206 b, err := br.r.ReadByte() 207 if err == io.EOF { 208 br.err = io.EOF 209 bz2.eof = true 210 return 0, io.EOF 211 } 212 if err != nil { 213 br.err = err 214 return 0, err 215 } 216 z, err := br.r.ReadByte() 217 if err != nil { 218 if err == io.EOF { 219 err = io.ErrUnexpectedEOF 220 } 221 br.err = err 222 return 0, err 223 } 224 if b != 'B' || z != 'Z' { 225 return 0, StructuralError("bad magic value in continuation file") 226 } 227 if err := bz2.setup(false); err != nil { 228 return 0, err 229 } 230 } 231 } 232 } 233 234 // readBlock reads a bzip2 block. The magic number should already have been consumed. 235 func (bz2 *reader) readBlock() (err error) { 236 br := &bz2.br 237 bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is. 238 bz2.blockCRC = 0 239 bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC 240 randomized := br.ReadBits(1) 241 if randomized != 0 { 242 return StructuralError("deprecated randomized files") 243 } 244 origPtr := uint(br.ReadBits(24)) 245 246 // If not every byte value is used in the block (i.e., it's text) then 247 // the symbol set is reduced. The symbols used are stored as a 248 // two-level, 16x16 bitmap. 249 symbolRangeUsedBitmap := br.ReadBits(16) 250 symbolPresent := make([]bool, 256) 251 numSymbols := 0 252 for symRange := uint(0); symRange < 16; symRange++ { 253 if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 { 254 bits := br.ReadBits(16) 255 for symbol := uint(0); symbol < 16; symbol++ { 256 if bits&(1<<(15-symbol)) != 0 { 257 symbolPresent[16*symRange+symbol] = true 258 numSymbols++ 259 } 260 } 261 } 262 } 263 264 // A block uses between two and six different Huffman trees. 
265 numHuffmanTrees := br.ReadBits(3) 266 if numHuffmanTrees < 2 || numHuffmanTrees > 6 { 267 return StructuralError("invalid number of Huffman trees") 268 } 269 270 // The Huffman tree can switch every 50 symbols so there's a list of 271 // tree indexes telling us which tree to use for each 50 symbol block. 272 numSelectors := br.ReadBits(15) 273 treeIndexes := make([]uint8, numSelectors) 274 275 // The tree indexes are move-to-front transformed and stored as unary 276 // numbers. 277 mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees) 278 for i := range treeIndexes { 279 c := 0 280 for { 281 inc := br.ReadBits(1) 282 if inc == 0 { 283 break 284 } 285 c++ 286 } 287 if c >= numHuffmanTrees { 288 return StructuralError("tree index too large") 289 } 290 treeIndexes[i] = uint8(mtfTreeDecoder.Decode(c)) 291 } 292 293 // The list of symbols for the move-to-front transform is taken from 294 // the previously decoded symbol bitmap. 295 symbols := make([]byte, numSymbols) 296 nextSymbol := 0 297 for i := 0; i < 256; i++ { 298 if symbolPresent[i] { 299 symbols[nextSymbol] = byte(i) 300 nextSymbol++ 301 } 302 } 303 mtf := newMTFDecoder(symbols) 304 305 numSymbols += 2 // to account for RUNA and RUNB symbols 306 huffmanTrees := make([]huffmanTree, numHuffmanTrees) 307 308 // Now we decode the arrays of code-lengths for each tree. 309 lengths := make([]uint8, numSymbols) 310 for i := 0; i < numHuffmanTrees; i++ { 311 // The code lengths are delta encoded from a 5-bit base value. 
312 length := br.ReadBits(5) 313 for j := 0; j < numSymbols; j++ { 314 for { 315 if !br.ReadBit() { 316 break 317 } 318 if br.ReadBit() { 319 length-- 320 } else { 321 length++ 322 } 323 } 324 if length < 0 || length > 20 { 325 return StructuralError("Huffman length out of range") 326 } 327 lengths[j] = uint8(length) 328 } 329 huffmanTrees[i], err = newHuffmanTree(lengths) 330 if err != nil { 331 return err 332 } 333 } 334 335 selectorIndex := 1 // the next tree index to use 336 currentHuffmanTree := huffmanTrees[treeIndexes[0]] 337 bufIndex := 0 // indexes bz2.buf, the output buffer. 338 // The output of the move-to-front transform is run-length encoded and 339 // we merge the decoding into the Huffman parsing loop. These two 340 // variables accumulate the repeat count. See the Wikipedia page for 341 // details. 342 repeat := 0 343 repeat_power := 0 344 345 // The `C' array (used by the inverse BWT) needs to be zero initialized. 346 for i := range bz2.c { 347 bz2.c[i] = 0 348 } 349 350 decoded := 0 // counts the number of symbols decoded by the current tree. 351 for { 352 if decoded == 50 { 353 currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]] 354 selectorIndex++ 355 decoded = 0 356 } 357 358 v := currentHuffmanTree.Decode(br) 359 decoded++ 360 361 if v < 2 { 362 // This is either the RUNA or RUNB symbol. 363 if repeat == 0 { 364 repeat_power = 1 365 } 366 repeat += repeat_power << v 367 repeat_power <<= 1 368 369 // This limit of 2 million comes from the bzip2 source 370 // code. It prevents repeat from overflowing. 371 if repeat > 2*1024*1024 { 372 return StructuralError("repeat count too large") 373 } 374 continue 375 } 376 377 if repeat > 0 { 378 // We have decoded a complete run-length so we need to 379 // replicate the last output symbol. 
380 if repeat > bz2.blockSize-bufIndex { 381 return StructuralError("repeats past end of block") 382 } 383 for i := 0; i < repeat; i++ { 384 b := byte(mtf.First()) 385 bz2.tt[bufIndex] = uint32(b) 386 bz2.c[b]++ 387 bufIndex++ 388 } 389 repeat = 0 390 } 391 392 if int(v) == numSymbols-1 { 393 // This is the EOF symbol. Because it's always at the 394 // end of the move-to-front list, and never gets moved 395 // to the front, it has this unique value. 396 break 397 } 398 399 // Since two metasymbols (RUNA and RUNB) have values 0 and 1, 400 // one would expect |v-2| to be passed to the MTF decoder. 401 // However, the front of the MTF list is never referenced as 0, 402 // it's always referenced with a run-length of 1. Thus 0 403 // doesn't need to be encoded and we have |v-1| in the next 404 // line. 405 b := byte(mtf.Decode(int(v - 1))) 406 if bufIndex >= bz2.blockSize { 407 return StructuralError("data exceeds block size") 408 } 409 bz2.tt[bufIndex] = uint32(b) 410 bz2.c[b]++ 411 bufIndex++ 412 } 413 414 if origPtr >= uint(bufIndex) { 415 return StructuralError("origPtr out of bounds") 416 } 417 418 // We have completed the entropy decoding. Now we can perform the 419 // inverse BWT and setup the RLE buffer. 420 bz2.preRLE = bz2.tt[:bufIndex] 421 bz2.preRLEUsed = 0 422 bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:]) 423 bz2.lastByte = -1 424 bz2.byteRepeats = 0 425 bz2.repeats = 0 426 427 return nil 428 } 429 430 // inverseBWT implements the inverse Burrows-Wheeler transform as described in 431 // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2. 432 // In that document, origPtr is called `I' and c is the `C' array after the 433 // first pass over the data. It's an argument here because we merge the first 434 // pass with the Huffman decoding. 
//
// This also implements the `single array' method from the bzip2 source code:
// the output is left, still shuffled, in the bottom 8 bits of each tt entry,
// with the index of the next byte in the top 24 bits. The index of the first
// byte is returned.
func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
	// Replace each byte count in c with the number of bytes that sort
	// before it, i.e. the position of that byte's first occurrence in the
	// sorted data.
	var running uint
	for sym := 0; sym < 256; sym++ {
		count := c[sym]
		c[sym] = running
		running += count
	}

	// Thread the entries together: the slot for the next occurrence of a
	// byte value receives, in its upper 24 bits, the index of the entry
	// currently being visited.
	for pos, entry := range tt {
		sym := entry & 0xff
		slot := c[sym]
		tt[slot] |= uint32(pos) << 8
		c[sym] = slot + 1
	}

	first := tt[origPtr]
	return first >> 8
}

// The checksum below is a standard CRC32 like the one in hash/crc32, except
// that all the shifts are reversed, so the bits of the input are processed in
// the reverse of the usual order.

// crctab is the byte-at-a-time lookup table for that CRC. It is filled in by
// init.
var crctab [256]uint32

// init fills crctab: entry i is the result of shifting the byte i, placed in
// the top 8 bits of the register, through eight steps of the polynomial.
func init() {
	const (
		poly   = 0x04C11DB7
		topBit = 0x80000000
	)
	for idx := range crctab {
		reg := uint32(idx) << 24
		for step := 8; step > 0; step-- {
			carry := reg&topBit != 0
			reg <<= 1
			if carry {
				reg ^= poly
			}
		}
		crctab[idx] = reg
	}
}

// updateCRC updates the crc value to incorporate the data in b.
// The initial value is 0.
func updateCRC(val uint32, b []byte) uint32 {
	// The register is kept inverted while the data is folded in.
	acc := ^val
	for i := 0; i < len(b); i++ {
		idx := byte(acc>>24) ^ b[i]
		acc = (acc << 8) ^ crctab[idx]
	}
	return ^acc
}