github.com/peggyl/go@v0.0.0-20151008231540-ae315999c2d5/src/compress/flate/huffman_bit_writer.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package flate 6 7 import ( 8 "io" 9 "math" 10 ) 11 12 const ( 13 // The largest offset code. 14 offsetCodeCount = 30 15 16 // The special code used to mark the end of a block. 17 endBlockMarker = 256 18 19 // The first length code. 20 lengthCodesStart = 257 21 22 // The number of codegen codes. 23 codegenCodeCount = 19 24 badCode = 255 25 ) 26 27 // The number of extra bits needed by length code X - LENGTH_CODES_START. 28 var lengthExtraBits = []int8{ 29 /* 257 */ 0, 0, 0, 30 /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 31 /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 32 /* 280 */ 4, 5, 5, 5, 5, 0, 33 } 34 35 // The length indicated by length code X - LENGTH_CODES_START. 36 var lengthBase = []uint32{ 37 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 38 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 39 64, 80, 96, 112, 128, 160, 192, 224, 255, 40 } 41 42 // offset code word extra bits. 43 var offsetExtraBits = []int8{ 44 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 45 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 46 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 47 /* extended window */ 48 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 49 } 50 51 var offsetBase = []uint32{ 52 /* normal deflate */ 53 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, 54 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, 55 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, 56 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, 57 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, 58 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, 59 60 /* extended window */ 61 0x008000, 0x00c000, 0x010000, 0x018000, 0x020000, 62 0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000, 63 0x100000, 0x180000, 0x200000, 0x300000, 64 } 65 66 // The odd order in which the codegen code sizes are written. 67 var codegenOrder = []uint32{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15} 68 69 type huffmanBitWriter struct { 70 w io.Writer 71 // Data waiting to be written is bytes[0:nbytes] 72 // and then the low nbits of bits. 73 bits uint32 74 nbits uint32 75 bytes [64]byte 76 nbytes int 77 literalFreq []int32 78 offsetFreq []int32 79 codegen []uint8 80 codegenFreq []int32 81 literalEncoding *huffmanEncoder 82 offsetEncoding *huffmanEncoder 83 codegenEncoding *huffmanEncoder 84 err error 85 } 86 87 func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { 88 return &huffmanBitWriter{ 89 w: w, 90 literalFreq: make([]int32, maxNumLit), 91 offsetFreq: make([]int32, offsetCodeCount), 92 codegen: make([]uint8, maxNumLit+offsetCodeCount+1), 93 codegenFreq: make([]int32, codegenCodeCount), 94 literalEncoding: newHuffmanEncoder(maxNumLit), 95 offsetEncoding: newHuffmanEncoder(offsetCodeCount), 96 codegenEncoding: newHuffmanEncoder(codegenCodeCount), 97 } 98 } 99 100 func (w *huffmanBitWriter) reset(writer io.Writer) { 101 w.w = writer 102 w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil 103 w.bytes = [64]byte{} 104 for i := range w.codegen { 105 w.codegen[i] = 0 106 } 107 for _, s := range [...][]int32{w.literalFreq, w.offsetFreq, w.codegenFreq} { 108 for i := range s { 109 s[i] = 0 110 } 111 } 112 for _, enc := range [...]*huffmanEncoder{ 113 w.literalEncoding, 114 w.offsetEncoding, 115 w.codegenEncoding} { 116 for i := range enc.code { 117 enc.code[i] = 0 118 } 119 for i := range enc.codeBits { 120 enc.codeBits[i] = 0 121 } 122 } 123 } 124 125 func (w *huffmanBitWriter) flushBits() { 126 if w.err != nil { 127 w.nbits = 0 128 return 129 } 130 bits := w.bits 131 w.bits >>= 16 132 w.nbits -= 16 133 n := w.nbytes 134 w.bytes[n] = byte(bits) 135 w.bytes[n+1] = byte(bits >> 8) 136 if n += 2; n >= len(w.bytes) { 137 _, w.err = w.w.Write(w.bytes[0:]) 138 n = 0 139 } 140 w.nbytes = n 141 } 142 143 func (w *huffmanBitWriter) flush() { 144 if w.err != nil { 145 w.nbits = 0 146 return 147 } 148 n := w.nbytes 149 if w.nbits > 8 { 150 w.bytes[n] = byte(w.bits) 151 w.bits >>= 8 152 w.nbits -= 8 153 n++ 154 } 155 if w.nbits > 0 { 156 w.bytes[n] = byte(w.bits) 157 w.nbits = 0 158 n++ 159 } 160 w.bits = 0 161 _, w.err = w.w.Write(w.bytes[0:n]) 162 w.nbytes = 0 163 } 164 165 func (w *huffmanBitWriter) writeBits(b, nb int32) { 166 w.bits |= uint32(b) << w.nbits 167 if w.nbits += uint32(nb); w.nbits >= 16 { 168 w.flushBits() 169 } 170 } 171 172 func (w *huffmanBitWriter) writeBytes(bytes []byte) { 173 if w.err != nil { 174 return 175 } 176 n := w.nbytes 177 if w.nbits == 8 { 178 w.bytes[n] = byte(w.bits) 179 w.nbits = 0 180 n++ 181 } 182 if w.nbits != 0 { 183 w.err = InternalError("writeBytes with unfinished bits") 184 return 185 } 186 if n != 0 { 187 _, w.err = w.w.Write(w.bytes[0:n]) 188 if w.err != nil { 189 return 190 } 191 } 192 w.nbytes = 0 193 _, w.err = w.w.Write(bytes) 194 } 195 196 // RFC 1951 3.2.7 specifies a special run-length encoding for specifying 197 // the literal and offset lengths arrays (which are concatenated into a single 198 // array). This method generates that run-length encoding. 199 // 200 // The result is written into the codegen array, and the frequencies 201 // of each code is written into the codegenFreq array. 202 // Codes 0-15 are single byte codes. Codes 16-18 are followed by additional 203 // information. Code badCode is an end marker 204 // 205 // numLiterals The number of literals in literalEncoding 206 // numOffsets The number of offsets in offsetEncoding 207 func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int) { 208 for i := range w.codegenFreq { 209 w.codegenFreq[i] = 0 210 } 211 // Note that we are using codegen both as a temporary variable for holding 212 // a copy of the frequencies, and as the place where we put the result. 213 // This is fine because the output is always shorter than the input used 214 // so far. 215 codegen := w.codegen // cache 216 // Copy the concatenated code sizes to codegen. Put a marker at the end. 217 copy(codegen[0:numLiterals], w.literalEncoding.codeBits) 218 copy(codegen[numLiterals:numLiterals+numOffsets], w.offsetEncoding.codeBits) 219 codegen[numLiterals+numOffsets] = badCode 220 221 size := codegen[0] 222 count := 1 223 outIndex := 0 224 for inIndex := 1; size != badCode; inIndex++ { 225 // INVARIANT: We have seen "count" copies of size that have not yet 226 // had output generated for them. 227 nextSize := codegen[inIndex] 228 if nextSize == size { 229 count++ 230 continue 231 } 232 // We need to generate codegen indicating "count" of size. 233 if size != 0 { 234 codegen[outIndex] = size 235 outIndex++ 236 w.codegenFreq[size]++ 237 count-- 238 for count >= 3 { 239 n := 6 240 if n > count { 241 n = count 242 } 243 codegen[outIndex] = 16 244 outIndex++ 245 codegen[outIndex] = uint8(n - 3) 246 outIndex++ 247 w.codegenFreq[16]++ 248 count -= n 249 } 250 } else { 251 for count >= 11 { 252 n := 138 253 if n > count { 254 n = count 255 } 256 codegen[outIndex] = 18 257 outIndex++ 258 codegen[outIndex] = uint8(n - 11) 259 outIndex++ 260 w.codegenFreq[18]++ 261 count -= n 262 } 263 if count >= 3 { 264 // count >= 3 && count <= 10 265 codegen[outIndex] = 17 266 outIndex++ 267 codegen[outIndex] = uint8(count - 3) 268 outIndex++ 269 w.codegenFreq[17]++ 270 count = 0 271 } 272 } 273 count-- 274 for ; count >= 0; count-- { 275 codegen[outIndex] = size 276 outIndex++ 277 w.codegenFreq[size]++ 278 } 279 // Set up invariant for next time through the loop. 280 size = nextSize 281 count = 1 282 } 283 // Marker indicating the end of the codegen. 284 codegen[outIndex] = badCode 285 } 286 287 func (w *huffmanBitWriter) writeCode(code *huffmanEncoder, literal uint32) { 288 if w.err != nil { 289 return 290 } 291 w.writeBits(int32(code.code[literal]), int32(code.codeBits[literal])) 292 } 293 294 // Write the header of a dynamic Huffman block to the output stream. 295 // 296 // numLiterals The number of literals specified in codegen 297 // numOffsets The number of offsets specified in codegen 298 // numCodegens The number of codegens used in codegen 299 func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) { 300 if w.err != nil { 301 return 302 } 303 var firstBits int32 = 4 304 if isEof { 305 firstBits = 5 306 } 307 w.writeBits(firstBits, 3) 308 w.writeBits(int32(numLiterals-257), 5) 309 w.writeBits(int32(numOffsets-1), 5) 310 w.writeBits(int32(numCodegens-4), 4) 311 312 for i := 0; i < numCodegens; i++ { 313 value := w.codegenEncoding.codeBits[codegenOrder[i]] 314 w.writeBits(int32(value), 3) 315 } 316 317 i := 0 318 for { 319 var codeWord int = int(w.codegen[i]) 320 i++ 321 if codeWord == badCode { 322 break 323 } 324 // The low byte contains the actual code to generate. 325 w.writeCode(w.codegenEncoding, uint32(codeWord)) 326 327 switch codeWord { 328 case 16: 329 w.writeBits(int32(w.codegen[i]), 2) 330 i++ 331 break 332 case 17: 333 w.writeBits(int32(w.codegen[i]), 3) 334 i++ 335 break 336 case 18: 337 w.writeBits(int32(w.codegen[i]), 7) 338 i++ 339 break 340 } 341 } 342 } 343 344 func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) { 345 if w.err != nil { 346 return 347 } 348 var flag int32 349 if isEof { 350 flag = 1 351 } 352 w.writeBits(flag, 3) 353 w.flush() 354 w.writeBits(int32(length), 16) 355 w.writeBits(int32(^uint16(length)), 16) 356 } 357 358 func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { 359 if w.err != nil { 360 return 361 } 362 // Indicate that we are a fixed Huffman block 363 var value int32 = 2 364 if isEof { 365 value = 3 366 } 367 w.writeBits(value, 3) 368 } 369 370 func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { 371 if w.err != nil { 372 return 373 } 374 for i := range w.literalFreq { 375 w.literalFreq[i] = 0 376 } 377 for i := range w.offsetFreq { 378 w.offsetFreq[i] = 0 379 } 380 381 n := len(tokens) 382 tokens = tokens[0 : n+1] 383 tokens[n] = endBlockMarker 384 385 for _, t := range tokens { 386 switch t.typ() { 387 case literalType: 388 w.literalFreq[t.literal()]++ 389 case matchType: 390 length := t.length() 391 offset := t.offset() 392 w.literalFreq[lengthCodesStart+lengthCode(length)]++ 393 w.offsetFreq[offsetCode(offset)]++ 394 } 395 } 396 397 // get the number of literals 398 numLiterals := len(w.literalFreq) 399 for w.literalFreq[numLiterals-1] == 0 { 400 numLiterals-- 401 } 402 // get the number of offsets 403 numOffsets := len(w.offsetFreq) 404 for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 { 405 numOffsets-- 406 } 407 if numOffsets == 0 { 408 // We haven't found a single match. If we want to go with the dynamic encoding, 409 // we should count at least one offset to be sure that the offset huffman tree could be encoded. 410 w.offsetFreq[0] = 1 411 numOffsets = 1 412 } 413 414 w.literalEncoding.generate(w.literalFreq, 15) 415 w.offsetEncoding.generate(w.offsetFreq, 15) 416 417 storedBytes := 0 418 if input != nil { 419 storedBytes = len(input) 420 } 421 var extraBits int64 422 var storedSize int64 = math.MaxInt64 423 if storedBytes <= maxStoreBlockSize && input != nil { 424 storedSize = int64((storedBytes + 5) * 8) 425 // We only bother calculating the costs of the extra bits required by 426 // the length of offset fields (which will be the same for both fixed 427 // and dynamic encoding), if we need to compare those two encodings 428 // against stored encoding. 429 for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ { 430 // First eight length codes have extra size = 0. 431 extraBits += int64(w.literalFreq[lengthCode]) * int64(lengthExtraBits[lengthCode-lengthCodesStart]) 432 } 433 for offsetCode := 4; offsetCode < numOffsets; offsetCode++ { 434 // First four offset codes have extra size = 0. 435 extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode]) 436 } 437 } 438 439 // Figure out smallest code. 440 // Fixed Huffman baseline. 441 var size = int64(3) + 442 fixedLiteralEncoding.bitLength(w.literalFreq) + 443 fixedOffsetEncoding.bitLength(w.offsetFreq) + 444 extraBits 445 var literalEncoding = fixedLiteralEncoding 446 var offsetEncoding = fixedOffsetEncoding 447 448 // Dynamic Huffman? 449 var numCodegens int 450 451 // Generate codegen and codegenFrequencies, which indicates how to encode 452 // the literalEncoding and the offsetEncoding. 453 w.generateCodegen(numLiterals, numOffsets) 454 w.codegenEncoding.generate(w.codegenFreq, 7) 455 numCodegens = len(w.codegenFreq) 456 for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { 457 numCodegens-- 458 } 459 dynamicHeader := int64(3+5+5+4+(3*numCodegens)) + 460 w.codegenEncoding.bitLength(w.codegenFreq) + 461 int64(extraBits) + 462 int64(w.codegenFreq[16]*2) + 463 int64(w.codegenFreq[17]*3) + 464 int64(w.codegenFreq[18]*7) 465 dynamicSize := dynamicHeader + 466 w.literalEncoding.bitLength(w.literalFreq) + 467 w.offsetEncoding.bitLength(w.offsetFreq) 468 469 if dynamicSize < size { 470 size = dynamicSize 471 literalEncoding = w.literalEncoding 472 offsetEncoding = w.offsetEncoding 473 } 474 475 // Stored bytes? 476 if storedSize < size { 477 w.writeStoredHeader(storedBytes, eof) 478 w.writeBytes(input[0:storedBytes]) 479 return 480 } 481 482 // Huffman. 483 if literalEncoding == fixedLiteralEncoding { 484 w.writeFixedHeader(eof) 485 } else { 486 w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) 487 } 488 for _, t := range tokens { 489 switch t.typ() { 490 case literalType: 491 w.writeCode(literalEncoding, t.literal()) 492 break 493 case matchType: 494 // Write the length 495 length := t.length() 496 lengthCode := lengthCode(length) 497 w.writeCode(literalEncoding, lengthCode+lengthCodesStart) 498 extraLengthBits := int32(lengthExtraBits[lengthCode]) 499 if extraLengthBits > 0 { 500 extraLength := int32(length - lengthBase[lengthCode]) 501 w.writeBits(extraLength, extraLengthBits) 502 } 503 // Write the offset 504 offset := t.offset() 505 offsetCode := offsetCode(offset) 506 w.writeCode(offsetEncoding, offsetCode) 507 extraOffsetBits := int32(offsetExtraBits[offsetCode]) 508 if extraOffsetBits > 0 { 509 extraOffset := int32(offset - offsetBase[offsetCode]) 510 w.writeBits(extraOffset, extraOffsetBits) 511 } 512 break 513 default: 514 panic("unknown token type: " + string(t)) 515 } 516 } 517 }