github.com/xushiwei/go@v0.0.0-20130601165731-2b9d83f45bc9/src/pkg/compress/flate/huffman_bit_writer.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package flate 6 7 import ( 8 "io" 9 "math" 10 ) 11 12 const ( 13 // The largest offset code. 14 offsetCodeCount = 30 15 16 // The special code used to mark the end of a block. 17 endBlockMarker = 256 18 19 // The first length code. 20 lengthCodesStart = 257 21 22 // The number of codegen codes. 23 codegenCodeCount = 19 24 badCode = 255 25 ) 26 27 // The number of extra bits needed by length code X - LENGTH_CODES_START. 28 var lengthExtraBits = []int8{ 29 /* 257 */ 0, 0, 0, 30 /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 31 /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 32 /* 280 */ 4, 5, 5, 5, 5, 0, 33 } 34 35 // The length indicated by length code X - LENGTH_CODES_START. 36 var lengthBase = []uint32{ 37 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 38 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 39 64, 80, 96, 112, 128, 160, 192, 224, 255, 40 } 41 42 // offset code word extra bits. 43 var offsetExtraBits = []int8{ 44 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 45 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 46 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 47 /* extended window */ 48 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 49 } 50 51 var offsetBase = []uint32{ 52 /* normal deflate */ 53 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, 54 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, 55 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, 56 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, 57 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, 58 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, 59 60 /* extended window */ 61 0x008000, 0x00c000, 0x010000, 0x018000, 0x020000, 62 0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000, 63 0x100000, 0x180000, 0x200000, 0x300000, 64 } 65 66 // The odd order in which the codegen code sizes are written. 67 var codegenOrder = []uint32{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15} 68 69 type huffmanBitWriter struct { 70 w io.Writer 71 // Data waiting to be written is bytes[0:nbytes] 72 // and then the low nbits of bits. 73 bits uint32 74 nbits uint32 75 bytes [64]byte 76 nbytes int 77 literalFreq []int32 78 offsetFreq []int32 79 codegen []uint8 80 codegenFreq []int32 81 literalEncoding *huffmanEncoder 82 offsetEncoding *huffmanEncoder 83 codegenEncoding *huffmanEncoder 84 err error 85 } 86 87 func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { 88 return &huffmanBitWriter{ 89 w: w, 90 literalFreq: make([]int32, maxLit), 91 offsetFreq: make([]int32, offsetCodeCount), 92 codegen: make([]uint8, maxLit+offsetCodeCount+1), 93 codegenFreq: make([]int32, codegenCodeCount), 94 literalEncoding: newHuffmanEncoder(maxLit), 95 offsetEncoding: newHuffmanEncoder(offsetCodeCount), 96 codegenEncoding: newHuffmanEncoder(codegenCodeCount), 97 } 98 } 99 100 func (w *huffmanBitWriter) flushBits() { 101 if w.err != nil { 102 w.nbits = 0 103 return 104 } 105 bits := w.bits 106 w.bits >>= 16 107 w.nbits -= 16 108 n := w.nbytes 109 w.bytes[n] = byte(bits) 110 w.bytes[n+1] = byte(bits >> 8) 111 if n += 2; n >= len(w.bytes) { 112 _, w.err = w.w.Write(w.bytes[0:]) 113 n = 0 114 } 115 w.nbytes = n 116 } 117 118 func (w *huffmanBitWriter) flush() { 119 if w.err != nil { 120 w.nbits = 0 121 return 122 } 123 n := w.nbytes 124 if w.nbits > 8 { 125 w.bytes[n] = byte(w.bits) 126 w.bits >>= 8 127 w.nbits -= 8 128 n++ 129 } 130 if w.nbits > 0 { 131 w.bytes[n] = byte(w.bits) 132 w.nbits = 0 133 n++ 134 } 135 w.bits = 0 136 _, w.err = w.w.Write(w.bytes[0:n]) 137 w.nbytes = 0 138 } 139 140 func (w *huffmanBitWriter) writeBits(b, nb int32) { 141 w.bits |= uint32(b) << w.nbits 142 if w.nbits += uint32(nb); w.nbits >= 16 { 143 w.flushBits() 144 } 145 } 146 147 func (w *huffmanBitWriter) writeBytes(bytes []byte) { 148 if w.err != nil { 149 return 150 } 151 n := w.nbytes 152 if w.nbits == 8 { 153 w.bytes[n] = byte(w.bits) 154 w.nbits = 0 155 n++ 156 } 157 if w.nbits != 0 { 158 w.err = InternalError("writeBytes with unfinished bits") 159 return 160 } 161 if n != 0 { 162 _, w.err = w.w.Write(w.bytes[0:n]) 163 if w.err != nil { 164 return 165 } 166 } 167 w.nbytes = 0 168 _, w.err = w.w.Write(bytes) 169 } 170 171 // RFC 1951 3.2.7 specifies a special run-length encoding for specifying 172 // the literal and offset lengths arrays (which are concatenated into a single 173 // array). This method generates that run-length encoding. 174 // 175 // The result is written into the codegen array, and the frequencies 176 // of each code is written into the codegenFreq array. 177 // Codes 0-15 are single byte codes. Codes 16-18 are followed by additional 178 // information. Code badCode is an end marker 179 // 180 // numLiterals The number of literals in literalEncoding 181 // numOffsets The number of offsets in offsetEncoding 182 func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int) { 183 for i := range w.codegenFreq { 184 w.codegenFreq[i] = 0 185 } 186 // Note that we are using codegen both as a temporary variable for holding 187 // a copy of the frequencies, and as the place where we put the result. 188 // This is fine because the output is always shorter than the input used 189 // so far. 190 codegen := w.codegen // cache 191 // Copy the concatenated code sizes to codegen. Put a marker at the end. 192 copy(codegen[0:numLiterals], w.literalEncoding.codeBits) 193 copy(codegen[numLiterals:numLiterals+numOffsets], w.offsetEncoding.codeBits) 194 codegen[numLiterals+numOffsets] = badCode 195 196 size := codegen[0] 197 count := 1 198 outIndex := 0 199 for inIndex := 1; size != badCode; inIndex++ { 200 // INVARIANT: We have seen "count" copies of size that have not yet 201 // had output generated for them. 202 nextSize := codegen[inIndex] 203 if nextSize == size { 204 count++ 205 continue 206 } 207 // We need to generate codegen indicating "count" of size. 208 if size != 0 { 209 codegen[outIndex] = size 210 outIndex++ 211 w.codegenFreq[size]++ 212 count-- 213 for count >= 3 { 214 n := 6 215 if n > count { 216 n = count 217 } 218 codegen[outIndex] = 16 219 outIndex++ 220 codegen[outIndex] = uint8(n - 3) 221 outIndex++ 222 w.codegenFreq[16]++ 223 count -= n 224 } 225 } else { 226 for count >= 11 { 227 n := 138 228 if n > count { 229 n = count 230 } 231 codegen[outIndex] = 18 232 outIndex++ 233 codegen[outIndex] = uint8(n - 11) 234 outIndex++ 235 w.codegenFreq[18]++ 236 count -= n 237 } 238 if count >= 3 { 239 // count >= 3 && count <= 10 240 codegen[outIndex] = 17 241 outIndex++ 242 codegen[outIndex] = uint8(count - 3) 243 outIndex++ 244 w.codegenFreq[17]++ 245 count = 0 246 } 247 } 248 count-- 249 for ; count >= 0; count-- { 250 codegen[outIndex] = size 251 outIndex++ 252 w.codegenFreq[size]++ 253 } 254 // Set up invariant for next time through the loop. 255 size = nextSize 256 count = 1 257 } 258 // Marker indicating the end of the codegen. 259 codegen[outIndex] = badCode 260 } 261 262 func (w *huffmanBitWriter) writeCode(code *huffmanEncoder, literal uint32) { 263 if w.err != nil { 264 return 265 } 266 w.writeBits(int32(code.code[literal]), int32(code.codeBits[literal])) 267 } 268 269 // Write the header of a dynamic Huffman block to the output stream. 270 // 271 // numLiterals The number of literals specified in codegen 272 // numOffsets The number of offsets specified in codegen 273 // numCodegens The number of codegens used in codegen 274 func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) { 275 if w.err != nil { 276 return 277 } 278 var firstBits int32 = 4 279 if isEof { 280 firstBits = 5 281 } 282 w.writeBits(firstBits, 3) 283 w.writeBits(int32(numLiterals-257), 5) 284 w.writeBits(int32(numOffsets-1), 5) 285 w.writeBits(int32(numCodegens-4), 4) 286 287 for i := 0; i < numCodegens; i++ { 288 value := w.codegenEncoding.codeBits[codegenOrder[i]] 289 w.writeBits(int32(value), 3) 290 } 291 292 i := 0 293 for { 294 var codeWord int = int(w.codegen[i]) 295 i++ 296 if codeWord == badCode { 297 break 298 } 299 // The low byte contains the actual code to generate. 300 w.writeCode(w.codegenEncoding, uint32(codeWord)) 301 302 switch codeWord { 303 case 16: 304 w.writeBits(int32(w.codegen[i]), 2) 305 i++ 306 break 307 case 17: 308 w.writeBits(int32(w.codegen[i]), 3) 309 i++ 310 break 311 case 18: 312 w.writeBits(int32(w.codegen[i]), 7) 313 i++ 314 break 315 } 316 } 317 } 318 319 func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) { 320 if w.err != nil { 321 return 322 } 323 var flag int32 324 if isEof { 325 flag = 1 326 } 327 w.writeBits(flag, 3) 328 w.flush() 329 w.writeBits(int32(length), 16) 330 w.writeBits(int32(^uint16(length)), 16) 331 } 332 333 func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { 334 if w.err != nil { 335 return 336 } 337 // Indicate that we are a fixed Huffman block 338 var value int32 = 2 339 if isEof { 340 value = 3 341 } 342 w.writeBits(value, 3) 343 } 344 345 func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { 346 if w.err != nil { 347 return 348 } 349 for i := range w.literalFreq { 350 w.literalFreq[i] = 0 351 } 352 for i := range w.offsetFreq { 353 w.offsetFreq[i] = 0 354 } 355 356 n := len(tokens) 357 tokens = tokens[0 : n+1] 358 tokens[n] = endBlockMarker 359 360 for _, t := range tokens { 361 switch t.typ() { 362 case literalType: 363 w.literalFreq[t.literal()]++ 364 case matchType: 365 length := t.length() 366 offset := t.offset() 367 w.literalFreq[lengthCodesStart+lengthCode(length)]++ 368 w.offsetFreq[offsetCode(offset)]++ 369 } 370 } 371 372 // get the number of literals 373 numLiterals := len(w.literalFreq) 374 for w.literalFreq[numLiterals-1] == 0 { 375 numLiterals-- 376 } 377 // get the number of offsets 378 numOffsets := len(w.offsetFreq) 379 for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 { 380 numOffsets-- 381 } 382 if numOffsets == 0 { 383 // We haven't found a single match. If we want to go with the dynamic encoding, 384 // we should count at least one offset to be sure that the offset huffman tree could be encoded. 385 w.offsetFreq[0] = 1 386 numOffsets = 1 387 } 388 389 w.literalEncoding.generate(w.literalFreq, 15) 390 w.offsetEncoding.generate(w.offsetFreq, 15) 391 392 storedBytes := 0 393 if input != nil { 394 storedBytes = len(input) 395 } 396 var extraBits int64 397 var storedSize int64 = math.MaxInt64 398 if storedBytes <= maxStoreBlockSize && input != nil { 399 storedSize = int64((storedBytes + 5) * 8) 400 // We only bother calculating the costs of the extra bits required by 401 // the length of offset fields (which will be the same for both fixed 402 // and dynamic encoding), if we need to compare those two encodings 403 // against stored encoding. 404 for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ { 405 // First eight length codes have extra size = 0. 406 extraBits += int64(w.literalFreq[lengthCode]) * int64(lengthExtraBits[lengthCode-lengthCodesStart]) 407 } 408 for offsetCode := 4; offsetCode < numOffsets; offsetCode++ { 409 // First four offset codes have extra size = 0. 410 extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode]) 411 } 412 } 413 414 // Figure out smallest code. 415 // Fixed Huffman baseline. 416 var size = int64(3) + 417 fixedLiteralEncoding.bitLength(w.literalFreq) + 418 fixedOffsetEncoding.bitLength(w.offsetFreq) + 419 extraBits 420 var literalEncoding = fixedLiteralEncoding 421 var offsetEncoding = fixedOffsetEncoding 422 423 // Dynamic Huffman? 424 var numCodegens int 425 426 // Generate codegen and codegenFrequencies, which indicates how to encode 427 // the literalEncoding and the offsetEncoding. 428 w.generateCodegen(numLiterals, numOffsets) 429 w.codegenEncoding.generate(w.codegenFreq, 7) 430 numCodegens = len(w.codegenFreq) 431 for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { 432 numCodegens-- 433 } 434 dynamicHeader := int64(3+5+5+4+(3*numCodegens)) + 435 w.codegenEncoding.bitLength(w.codegenFreq) + 436 int64(extraBits) + 437 int64(w.codegenFreq[16]*2) + 438 int64(w.codegenFreq[17]*3) + 439 int64(w.codegenFreq[18]*7) 440 dynamicSize := dynamicHeader + 441 w.literalEncoding.bitLength(w.literalFreq) + 442 w.offsetEncoding.bitLength(w.offsetFreq) 443 444 if dynamicSize < size { 445 size = dynamicSize 446 literalEncoding = w.literalEncoding 447 offsetEncoding = w.offsetEncoding 448 } 449 450 // Stored bytes? 451 if storedSize < size { 452 w.writeStoredHeader(storedBytes, eof) 453 w.writeBytes(input[0:storedBytes]) 454 return 455 } 456 457 // Huffman. 458 if literalEncoding == fixedLiteralEncoding { 459 w.writeFixedHeader(eof) 460 } else { 461 w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) 462 } 463 for _, t := range tokens { 464 switch t.typ() { 465 case literalType: 466 w.writeCode(literalEncoding, t.literal()) 467 break 468 case matchType: 469 // Write the length 470 length := t.length() 471 lengthCode := lengthCode(length) 472 w.writeCode(literalEncoding, lengthCode+lengthCodesStart) 473 extraLengthBits := int32(lengthExtraBits[lengthCode]) 474 if extraLengthBits > 0 { 475 extraLength := int32(length - lengthBase[lengthCode]) 476 w.writeBits(extraLength, extraLengthBits) 477 } 478 // Write the offset 479 offset := t.offset() 480 offsetCode := offsetCode(offset) 481 w.writeCode(offsetEncoding, offsetCode) 482 extraOffsetBits := int32(offsetExtraBits[offsetCode]) 483 if extraOffsetBits > 0 { 484 extraOffset := int32(offset - offsetBase[offsetCode]) 485 w.writeBits(extraOffset, extraOffsetBits) 486 } 487 break 488 default: 489 panic("unknown token type: " + string(t)) 490 } 491 } 492 }