github.com/zach-klippenstein/go@v0.0.0-20150108044943-fcfbeb3adf58/src/compress/flate/huffman_bit_writer.go (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package flate
     6  
     7  import (
     8  	"io"
     9  	"math"
    10  )
    11  
    12  const (
    13  	// The largest offset code.
    14  	offsetCodeCount = 30
    15  
    16  	// The special code used to mark the end of a block.
    17  	endBlockMarker = 256
    18  
    19  	// The first length code.
    20  	lengthCodesStart = 257
    21  
    22  	// The number of codegen codes.
    23  	codegenCodeCount = 19
    24  	badCode          = 255
    25  )
    26  
    27  // The number of extra bits needed by length code X - LENGTH_CODES_START.
    28  var lengthExtraBits = []int8{
    29  	/* 257 */ 0, 0, 0,
    30  	/* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
    31  	/* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
    32  	/* 280 */ 4, 5, 5, 5, 5, 0,
    33  }
    34  
    35  // The length indicated by length code X - LENGTH_CODES_START.
    36  var lengthBase = []uint32{
    37  	0, 1, 2, 3, 4, 5, 6, 7, 8, 10,
    38  	12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
    39  	64, 80, 96, 112, 128, 160, 192, 224, 255,
    40  }
    41  
    42  // offset code word extra bits.
    43  var offsetExtraBits = []int8{
    44  	0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
    45  	4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
    46  	9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
    47  	/* extended window */
    48  	14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
    49  }
    50  
    51  var offsetBase = []uint32{
    52  	/* normal deflate */
    53  	0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
    54  	0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
    55  	0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
    56  	0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
    57  	0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
    58  	0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
    59  
    60  	/* extended window */
    61  	0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
    62  	0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
    63  	0x100000, 0x180000, 0x200000, 0x300000,
    64  }
    65  
    66  // The odd order in which the codegen code sizes are written.
    67  var codegenOrder = []uint32{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
    68  
    69  type huffmanBitWriter struct {
    70  	w io.Writer
    71  	// Data waiting to be written is bytes[0:nbytes]
    72  	// and then the low nbits of bits.
    73  	bits            uint32
    74  	nbits           uint32
    75  	bytes           [64]byte
    76  	nbytes          int
    77  	literalFreq     []int32
    78  	offsetFreq      []int32
    79  	codegen         []uint8
    80  	codegenFreq     []int32
    81  	literalEncoding *huffmanEncoder
    82  	offsetEncoding  *huffmanEncoder
    83  	codegenEncoding *huffmanEncoder
    84  	err             error
    85  }
    86  
    87  func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
    88  	return &huffmanBitWriter{
    89  		w:               w,
    90  		literalFreq:     make([]int32, maxLit),
    91  		offsetFreq:      make([]int32, offsetCodeCount),
    92  		codegen:         make([]uint8, maxLit+offsetCodeCount+1),
    93  		codegenFreq:     make([]int32, codegenCodeCount),
    94  		literalEncoding: newHuffmanEncoder(maxLit),
    95  		offsetEncoding:  newHuffmanEncoder(offsetCodeCount),
    96  		codegenEncoding: newHuffmanEncoder(codegenCodeCount),
    97  	}
    98  }
    99  
   100  func (w *huffmanBitWriter) reset(writer io.Writer) {
   101  	w.w = writer
   102  	w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil
   103  	w.bytes = [64]byte{}
   104  	for i := range w.codegen {
   105  		w.codegen[i] = 0
   106  	}
   107  	for _, s := range [...][]int32{w.literalFreq, w.offsetFreq, w.codegenFreq} {
   108  		for i := range s {
   109  			s[i] = 0
   110  		}
   111  	}
   112  	for _, enc := range [...]*huffmanEncoder{
   113  		w.literalEncoding,
   114  		w.offsetEncoding,
   115  		w.codegenEncoding} {
   116  		for i := range enc.code {
   117  			enc.code[i] = 0
   118  		}
   119  		for i := range enc.codeBits {
   120  			enc.codeBits[i] = 0
   121  		}
   122  	}
   123  }
   124  
   125  func (w *huffmanBitWriter) flushBits() {
   126  	if w.err != nil {
   127  		w.nbits = 0
   128  		return
   129  	}
   130  	bits := w.bits
   131  	w.bits >>= 16
   132  	w.nbits -= 16
   133  	n := w.nbytes
   134  	w.bytes[n] = byte(bits)
   135  	w.bytes[n+1] = byte(bits >> 8)
   136  	if n += 2; n >= len(w.bytes) {
   137  		_, w.err = w.w.Write(w.bytes[0:])
   138  		n = 0
   139  	}
   140  	w.nbytes = n
   141  }
   142  
   143  func (w *huffmanBitWriter) flush() {
   144  	if w.err != nil {
   145  		w.nbits = 0
   146  		return
   147  	}
   148  	n := w.nbytes
   149  	if w.nbits > 8 {
   150  		w.bytes[n] = byte(w.bits)
   151  		w.bits >>= 8
   152  		w.nbits -= 8
   153  		n++
   154  	}
   155  	if w.nbits > 0 {
   156  		w.bytes[n] = byte(w.bits)
   157  		w.nbits = 0
   158  		n++
   159  	}
   160  	w.bits = 0
   161  	_, w.err = w.w.Write(w.bytes[0:n])
   162  	w.nbytes = 0
   163  }
   164  
   165  func (w *huffmanBitWriter) writeBits(b, nb int32) {
   166  	w.bits |= uint32(b) << w.nbits
   167  	if w.nbits += uint32(nb); w.nbits >= 16 {
   168  		w.flushBits()
   169  	}
   170  }
   171  
   172  func (w *huffmanBitWriter) writeBytes(bytes []byte) {
   173  	if w.err != nil {
   174  		return
   175  	}
   176  	n := w.nbytes
   177  	if w.nbits == 8 {
   178  		w.bytes[n] = byte(w.bits)
   179  		w.nbits = 0
   180  		n++
   181  	}
   182  	if w.nbits != 0 {
   183  		w.err = InternalError("writeBytes with unfinished bits")
   184  		return
   185  	}
   186  	if n != 0 {
   187  		_, w.err = w.w.Write(w.bytes[0:n])
   188  		if w.err != nil {
   189  			return
   190  		}
   191  	}
   192  	w.nbytes = 0
   193  	_, w.err = w.w.Write(bytes)
   194  }
   195  
   196  // RFC 1951 3.2.7 specifies a special run-length encoding for specifying
   197  // the literal and offset lengths arrays (which are concatenated into a single
   198  // array).  This method generates that run-length encoding.
   199  //
   200  // The result is written into the codegen array, and the frequencies
   201  // of each code is written into the codegenFreq array.
   202  // Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
   203  // information.  Code badCode is an end marker
   204  //
   205  //  numLiterals      The number of literals in literalEncoding
   206  //  numOffsets       The number of offsets in offsetEncoding
   207  func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int) {
   208  	for i := range w.codegenFreq {
   209  		w.codegenFreq[i] = 0
   210  	}
   211  	// Note that we are using codegen both as a temporary variable for holding
   212  	// a copy of the frequencies, and as the place where we put the result.
   213  	// This is fine because the output is always shorter than the input used
   214  	// so far.
   215  	codegen := w.codegen // cache
   216  	// Copy the concatenated code sizes to codegen.  Put a marker at the end.
   217  	copy(codegen[0:numLiterals], w.literalEncoding.codeBits)
   218  	copy(codegen[numLiterals:numLiterals+numOffsets], w.offsetEncoding.codeBits)
   219  	codegen[numLiterals+numOffsets] = badCode
   220  
   221  	size := codegen[0]
   222  	count := 1
   223  	outIndex := 0
   224  	for inIndex := 1; size != badCode; inIndex++ {
   225  		// INVARIANT: We have seen "count" copies of size that have not yet
   226  		// had output generated for them.
   227  		nextSize := codegen[inIndex]
   228  		if nextSize == size {
   229  			count++
   230  			continue
   231  		}
   232  		// We need to generate codegen indicating "count" of size.
   233  		if size != 0 {
   234  			codegen[outIndex] = size
   235  			outIndex++
   236  			w.codegenFreq[size]++
   237  			count--
   238  			for count >= 3 {
   239  				n := 6
   240  				if n > count {
   241  					n = count
   242  				}
   243  				codegen[outIndex] = 16
   244  				outIndex++
   245  				codegen[outIndex] = uint8(n - 3)
   246  				outIndex++
   247  				w.codegenFreq[16]++
   248  				count -= n
   249  			}
   250  		} else {
   251  			for count >= 11 {
   252  				n := 138
   253  				if n > count {
   254  					n = count
   255  				}
   256  				codegen[outIndex] = 18
   257  				outIndex++
   258  				codegen[outIndex] = uint8(n - 11)
   259  				outIndex++
   260  				w.codegenFreq[18]++
   261  				count -= n
   262  			}
   263  			if count >= 3 {
   264  				// count >= 3 && count <= 10
   265  				codegen[outIndex] = 17
   266  				outIndex++
   267  				codegen[outIndex] = uint8(count - 3)
   268  				outIndex++
   269  				w.codegenFreq[17]++
   270  				count = 0
   271  			}
   272  		}
   273  		count--
   274  		for ; count >= 0; count-- {
   275  			codegen[outIndex] = size
   276  			outIndex++
   277  			w.codegenFreq[size]++
   278  		}
   279  		// Set up invariant for next time through the loop.
   280  		size = nextSize
   281  		count = 1
   282  	}
   283  	// Marker indicating the end of the codegen.
   284  	codegen[outIndex] = badCode
   285  }
   286  
   287  func (w *huffmanBitWriter) writeCode(code *huffmanEncoder, literal uint32) {
   288  	if w.err != nil {
   289  		return
   290  	}
   291  	w.writeBits(int32(code.code[literal]), int32(code.codeBits[literal]))
   292  }
   293  
   294  // Write the header of a dynamic Huffman block to the output stream.
   295  //
   296  //  numLiterals  The number of literals specified in codegen
   297  //  numOffsets   The number of offsets specified in codegen
   298  //  numCodegens  The number of codegens used in codegen
   299  func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) {
   300  	if w.err != nil {
   301  		return
   302  	}
   303  	var firstBits int32 = 4
   304  	if isEof {
   305  		firstBits = 5
   306  	}
   307  	w.writeBits(firstBits, 3)
   308  	w.writeBits(int32(numLiterals-257), 5)
   309  	w.writeBits(int32(numOffsets-1), 5)
   310  	w.writeBits(int32(numCodegens-4), 4)
   311  
   312  	for i := 0; i < numCodegens; i++ {
   313  		value := w.codegenEncoding.codeBits[codegenOrder[i]]
   314  		w.writeBits(int32(value), 3)
   315  	}
   316  
   317  	i := 0
   318  	for {
   319  		var codeWord int = int(w.codegen[i])
   320  		i++
   321  		if codeWord == badCode {
   322  			break
   323  		}
   324  		// The low byte contains the actual code to generate.
   325  		w.writeCode(w.codegenEncoding, uint32(codeWord))
   326  
   327  		switch codeWord {
   328  		case 16:
   329  			w.writeBits(int32(w.codegen[i]), 2)
   330  			i++
   331  			break
   332  		case 17:
   333  			w.writeBits(int32(w.codegen[i]), 3)
   334  			i++
   335  			break
   336  		case 18:
   337  			w.writeBits(int32(w.codegen[i]), 7)
   338  			i++
   339  			break
   340  		}
   341  	}
   342  }
   343  
   344  func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) {
   345  	if w.err != nil {
   346  		return
   347  	}
   348  	var flag int32
   349  	if isEof {
   350  		flag = 1
   351  	}
   352  	w.writeBits(flag, 3)
   353  	w.flush()
   354  	w.writeBits(int32(length), 16)
   355  	w.writeBits(int32(^uint16(length)), 16)
   356  }
   357  
   358  func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
   359  	if w.err != nil {
   360  		return
   361  	}
   362  	// Indicate that we are a fixed Huffman block
   363  	var value int32 = 2
   364  	if isEof {
   365  		value = 3
   366  	}
   367  	w.writeBits(value, 3)
   368  }
   369  
   370  func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
   371  	if w.err != nil {
   372  		return
   373  	}
   374  	for i := range w.literalFreq {
   375  		w.literalFreq[i] = 0
   376  	}
   377  	for i := range w.offsetFreq {
   378  		w.offsetFreq[i] = 0
   379  	}
   380  
   381  	n := len(tokens)
   382  	tokens = tokens[0 : n+1]
   383  	tokens[n] = endBlockMarker
   384  
   385  	for _, t := range tokens {
   386  		switch t.typ() {
   387  		case literalType:
   388  			w.literalFreq[t.literal()]++
   389  		case matchType:
   390  			length := t.length()
   391  			offset := t.offset()
   392  			w.literalFreq[lengthCodesStart+lengthCode(length)]++
   393  			w.offsetFreq[offsetCode(offset)]++
   394  		}
   395  	}
   396  
   397  	// get the number of literals
   398  	numLiterals := len(w.literalFreq)
   399  	for w.literalFreq[numLiterals-1] == 0 {
   400  		numLiterals--
   401  	}
   402  	// get the number of offsets
   403  	numOffsets := len(w.offsetFreq)
   404  	for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 {
   405  		numOffsets--
   406  	}
   407  	if numOffsets == 0 {
   408  		// We haven't found a single match. If we want to go with the dynamic encoding,
   409  		// we should count at least one offset to be sure that the offset huffman tree could be encoded.
   410  		w.offsetFreq[0] = 1
   411  		numOffsets = 1
   412  	}
   413  
   414  	w.literalEncoding.generate(w.literalFreq, 15)
   415  	w.offsetEncoding.generate(w.offsetFreq, 15)
   416  
   417  	storedBytes := 0
   418  	if input != nil {
   419  		storedBytes = len(input)
   420  	}
   421  	var extraBits int64
   422  	var storedSize int64 = math.MaxInt64
   423  	if storedBytes <= maxStoreBlockSize && input != nil {
   424  		storedSize = int64((storedBytes + 5) * 8)
   425  		// We only bother calculating the costs of the extra bits required by
   426  		// the length of offset fields (which will be the same for both fixed
   427  		// and dynamic encoding), if we need to compare those two encodings
   428  		// against stored encoding.
   429  		for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ {
   430  			// First eight length codes have extra size = 0.
   431  			extraBits += int64(w.literalFreq[lengthCode]) * int64(lengthExtraBits[lengthCode-lengthCodesStart])
   432  		}
   433  		for offsetCode := 4; offsetCode < numOffsets; offsetCode++ {
   434  			// First four offset codes have extra size = 0.
   435  			extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode])
   436  		}
   437  	}
   438  
   439  	// Figure out smallest code.
   440  	// Fixed Huffman baseline.
   441  	var size = int64(3) +
   442  		fixedLiteralEncoding.bitLength(w.literalFreq) +
   443  		fixedOffsetEncoding.bitLength(w.offsetFreq) +
   444  		extraBits
   445  	var literalEncoding = fixedLiteralEncoding
   446  	var offsetEncoding = fixedOffsetEncoding
   447  
   448  	// Dynamic Huffman?
   449  	var numCodegens int
   450  
   451  	// Generate codegen and codegenFrequencies, which indicates how to encode
   452  	// the literalEncoding and the offsetEncoding.
   453  	w.generateCodegen(numLiterals, numOffsets)
   454  	w.codegenEncoding.generate(w.codegenFreq, 7)
   455  	numCodegens = len(w.codegenFreq)
   456  	for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
   457  		numCodegens--
   458  	}
   459  	dynamicHeader := int64(3+5+5+4+(3*numCodegens)) +
   460  		w.codegenEncoding.bitLength(w.codegenFreq) +
   461  		int64(extraBits) +
   462  		int64(w.codegenFreq[16]*2) +
   463  		int64(w.codegenFreq[17]*3) +
   464  		int64(w.codegenFreq[18]*7)
   465  	dynamicSize := dynamicHeader +
   466  		w.literalEncoding.bitLength(w.literalFreq) +
   467  		w.offsetEncoding.bitLength(w.offsetFreq)
   468  
   469  	if dynamicSize < size {
   470  		size = dynamicSize
   471  		literalEncoding = w.literalEncoding
   472  		offsetEncoding = w.offsetEncoding
   473  	}
   474  
   475  	// Stored bytes?
   476  	if storedSize < size {
   477  		w.writeStoredHeader(storedBytes, eof)
   478  		w.writeBytes(input[0:storedBytes])
   479  		return
   480  	}
   481  
   482  	// Huffman.
   483  	if literalEncoding == fixedLiteralEncoding {
   484  		w.writeFixedHeader(eof)
   485  	} else {
   486  		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
   487  	}
   488  	for _, t := range tokens {
   489  		switch t.typ() {
   490  		case literalType:
   491  			w.writeCode(literalEncoding, t.literal())
   492  			break
   493  		case matchType:
   494  			// Write the length
   495  			length := t.length()
   496  			lengthCode := lengthCode(length)
   497  			w.writeCode(literalEncoding, lengthCode+lengthCodesStart)
   498  			extraLengthBits := int32(lengthExtraBits[lengthCode])
   499  			if extraLengthBits > 0 {
   500  				extraLength := int32(length - lengthBase[lengthCode])
   501  				w.writeBits(extraLength, extraLengthBits)
   502  			}
   503  			// Write the offset
   504  			offset := t.offset()
   505  			offsetCode := offsetCode(offset)
   506  			w.writeCode(offsetEncoding, offsetCode)
   507  			extraOffsetBits := int32(offsetExtraBits[offsetCode])
   508  			if extraOffsetBits > 0 {
   509  				extraOffset := int32(offset - offsetBase[offsetCode])
   510  				w.writeBits(extraOffset, extraOffsetBits)
   511  			}
   512  			break
   513  		default:
   514  			panic("unknown token type: " + string(t))
   515  		}
   516  	}
   517  }