github.com/xushiwei/go@v0.0.0-20130601165731-2b9d83f45bc9/src/pkg/compress/flate/huffman_bit_writer.go

github.com/xushiwei/go@v0.0.0-20130601165731-2b9d83f45bc9/src/pkg/compress/flate/huffman_bit_writer.go (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package flate
     6  
     7  import (
     8  	"io"
     9  	"math"
    10  )
    11  
    12  const (
    13  	// The largest offset code.
    14  	offsetCodeCount = 30
    15  
    16  	// The special code used to mark the end of a block.
    17  	endBlockMarker = 256
    18  
    19  	// The first length code.
    20  	lengthCodesStart = 257
    21  
    22  	// The number of codegen codes.
    23  	codegenCodeCount = 19
    24  	badCode          = 255
    25  )
    26  
    27  // The number of extra bits needed by length code X - LENGTH_CODES_START.
    28  var lengthExtraBits = []int8{
    29  	/* 257 */ 0, 0, 0,
    30  	/* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
    31  	/* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
    32  	/* 280 */ 4, 5, 5, 5, 5, 0,
    33  }
    34  
    35  // The length indicated by length code X - LENGTH_CODES_START.
    36  var lengthBase = []uint32{
    37  	0, 1, 2, 3, 4, 5, 6, 7, 8, 10,
    38  	12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
    39  	64, 80, 96, 112, 128, 160, 192, 224, 255,
    40  }
    41  
    42  // offset code word extra bits.
    43  var offsetExtraBits = []int8{
    44  	0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
    45  	4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
    46  	9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
    47  	/* extended window */
    48  	14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
    49  }
    50  
    51  var offsetBase = []uint32{
    52  	/* normal deflate */
    53  	0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
    54  	0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
    55  	0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
    56  	0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
    57  	0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
    58  	0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
    59  
    60  	/* extended window */
    61  	0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
    62  	0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
    63  	0x100000, 0x180000, 0x200000, 0x300000,
    64  }
    65  
    66  // The odd order in which the codegen code sizes are written.
    67  var codegenOrder = []uint32{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
    68  
    69  type huffmanBitWriter struct {
    70  	w io.Writer
    71  	// Data waiting to be written is bytes[0:nbytes]
    72  	// and then the low nbits of bits.
    73  	bits            uint32
    74  	nbits           uint32
    75  	bytes           [64]byte
    76  	nbytes          int
    77  	literalFreq     []int32
    78  	offsetFreq      []int32
    79  	codegen         []uint8
    80  	codegenFreq     []int32
    81  	literalEncoding *huffmanEncoder
    82  	offsetEncoding  *huffmanEncoder
    83  	codegenEncoding *huffmanEncoder
    84  	err             error
    85  }
    86  
    87  func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
    88  	return &huffmanBitWriter{
    89  		w:               w,
    90  		literalFreq:     make([]int32, maxLit),
    91  		offsetFreq:      make([]int32, offsetCodeCount),
    92  		codegen:         make([]uint8, maxLit+offsetCodeCount+1),
    93  		codegenFreq:     make([]int32, codegenCodeCount),
    94  		literalEncoding: newHuffmanEncoder(maxLit),
    95  		offsetEncoding:  newHuffmanEncoder(offsetCodeCount),
    96  		codegenEncoding: newHuffmanEncoder(codegenCodeCount),
    97  	}
    98  }
    99  
   100  func (w *huffmanBitWriter) flushBits() {
   101  	if w.err != nil {
   102  		w.nbits = 0
   103  		return
   104  	}
   105  	bits := w.bits
   106  	w.bits >>= 16
   107  	w.nbits -= 16
   108  	n := w.nbytes
   109  	w.bytes[n] = byte(bits)
   110  	w.bytes[n+1] = byte(bits >> 8)
   111  	if n += 2; n >= len(w.bytes) {
   112  		_, w.err = w.w.Write(w.bytes[0:])
   113  		n = 0
   114  	}
   115  	w.nbytes = n
   116  }
   117  
   118  func (w *huffmanBitWriter) flush() {
   119  	if w.err != nil {
   120  		w.nbits = 0
   121  		return
   122  	}
   123  	n := w.nbytes
   124  	if w.nbits > 8 {
   125  		w.bytes[n] = byte(w.bits)
   126  		w.bits >>= 8
   127  		w.nbits -= 8
   128  		n++
   129  	}
   130  	if w.nbits > 0 {
   131  		w.bytes[n] = byte(w.bits)
   132  		w.nbits = 0
   133  		n++
   134  	}
   135  	w.bits = 0
   136  	_, w.err = w.w.Write(w.bytes[0:n])
   137  	w.nbytes = 0
   138  }
   139  
   140  func (w *huffmanBitWriter) writeBits(b, nb int32) {
   141  	w.bits |= uint32(b) << w.nbits
   142  	if w.nbits += uint32(nb); w.nbits >= 16 {
   143  		w.flushBits()
   144  	}
   145  }
   146  
   147  func (w *huffmanBitWriter) writeBytes(bytes []byte) {
   148  	if w.err != nil {
   149  		return
   150  	}
   151  	n := w.nbytes
   152  	if w.nbits == 8 {
   153  		w.bytes[n] = byte(w.bits)
   154  		w.nbits = 0
   155  		n++
   156  	}
   157  	if w.nbits != 0 {
   158  		w.err = InternalError("writeBytes with unfinished bits")
   159  		return
   160  	}
   161  	if n != 0 {
   162  		_, w.err = w.w.Write(w.bytes[0:n])
   163  		if w.err != nil {
   164  			return
   165  		}
   166  	}
   167  	w.nbytes = 0
   168  	_, w.err = w.w.Write(bytes)
   169  }
   170  
   171  // RFC 1951 3.2.7 specifies a special run-length encoding for specifying
   172  // the literal and offset lengths arrays (which are concatenated into a single
   173  // array).  This method generates that run-length encoding.
   174  //
   175  // The result is written into the codegen array, and the frequencies
   176  // of each code is written into the codegenFreq array.
   177  // Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
   178  // information.  Code badCode is an end marker
   179  //
   180  //  numLiterals      The number of literals in literalEncoding
   181  //  numOffsets       The number of offsets in offsetEncoding
   182  func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int) {
   183  	for i := range w.codegenFreq {
   184  		w.codegenFreq[i] = 0
   185  	}
   186  	// Note that we are using codegen both as a temporary variable for holding
   187  	// a copy of the frequencies, and as the place where we put the result.
   188  	// This is fine because the output is always shorter than the input used
   189  	// so far.
   190  	codegen := w.codegen // cache
   191  	// Copy the concatenated code sizes to codegen.  Put a marker at the end.
   192  	copy(codegen[0:numLiterals], w.literalEncoding.codeBits)
   193  	copy(codegen[numLiterals:numLiterals+numOffsets], w.offsetEncoding.codeBits)
   194  	codegen[numLiterals+numOffsets] = badCode
   195  
   196  	size := codegen[0]
   197  	count := 1
   198  	outIndex := 0
   199  	for inIndex := 1; size != badCode; inIndex++ {
   200  		// INVARIANT: We have seen "count" copies of size that have not yet
   201  		// had output generated for them.
   202  		nextSize := codegen[inIndex]
   203  		if nextSize == size {
   204  			count++
   205  			continue
   206  		}
   207  		// We need to generate codegen indicating "count" of size.
   208  		if size != 0 {
   209  			codegen[outIndex] = size
   210  			outIndex++
   211  			w.codegenFreq[size]++
   212  			count--
   213  			for count >= 3 {
   214  				n := 6
   215  				if n > count {
   216  					n = count
   217  				}
   218  				codegen[outIndex] = 16
   219  				outIndex++
   220  				codegen[outIndex] = uint8(n - 3)
   221  				outIndex++
   222  				w.codegenFreq[16]++
   223  				count -= n
   224  			}
   225  		} else {
   226  			for count >= 11 {
   227  				n := 138
   228  				if n > count {
   229  					n = count
   230  				}
   231  				codegen[outIndex] = 18
   232  				outIndex++
   233  				codegen[outIndex] = uint8(n - 11)
   234  				outIndex++
   235  				w.codegenFreq[18]++
   236  				count -= n
   237  			}
   238  			if count >= 3 {
   239  				// count >= 3 && count <= 10
   240  				codegen[outIndex] = 17
   241  				outIndex++
   242  				codegen[outIndex] = uint8(count - 3)
   243  				outIndex++
   244  				w.codegenFreq[17]++
   245  				count = 0
   246  			}
   247  		}
   248  		count--
   249  		for ; count >= 0; count-- {
   250  			codegen[outIndex] = size
   251  			outIndex++
   252  			w.codegenFreq[size]++
   253  		}
   254  		// Set up invariant for next time through the loop.
   255  		size = nextSize
   256  		count = 1
   257  	}
   258  	// Marker indicating the end of the codegen.
   259  	codegen[outIndex] = badCode
   260  }
   261  
   262  func (w *huffmanBitWriter) writeCode(code *huffmanEncoder, literal uint32) {
   263  	if w.err != nil {
   264  		return
   265  	}
   266  	w.writeBits(int32(code.code[literal]), int32(code.codeBits[literal]))
   267  }
   268  
   269  // Write the header of a dynamic Huffman block to the output stream.
   270  //
   271  //  numLiterals  The number of literals specified in codegen
   272  //  numOffsets   The number of offsets specified in codegen
   273  //  numCodegens  The number of codegens used in codegen
   274  func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) {
   275  	if w.err != nil {
   276  		return
   277  	}
   278  	var firstBits int32 = 4
   279  	if isEof {
   280  		firstBits = 5
   281  	}
   282  	w.writeBits(firstBits, 3)
   283  	w.writeBits(int32(numLiterals-257), 5)
   284  	w.writeBits(int32(numOffsets-1), 5)
   285  	w.writeBits(int32(numCodegens-4), 4)
   286  
   287  	for i := 0; i < numCodegens; i++ {
   288  		value := w.codegenEncoding.codeBits[codegenOrder[i]]
   289  		w.writeBits(int32(value), 3)
   290  	}
   291  
   292  	i := 0
   293  	for {
   294  		var codeWord int = int(w.codegen[i])
   295  		i++
   296  		if codeWord == badCode {
   297  			break
   298  		}
   299  		// The low byte contains the actual code to generate.
   300  		w.writeCode(w.codegenEncoding, uint32(codeWord))
   301  
   302  		switch codeWord {
   303  		case 16:
   304  			w.writeBits(int32(w.codegen[i]), 2)
   305  			i++
   306  			break
   307  		case 17:
   308  			w.writeBits(int32(w.codegen[i]), 3)
   309  			i++
   310  			break
   311  		case 18:
   312  			w.writeBits(int32(w.codegen[i]), 7)
   313  			i++
   314  			break
   315  		}
   316  	}
   317  }
   318  
   319  func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) {
   320  	if w.err != nil {
   321  		return
   322  	}
   323  	var flag int32
   324  	if isEof {
   325  		flag = 1
   326  	}
   327  	w.writeBits(flag, 3)
   328  	w.flush()
   329  	w.writeBits(int32(length), 16)
   330  	w.writeBits(int32(^uint16(length)), 16)
   331  }
   332  
   333  func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
   334  	if w.err != nil {
   335  		return
   336  	}
   337  	// Indicate that we are a fixed Huffman block
   338  	var value int32 = 2
   339  	if isEof {
   340  		value = 3
   341  	}
   342  	w.writeBits(value, 3)
   343  }
   344  
   345  func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
   346  	if w.err != nil {
   347  		return
   348  	}
   349  	for i := range w.literalFreq {
   350  		w.literalFreq[i] = 0
   351  	}
   352  	for i := range w.offsetFreq {
   353  		w.offsetFreq[i] = 0
   354  	}
   355  
   356  	n := len(tokens)
   357  	tokens = tokens[0 : n+1]
   358  	tokens[n] = endBlockMarker
   359  
   360  	for _, t := range tokens {
   361  		switch t.typ() {
   362  		case literalType:
   363  			w.literalFreq[t.literal()]++
   364  		case matchType:
   365  			length := t.length()
   366  			offset := t.offset()
   367  			w.literalFreq[lengthCodesStart+lengthCode(length)]++
   368  			w.offsetFreq[offsetCode(offset)]++
   369  		}
   370  	}
   371  
   372  	// get the number of literals
   373  	numLiterals := len(w.literalFreq)
   374  	for w.literalFreq[numLiterals-1] == 0 {
   375  		numLiterals--
   376  	}
   377  	// get the number of offsets
   378  	numOffsets := len(w.offsetFreq)
   379  	for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 {
   380  		numOffsets--
   381  	}
   382  	if numOffsets == 0 {
   383  		// We haven't found a single match. If we want to go with the dynamic encoding,
   384  		// we should count at least one offset to be sure that the offset huffman tree could be encoded.
   385  		w.offsetFreq[0] = 1
   386  		numOffsets = 1
   387  	}
   388  
   389  	w.literalEncoding.generate(w.literalFreq, 15)
   390  	w.offsetEncoding.generate(w.offsetFreq, 15)
   391  
   392  	storedBytes := 0
   393  	if input != nil {
   394  		storedBytes = len(input)
   395  	}
   396  	var extraBits int64
   397  	var storedSize int64 = math.MaxInt64
   398  	if storedBytes <= maxStoreBlockSize && input != nil {
   399  		storedSize = int64((storedBytes + 5) * 8)
   400  		// We only bother calculating the costs of the extra bits required by
   401  		// the length of offset fields (which will be the same for both fixed
   402  		// and dynamic encoding), if we need to compare those two encodings
   403  		// against stored encoding.
   404  		for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ {
   405  			// First eight length codes have extra size = 0.
   406  			extraBits += int64(w.literalFreq[lengthCode]) * int64(lengthExtraBits[lengthCode-lengthCodesStart])
   407  		}
   408  		for offsetCode := 4; offsetCode < numOffsets; offsetCode++ {
   409  			// First four offset codes have extra size = 0.
   410  			extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode])
   411  		}
   412  	}
   413  
   414  	// Figure out smallest code.
   415  	// Fixed Huffman baseline.
   416  	var size = int64(3) +
   417  		fixedLiteralEncoding.bitLength(w.literalFreq) +
   418  		fixedOffsetEncoding.bitLength(w.offsetFreq) +
   419  		extraBits
   420  	var literalEncoding = fixedLiteralEncoding
   421  	var offsetEncoding = fixedOffsetEncoding
   422  
   423  	// Dynamic Huffman?
   424  	var numCodegens int
   425  
   426  	// Generate codegen and codegenFrequencies, which indicates how to encode
   427  	// the literalEncoding and the offsetEncoding.
   428  	w.generateCodegen(numLiterals, numOffsets)
   429  	w.codegenEncoding.generate(w.codegenFreq, 7)
   430  	numCodegens = len(w.codegenFreq)
   431  	for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
   432  		numCodegens--
   433  	}
   434  	dynamicHeader := int64(3+5+5+4+(3*numCodegens)) +
   435  		w.codegenEncoding.bitLength(w.codegenFreq) +
   436  		int64(extraBits) +
   437  		int64(w.codegenFreq[16]*2) +
   438  		int64(w.codegenFreq[17]*3) +
   439  		int64(w.codegenFreq[18]*7)
   440  	dynamicSize := dynamicHeader +
   441  		w.literalEncoding.bitLength(w.literalFreq) +
   442  		w.offsetEncoding.bitLength(w.offsetFreq)
   443  
   444  	if dynamicSize < size {
   445  		size = dynamicSize
   446  		literalEncoding = w.literalEncoding
   447  		offsetEncoding = w.offsetEncoding
   448  	}
   449  
   450  	// Stored bytes?
   451  	if storedSize < size {
   452  		w.writeStoredHeader(storedBytes, eof)
   453  		w.writeBytes(input[0:storedBytes])
   454  		return
   455  	}
   456  
   457  	// Huffman.
   458  	if literalEncoding == fixedLiteralEncoding {
   459  		w.writeFixedHeader(eof)
   460  	} else {
   461  		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
   462  	}
   463  	for _, t := range tokens {
   464  		switch t.typ() {
   465  		case literalType:
   466  			w.writeCode(literalEncoding, t.literal())
   467  			break
   468  		case matchType:
   469  			// Write the length
   470  			length := t.length()
   471  			lengthCode := lengthCode(length)
   472  			w.writeCode(literalEncoding, lengthCode+lengthCodesStart)
   473  			extraLengthBits := int32(lengthExtraBits[lengthCode])
   474  			if extraLengthBits > 0 {
   475  				extraLength := int32(length - lengthBase[lengthCode])
   476  				w.writeBits(extraLength, extraLengthBits)
   477  			}
   478  			// Write the offset
   479  			offset := t.offset()
   480  			offsetCode := offsetCode(offset)
   481  			w.writeCode(offsetEncoding, offsetCode)
   482  			extraOffsetBits := int32(offsetExtraBits[offsetCode])
   483  			if extraOffsetBits > 0 {
   484  				extraOffset := int32(offset - offsetBase[offsetCode])
   485  				w.writeBits(extraOffset, extraOffsetBits)
   486  			}
   487  			break
   488  		default:
   489  			panic("unknown token type: " + string(t))
   490  		}
   491  	}
   492  }