github.com/zach-klippenstein/go@v0.0.0-20150108044943-fcfbeb3adf58/src/compress/bzip2/huffman.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package bzip2
     6  
     7  import "sort"
     8  
     9  // A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a
    10  // symbol.
    11  type huffmanTree struct {
    12  	// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
    13  	// root of the tree and nextNode contains the index of the next element
    14  	// of nodes to use when the tree is being constructed.
    15  	nodes    []huffmanNode
    16  	nextNode int
    17  }
    18  
    19  // A huffmanNode is a node in the tree. left and right contain indexes into the
    20  // nodes slice of the tree. If left or right is invalidNodeValue then the child
    21  // is a left node and its value is in leftValue/rightValue.
    22  //
    23  // The symbols are uint16s because bzip2 encodes not only MTF indexes in the
    24  // tree, but also two magic values for run-length encoding and an EOF symbol.
    25  // Thus there are more than 256 possible symbols.
    26  type huffmanNode struct {
    27  	left, right           uint16
    28  	leftValue, rightValue uint16
    29  }
    30  
    31  // invalidNodeValue is an invalid index which marks a leaf node in the tree.
    32  const invalidNodeValue = 0xffff
    33  
    34  // Decode reads bits from the given bitReader and navigates the tree until a
    35  // symbol is found.
    36  func (t *huffmanTree) Decode(br *bitReader) (v uint16) {
    37  	nodeIndex := uint16(0) // node 0 is the root of the tree.
    38  
    39  	for {
    40  		node := &t.nodes[nodeIndex]
    41  		bit, ok := br.TryReadBit()
    42  		if !ok && br.ReadBit() {
    43  			bit = 1
    44  		}
    45  		// bzip2 encodes left as a true bit.
    46  		if bit != 0 {
    47  			// left
    48  			if node.left == invalidNodeValue {
    49  				return node.leftValue
    50  			}
    51  			nodeIndex = node.left
    52  		} else {
    53  			// right
    54  			if node.right == invalidNodeValue {
    55  				return node.rightValue
    56  			}
    57  			nodeIndex = node.right
    58  		}
    59  	}
    60  }
    61  
    62  // newHuffmanTree builds a Huffman tree from a slice containing the code
    63  // lengths of each symbol. The maximum code length is 32 bits.
    64  func newHuffmanTree(lengths []uint8) (huffmanTree, error) {
    65  	// There are many possible trees that assign the same code length to
    66  	// each symbol (consider reflecting a tree down the middle, for
    67  	// example). Since the code length assignments determine the
    68  	// efficiency of the tree, each of these trees is equally good. In
    69  	// order to minimize the amount of information needed to build a tree
    70  	// bzip2 uses a canonical tree so that it can be reconstructed given
    71  	// only the code length assignments.
    72  
    73  	if len(lengths) < 2 {
    74  		panic("newHuffmanTree: too few symbols")
    75  	}
    76  
    77  	var t huffmanTree
    78  
    79  	// First we sort the code length assignments by ascending code length,
    80  	// using the symbol value to break ties.
    81  	pairs := huffmanSymbolLengthPairs(make([]huffmanSymbolLengthPair, len(lengths)))
    82  	for i, length := range lengths {
    83  		pairs[i].value = uint16(i)
    84  		pairs[i].length = length
    85  	}
    86  
    87  	sort.Sort(pairs)
    88  
    89  	// Now we assign codes to the symbols, starting with the longest code.
    90  	// We keep the codes packed into a uint32, at the most-significant end.
    91  	// So branches are taken from the MSB downwards. This makes it easy to
    92  	// sort them later.
    93  	code := uint32(0)
    94  	length := uint8(32)
    95  
    96  	codes := huffmanCodes(make([]huffmanCode, len(lengths)))
    97  	for i := len(pairs) - 1; i >= 0; i-- {
    98  		if length > pairs[i].length {
    99  			// If the code length decreases we shift in order to
   100  			// zero any bits beyond the end of the code.
   101  			length >>= 32 - pairs[i].length
   102  			length <<= 32 - pairs[i].length
   103  			length = pairs[i].length
   104  		}
   105  		codes[i].code = code
   106  		codes[i].codeLen = length
   107  		codes[i].value = pairs[i].value
   108  		// We need to 'increment' the code, which means treating |code|
   109  		// like a |length| bit number.
   110  		code += 1 << (32 - length)
   111  	}
   112  
   113  	// Now we can sort by the code so that the left half of each branch are
   114  	// grouped together, recursively.
   115  	sort.Sort(codes)
   116  
   117  	t.nodes = make([]huffmanNode, len(codes))
   118  	_, err := buildHuffmanNode(&t, codes, 0)
   119  	return t, err
   120  }
   121  
   122  // huffmanSymbolLengthPair contains a symbol and its code length.
   123  type huffmanSymbolLengthPair struct {
   124  	value  uint16
   125  	length uint8
   126  }
   127  
   128  // huffmanSymbolLengthPair is used to provide an interface for sorting.
   129  type huffmanSymbolLengthPairs []huffmanSymbolLengthPair
   130  
   131  func (h huffmanSymbolLengthPairs) Len() int {
   132  	return len(h)
   133  }
   134  
   135  func (h huffmanSymbolLengthPairs) Less(i, j int) bool {
   136  	if h[i].length < h[j].length {
   137  		return true
   138  	}
   139  	if h[i].length > h[j].length {
   140  		return false
   141  	}
   142  	if h[i].value < h[j].value {
   143  		return true
   144  	}
   145  	return false
   146  }
   147  
   148  func (h huffmanSymbolLengthPairs) Swap(i, j int) {
   149  	h[i], h[j] = h[j], h[i]
   150  }
   151  
   152  // huffmanCode contains a symbol, its code and code length.
   153  type huffmanCode struct {
   154  	code    uint32
   155  	codeLen uint8
   156  	value   uint16
   157  }
   158  
   159  // huffmanCodes is used to provide an interface for sorting.
   160  type huffmanCodes []huffmanCode
   161  
   162  func (n huffmanCodes) Len() int {
   163  	return len(n)
   164  }
   165  
   166  func (n huffmanCodes) Less(i, j int) bool {
   167  	return n[i].code < n[j].code
   168  }
   169  
   170  func (n huffmanCodes) Swap(i, j int) {
   171  	n[i], n[j] = n[j], n[i]
   172  }
   173  
   174  // buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
   175  // the Huffman tree at the given level. It returns the index of the newly
   176  // constructed node.
   177  func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) {
   178  	test := uint32(1) << (31 - level)
   179  
   180  	// We have to search the list of codes to find the divide between the left and right sides.
   181  	firstRightIndex := len(codes)
   182  	for i, code := range codes {
   183  		if code.code&test != 0 {
   184  			firstRightIndex = i
   185  			break
   186  		}
   187  	}
   188  
   189  	left := codes[:firstRightIndex]
   190  	right := codes[firstRightIndex:]
   191  
   192  	if len(left) == 0 || len(right) == 0 {
   193  		// There is a superfluous level in the Huffman tree indicating
   194  		// a bug in the encoder. However, this bug has been observed in
   195  		// the wild so we handle it.
   196  
   197  		// If this function was called recursively then we know that
   198  		// len(codes) >= 2 because, otherwise, we would have hit the
   199  		// "leaf node" case, below, and not recursed.
   200  		//
   201  		// However, for the initial call it's possible that len(codes)
   202  		// is zero or one. Both cases are invalid because a zero length
   203  		// tree cannot encode anything and a length-1 tree can only
   204  		// encode EOF and so is superfluous. We reject both.
   205  		if len(codes) < 2 {
   206  			return 0, StructuralError("empty Huffman tree")
   207  		}
   208  
   209  		// In this case the recursion doesn't always reduce the length
   210  		// of codes so we need to ensure termination via another
   211  		// mechanism.
   212  		if level == 31 {
   213  			// Since len(codes) >= 2 the only way that the values
   214  			// can match at all 32 bits is if they are equal, which
   215  			// is invalid. This ensures that we never enter
   216  			// infinite recursion.
   217  			return 0, StructuralError("equal symbols in Huffman tree")
   218  		}
   219  
   220  		if len(left) == 0 {
   221  			return buildHuffmanNode(t, right, level+1)
   222  		}
   223  		return buildHuffmanNode(t, left, level+1)
   224  	}
   225  
   226  	nodeIndex = uint16(t.nextNode)
   227  	node := &t.nodes[t.nextNode]
   228  	t.nextNode++
   229  
   230  	if len(left) == 1 {
   231  		// leaf node
   232  		node.left = invalidNodeValue
   233  		node.leftValue = left[0].value
   234  	} else {
   235  		node.left, err = buildHuffmanNode(t, left, level+1)
   236  	}
   237  
   238  	if err != nil {
   239  		return
   240  	}
   241  
   242  	if len(right) == 1 {
   243  		// leaf node
   244  		node.right = invalidNodeValue
   245  		node.rightValue = right[0].value
   246  	} else {
   247  		node.right, err = buildHuffmanNode(t, right, level+1)
   248  	}
   249  
   250  	return
   251  }