github.com/xushiwei/go@v0.0.0-20130601165731-2b9d83f45bc9/src/pkg/compress/bzip2/huffman.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package bzip2
     6  
     7  import "sort"
     8  
// A huffmanTree is a binary tree which is navigated, bit-by-bit, to reach a
// symbol.
type huffmanTree struct {
	// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
	// root of the tree and nextNode contains the index of the next element
	// of nodes to use when the tree is being constructed.
	nodes    []huffmanNode
	nextNode int
}
    18  
// A huffmanNode is a node in the tree. left and right contain indexes into the
// nodes slice of the tree. If left or right is invalidNodeValue then the child
// is a leaf node and its value is in leftValue/rightValue.
//
// The symbols are uint16s because bzip2 encodes not only MTF indexes in the
// tree, but also two magic values for run-length encoding and an EOF symbol.
// Thus there are more than 256 possible symbols.
type huffmanNode struct {
	// Child indexes into huffmanTree.nodes, or invalidNodeValue for a leaf.
	left, right           uint16
	// Symbols returned when the corresponding child is a leaf.
	leftValue, rightValue uint16
}
    30  
// invalidNodeValue is an invalid index which marks a leaf node in the tree.
// It is stored in huffmanNode.left or huffmanNode.right in place of a child
// index.
const invalidNodeValue = 0xffff
    33  
    34  // Decode reads bits from the given bitReader and navigates the tree until a
    35  // symbol is found.
    36  func (t huffmanTree) Decode(br *bitReader) (v uint16) {
    37  	nodeIndex := uint16(0) // node 0 is the root of the tree.
    38  
    39  	for {
    40  		node := &t.nodes[nodeIndex]
    41  		bit := br.ReadBit()
    42  		// bzip2 encodes left as a true bit.
    43  		if bit {
    44  			// left
    45  			if node.left == invalidNodeValue {
    46  				return node.leftValue
    47  			}
    48  			nodeIndex = node.left
    49  		} else {
    50  			// right
    51  			if node.right == invalidNodeValue {
    52  				return node.rightValue
    53  			}
    54  			nodeIndex = node.right
    55  		}
    56  	}
    57  }
    58  
    59  // newHuffmanTree builds a Huffman tree from a slice containing the code
    60  // lengths of each symbol. The maximum code length is 32 bits.
    61  func newHuffmanTree(lengths []uint8) (huffmanTree, error) {
    62  	// There are many possible trees that assign the same code length to
    63  	// each symbol (consider reflecting a tree down the middle, for
    64  	// example). Since the code length assignments determine the
    65  	// efficiency of the tree, each of these trees is equally good. In
    66  	// order to minimize the amount of information needed to build a tree
    67  	// bzip2 uses a canonical tree so that it can be reconstructed given
    68  	// only the code length assignments.
    69  
    70  	if len(lengths) < 2 {
    71  		panic("newHuffmanTree: too few symbols")
    72  	}
    73  
    74  	var t huffmanTree
    75  
    76  	// First we sort the code length assignments by ascending code length,
    77  	// using the symbol value to break ties.
    78  	pairs := huffmanSymbolLengthPairs(make([]huffmanSymbolLengthPair, len(lengths)))
    79  	for i, length := range lengths {
    80  		pairs[i].value = uint16(i)
    81  		pairs[i].length = length
    82  	}
    83  
    84  	sort.Sort(pairs)
    85  
    86  	// Now we assign codes to the symbols, starting with the longest code.
    87  	// We keep the codes packed into a uint32, at the most-significant end.
    88  	// So branches are taken from the MSB downwards. This makes it easy to
    89  	// sort them later.
    90  	code := uint32(0)
    91  	length := uint8(32)
    92  
    93  	codes := huffmanCodes(make([]huffmanCode, len(lengths)))
    94  	for i := len(pairs) - 1; i >= 0; i-- {
    95  		if length > pairs[i].length {
    96  			// If the code length decreases we shift in order to
    97  			// zero any bits beyond the end of the code.
    98  			length >>= 32 - pairs[i].length
    99  			length <<= 32 - pairs[i].length
   100  			length = pairs[i].length
   101  		}
   102  		codes[i].code = code
   103  		codes[i].codeLen = length
   104  		codes[i].value = pairs[i].value
   105  		// We need to 'increment' the code, which means treating |code|
   106  		// like a |length| bit number.
   107  		code += 1 << (32 - length)
   108  	}
   109  
   110  	// Now we can sort by the code so that the left half of each branch are
   111  	// grouped together, recursively.
   112  	sort.Sort(codes)
   113  
   114  	t.nodes = make([]huffmanNode, len(codes))
   115  	_, err := buildHuffmanNode(&t, codes, 0)
   116  	return t, err
   117  }
   118  
   119  // huffmanSymbolLengthPair contains a symbol and its code length.
   120  type huffmanSymbolLengthPair struct {
   121  	value  uint16
   122  	length uint8
   123  }
   124  
   125  // huffmanSymbolLengthPair is used to provide an interface for sorting.
   126  type huffmanSymbolLengthPairs []huffmanSymbolLengthPair
   127  
   128  func (h huffmanSymbolLengthPairs) Len() int {
   129  	return len(h)
   130  }
   131  
   132  func (h huffmanSymbolLengthPairs) Less(i, j int) bool {
   133  	if h[i].length < h[j].length {
   134  		return true
   135  	}
   136  	if h[i].length > h[j].length {
   137  		return false
   138  	}
   139  	if h[i].value < h[j].value {
   140  		return true
   141  	}
   142  	return false
   143  }
   144  
   145  func (h huffmanSymbolLengthPairs) Swap(i, j int) {
   146  	h[i], h[j] = h[j], h[i]
   147  }
   148  
   149  // huffmanCode contains a symbol, its code and code length.
   150  type huffmanCode struct {
   151  	code    uint32
   152  	codeLen uint8
   153  	value   uint16
   154  }
   155  
   156  // huffmanCodes is used to provide an interface for sorting.
   157  type huffmanCodes []huffmanCode
   158  
   159  func (n huffmanCodes) Len() int {
   160  	return len(n)
   161  }
   162  
   163  func (n huffmanCodes) Less(i, j int) bool {
   164  	return n[i].code < n[j].code
   165  }
   166  
   167  func (n huffmanCodes) Swap(i, j int) {
   168  	n[i], n[j] = n[j], n[i]
   169  }
   170  
   171  // buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
   172  // the Huffman tree at the given level. It returns the index of the newly
   173  // constructed node.
   174  func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) {
   175  	test := uint32(1) << (31 - level)
   176  
   177  	// We have to search the list of codes to find the divide between the left and right sides.
   178  	firstRightIndex := len(codes)
   179  	for i, code := range codes {
   180  		if code.code&test != 0 {
   181  			firstRightIndex = i
   182  			break
   183  		}
   184  	}
   185  
   186  	left := codes[:firstRightIndex]
   187  	right := codes[firstRightIndex:]
   188  
   189  	if len(left) == 0 || len(right) == 0 {
   190  		return 0, StructuralError("superfluous level in Huffman tree")
   191  	}
   192  
   193  	nodeIndex = uint16(t.nextNode)
   194  	node := &t.nodes[t.nextNode]
   195  	t.nextNode++
   196  
   197  	if len(left) == 1 {
   198  		// leaf node
   199  		node.left = invalidNodeValue
   200  		node.leftValue = left[0].value
   201  	} else {
   202  		node.left, err = buildHuffmanNode(t, left, level+1)
   203  	}
   204  
   205  	if err != nil {
   206  		return
   207  	}
   208  
   209  	if len(right) == 1 {
   210  		// leaf node
   211  		node.right = invalidNodeValue
   212  		node.rightValue = right[0].value
   213  	} else {
   214  		node.right, err = buildHuffmanNode(t, right, level+1)
   215  	}
   216  
   217  	return
   218  }