github.com/corona10/go@v0.0.0-20180224231303-7a218942be57/src/compress/bzip2/huffman.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package bzip2
     6  
     7  import "sort"
     8  
     9  // A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a
    10  // symbol.
    11  type huffmanTree struct {
    12  	// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
    13  	// root of the tree and nextNode contains the index of the next element
    14  	// of nodes to use when the tree is being constructed.
    15  	nodes    []huffmanNode
    16  	nextNode int
    17  }
    18  
    19  // A huffmanNode is a node in the tree. left and right contain indexes into the
    20  // nodes slice of the tree. If left or right is invalidNodeValue then the child
    21  // is a left node and its value is in leftValue/rightValue.
    22  //
    23  // The symbols are uint16s because bzip2 encodes not only MTF indexes in the
    24  // tree, but also two magic values for run-length encoding and an EOF symbol.
    25  // Thus there are more than 256 possible symbols.
    26  type huffmanNode struct {
    27  	left, right           uint16
    28  	leftValue, rightValue uint16
    29  }
    30  
    31  // invalidNodeValue is an invalid index which marks a leaf node in the tree.
    32  const invalidNodeValue = 0xffff
    33  
    34  // Decode reads bits from the given bitReader and navigates the tree until a
    35  // symbol is found.
    36  func (t *huffmanTree) Decode(br *bitReader) (v uint16) {
    37  	nodeIndex := uint16(0) // node 0 is the root of the tree.
    38  
    39  	for {
    40  		node := &t.nodes[nodeIndex]
    41  
    42  		var bit uint16
    43  		if br.bits > 0 {
    44  			// Get next bit - fast path.
    45  			br.bits--
    46  			bit = 0 - (uint16(br.n>>br.bits) & 1)
    47  		} else {
    48  			// Get next bit - slow path.
    49  			// Use ReadBits to retrieve a single bit
    50  			// from the underling io.ByteReader.
    51  			bit = 0 - uint16(br.ReadBits(1))
    52  		}
    53  		// now
    54  		// bit = 0xffff if the next bit was 1
    55  		// bit = 0x0000 if the next bit was 0
    56  
    57  		// 1 means left, 0 means right.
    58  		//
    59  		// if bit == 0xffff {
    60  		//     nodeIndex = node.left
    61  		// } else {
    62  		//     nodeIndex = node.right
    63  		// }
    64  		nodeIndex = (bit & node.left) | (^bit & node.right)
    65  
    66  		if nodeIndex == invalidNodeValue {
    67  			// We found a leaf. Use the value of bit to decide
    68  			// whether is a left or a right value.
    69  			return (bit & node.leftValue) | (^bit & node.rightValue)
    70  		}
    71  	}
    72  }
    73  
    74  // newHuffmanTree builds a Huffman tree from a slice containing the code
    75  // lengths of each symbol. The maximum code length is 32 bits.
    76  func newHuffmanTree(lengths []uint8) (huffmanTree, error) {
    77  	// There are many possible trees that assign the same code length to
    78  	// each symbol (consider reflecting a tree down the middle, for
    79  	// example). Since the code length assignments determine the
    80  	// efficiency of the tree, each of these trees is equally good. In
    81  	// order to minimize the amount of information needed to build a tree
    82  	// bzip2 uses a canonical tree so that it can be reconstructed given
    83  	// only the code length assignments.
    84  
    85  	if len(lengths) < 2 {
    86  		panic("newHuffmanTree: too few symbols")
    87  	}
    88  
    89  	var t huffmanTree
    90  
    91  	// First we sort the code length assignments by ascending code length,
    92  	// using the symbol value to break ties.
    93  	pairs := make([]huffmanSymbolLengthPair, len(lengths))
    94  	for i, length := range lengths {
    95  		pairs[i].value = uint16(i)
    96  		pairs[i].length = length
    97  	}
    98  
    99  	sort.Slice(pairs, func(i, j int) bool {
   100  		if pairs[i].length < pairs[j].length {
   101  			return true
   102  		}
   103  		if pairs[i].length > pairs[j].length {
   104  			return false
   105  		}
   106  		if pairs[i].value < pairs[j].value {
   107  			return true
   108  		}
   109  		return false
   110  	})
   111  
   112  	// Now we assign codes to the symbols, starting with the longest code.
   113  	// We keep the codes packed into a uint32, at the most-significant end.
   114  	// So branches are taken from the MSB downwards. This makes it easy to
   115  	// sort them later.
   116  	code := uint32(0)
   117  	length := uint8(32)
   118  
   119  	codes := make([]huffmanCode, len(lengths))
   120  	for i := len(pairs) - 1; i >= 0; i-- {
   121  		if length > pairs[i].length {
   122  			length = pairs[i].length
   123  		}
   124  		codes[i].code = code
   125  		codes[i].codeLen = length
   126  		codes[i].value = pairs[i].value
   127  		// We need to 'increment' the code, which means treating |code|
   128  		// like a |length| bit number.
   129  		code += 1 << (32 - length)
   130  	}
   131  
   132  	// Now we can sort by the code so that the left half of each branch are
   133  	// grouped together, recursively.
   134  	sort.Slice(codes, func(i, j int) bool {
   135  		return codes[i].code < codes[j].code
   136  	})
   137  
   138  	t.nodes = make([]huffmanNode, len(codes))
   139  	_, err := buildHuffmanNode(&t, codes, 0)
   140  	return t, err
   141  }
   142  
   143  // huffmanSymbolLengthPair contains a symbol and its code length.
   144  type huffmanSymbolLengthPair struct {
   145  	value  uint16
   146  	length uint8
   147  }
   148  
   149  // huffmanCode contains a symbol, its code and code length.
   150  type huffmanCode struct {
   151  	code    uint32
   152  	codeLen uint8
   153  	value   uint16
   154  }
   155  
   156  // buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
   157  // the Huffman tree at the given level. It returns the index of the newly
   158  // constructed node.
   159  func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) {
   160  	test := uint32(1) << (31 - level)
   161  
   162  	// We have to search the list of codes to find the divide between the left and right sides.
   163  	firstRightIndex := len(codes)
   164  	for i, code := range codes {
   165  		if code.code&test != 0 {
   166  			firstRightIndex = i
   167  			break
   168  		}
   169  	}
   170  
   171  	left := codes[:firstRightIndex]
   172  	right := codes[firstRightIndex:]
   173  
   174  	if len(left) == 0 || len(right) == 0 {
   175  		// There is a superfluous level in the Huffman tree indicating
   176  		// a bug in the encoder. However, this bug has been observed in
   177  		// the wild so we handle it.
   178  
   179  		// If this function was called recursively then we know that
   180  		// len(codes) >= 2 because, otherwise, we would have hit the
   181  		// "leaf node" case, below, and not recursed.
   182  		//
   183  		// However, for the initial call it's possible that len(codes)
   184  		// is zero or one. Both cases are invalid because a zero length
   185  		// tree cannot encode anything and a length-1 tree can only
   186  		// encode EOF and so is superfluous. We reject both.
   187  		if len(codes) < 2 {
   188  			return 0, StructuralError("empty Huffman tree")
   189  		}
   190  
   191  		// In this case the recursion doesn't always reduce the length
   192  		// of codes so we need to ensure termination via another
   193  		// mechanism.
   194  		if level == 31 {
   195  			// Since len(codes) >= 2 the only way that the values
   196  			// can match at all 32 bits is if they are equal, which
   197  			// is invalid. This ensures that we never enter
   198  			// infinite recursion.
   199  			return 0, StructuralError("equal symbols in Huffman tree")
   200  		}
   201  
   202  		if len(left) == 0 {
   203  			return buildHuffmanNode(t, right, level+1)
   204  		}
   205  		return buildHuffmanNode(t, left, level+1)
   206  	}
   207  
   208  	nodeIndex = uint16(t.nextNode)
   209  	node := &t.nodes[t.nextNode]
   210  	t.nextNode++
   211  
   212  	if len(left) == 1 {
   213  		// leaf node
   214  		node.left = invalidNodeValue
   215  		node.leftValue = left[0].value
   216  	} else {
   217  		node.left, err = buildHuffmanNode(t, left, level+1)
   218  	}
   219  
   220  	if err != nil {
   221  		return
   222  	}
   223  
   224  	if len(right) == 1 {
   225  		// leaf node
   226  		node.right = invalidNodeValue
   227  		node.rightValue = right[0].value
   228  	} else {
   229  		node.right, err = buildHuffmanNode(t, right, level+1)
   230  	}
   231  
   232  	return
   233  }