github.com/flyinox/gosm@v0.0.0-20171117061539-16768cb62077/src/compress/bzip2/huffman.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package bzip2 6 7 import "sort" 8 9 // A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a 10 // symbol. 11 type huffmanTree struct { 12 // nodes contains all the non-leaf nodes in the tree. nodes[0] is the 13 // root of the tree and nextNode contains the index of the next element 14 // of nodes to use when the tree is being constructed. 15 nodes []huffmanNode 16 nextNode int 17 } 18 19 // A huffmanNode is a node in the tree. left and right contain indexes into the 20 // nodes slice of the tree. If left or right is invalidNodeValue then the child 21 // is a left node and its value is in leftValue/rightValue. 22 // 23 // The symbols are uint16s because bzip2 encodes not only MTF indexes in the 24 // tree, but also two magic values for run-length encoding and an EOF symbol. 25 // Thus there are more than 256 possible symbols. 26 type huffmanNode struct { 27 left, right uint16 28 leftValue, rightValue uint16 29 } 30 31 // invalidNodeValue is an invalid index which marks a leaf node in the tree. 32 const invalidNodeValue = 0xffff 33 34 // Decode reads bits from the given bitReader and navigates the tree until a 35 // symbol is found. 36 func (t *huffmanTree) Decode(br *bitReader) (v uint16) { 37 nodeIndex := uint16(0) // node 0 is the root of the tree. 38 39 for { 40 node := &t.nodes[nodeIndex] 41 42 var bit uint16 43 if br.bits > 0 { 44 // Get next bit - fast path. 45 br.bits-- 46 bit = 0 - (uint16(br.n>>br.bits) & 1) 47 } else { 48 // Get next bit - slow path. 49 // Use ReadBits to retrieve a single bit 50 // from the underling io.ByteReader. 51 bit = 0 - uint16(br.ReadBits(1)) 52 } 53 // now 54 // bit = 0xffff if the next bit was 1 55 // bit = 0x0000 if the next bit was 0 56 57 // 1 means left, 0 means right. 58 // 59 // if bit == 0xffff { 60 // nodeIndex = node.left 61 // } else { 62 // nodeIndex = node.right 63 // } 64 nodeIndex = (bit & node.left) | (^bit & node.right) 65 66 if nodeIndex == invalidNodeValue { 67 // We found a leaf. Use the value of bit to decide 68 // whether is a left or a right value. 69 return (bit & node.leftValue) | (^bit & node.rightValue) 70 } 71 } 72 } 73 74 // newHuffmanTree builds a Huffman tree from a slice containing the code 75 // lengths of each symbol. The maximum code length is 32 bits. 76 func newHuffmanTree(lengths []uint8) (huffmanTree, error) { 77 // There are many possible trees that assign the same code length to 78 // each symbol (consider reflecting a tree down the middle, for 79 // example). Since the code length assignments determine the 80 // efficiency of the tree, each of these trees is equally good. In 81 // order to minimize the amount of information needed to build a tree 82 // bzip2 uses a canonical tree so that it can be reconstructed given 83 // only the code length assignments. 84 85 if len(lengths) < 2 { 86 panic("newHuffmanTree: too few symbols") 87 } 88 89 var t huffmanTree 90 91 // First we sort the code length assignments by ascending code length, 92 // using the symbol value to break ties. 93 pairs := huffmanSymbolLengthPairs(make([]huffmanSymbolLengthPair, len(lengths))) 94 for i, length := range lengths { 95 pairs[i].value = uint16(i) 96 pairs[i].length = length 97 } 98 99 sort.Sort(pairs) 100 101 // Now we assign codes to the symbols, starting with the longest code. 102 // We keep the codes packed into a uint32, at the most-significant end. 103 // So branches are taken from the MSB downwards. This makes it easy to 104 // sort them later. 105 code := uint32(0) 106 length := uint8(32) 107 108 codes := huffmanCodes(make([]huffmanCode, len(lengths))) 109 for i := len(pairs) - 1; i >= 0; i-- { 110 if length > pairs[i].length { 111 length = pairs[i].length 112 } 113 codes[i].code = code 114 codes[i].codeLen = length 115 codes[i].value = pairs[i].value 116 // We need to 'increment' the code, which means treating |code| 117 // like a |length| bit number. 118 code += 1 << (32 - length) 119 } 120 121 // Now we can sort by the code so that the left half of each branch are 122 // grouped together, recursively. 123 sort.Sort(codes) 124 125 t.nodes = make([]huffmanNode, len(codes)) 126 _, err := buildHuffmanNode(&t, codes, 0) 127 return t, err 128 } 129 130 // huffmanSymbolLengthPair contains a symbol and its code length. 131 type huffmanSymbolLengthPair struct { 132 value uint16 133 length uint8 134 } 135 136 // huffmanSymbolLengthPair is used to provide an interface for sorting. 137 type huffmanSymbolLengthPairs []huffmanSymbolLengthPair 138 139 func (h huffmanSymbolLengthPairs) Len() int { 140 return len(h) 141 } 142 143 func (h huffmanSymbolLengthPairs) Less(i, j int) bool { 144 if h[i].length < h[j].length { 145 return true 146 } 147 if h[i].length > h[j].length { 148 return false 149 } 150 if h[i].value < h[j].value { 151 return true 152 } 153 return false 154 } 155 156 func (h huffmanSymbolLengthPairs) Swap(i, j int) { 157 h[i], h[j] = h[j], h[i] 158 } 159 160 // huffmanCode contains a symbol, its code and code length. 161 type huffmanCode struct { 162 code uint32 163 codeLen uint8 164 value uint16 165 } 166 167 // huffmanCodes is used to provide an interface for sorting. 168 type huffmanCodes []huffmanCode 169 170 func (n huffmanCodes) Len() int { 171 return len(n) 172 } 173 174 func (n huffmanCodes) Less(i, j int) bool { 175 return n[i].code < n[j].code 176 } 177 178 func (n huffmanCodes) Swap(i, j int) { 179 n[i], n[j] = n[j], n[i] 180 } 181 182 // buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in 183 // the Huffman tree at the given level. It returns the index of the newly 184 // constructed node. 185 func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) { 186 test := uint32(1) << (31 - level) 187 188 // We have to search the list of codes to find the divide between the left and right sides. 189 firstRightIndex := len(codes) 190 for i, code := range codes { 191 if code.code&test != 0 { 192 firstRightIndex = i 193 break 194 } 195 } 196 197 left := codes[:firstRightIndex] 198 right := codes[firstRightIndex:] 199 200 if len(left) == 0 || len(right) == 0 { 201 // There is a superfluous level in the Huffman tree indicating 202 // a bug in the encoder. However, this bug has been observed in 203 // the wild so we handle it. 204 205 // If this function was called recursively then we know that 206 // len(codes) >= 2 because, otherwise, we would have hit the 207 // "leaf node" case, below, and not recursed. 208 // 209 // However, for the initial call it's possible that len(codes) 210 // is zero or one. Both cases are invalid because a zero length 211 // tree cannot encode anything and a length-1 tree can only 212 // encode EOF and so is superfluous. We reject both. 213 if len(codes) < 2 { 214 return 0, StructuralError("empty Huffman tree") 215 } 216 217 // In this case the recursion doesn't always reduce the length 218 // of codes so we need to ensure termination via another 219 // mechanism. 220 if level == 31 { 221 // Since len(codes) >= 2 the only way that the values 222 // can match at all 32 bits is if they are equal, which 223 // is invalid. This ensures that we never enter 224 // infinite recursion. 225 return 0, StructuralError("equal symbols in Huffman tree") 226 } 227 228 if len(left) == 0 { 229 return buildHuffmanNode(t, right, level+1) 230 } 231 return buildHuffmanNode(t, left, level+1) 232 } 233 234 nodeIndex = uint16(t.nextNode) 235 node := &t.nodes[t.nextNode] 236 t.nextNode++ 237 238 if len(left) == 1 { 239 // leaf node 240 node.left = invalidNodeValue 241 node.leftValue = left[0].value 242 } else { 243 node.left, err = buildHuffmanNode(t, left, level+1) 244 } 245 246 if err != nil { 247 return 248 } 249 250 if len(right) == 1 { 251 // leaf node 252 node.right = invalidNodeValue 253 node.rightValue = right[0].value 254 } else { 255 node.right, err = buildHuffmanNode(t, right, level+1) 256 } 257 258 return 259 }