github.com/andybalholm/brotli@v1.0.6/cluster_distance.go (about) 1 package brotli 2 3 import "math" 4 5 /* Copyright 2013 Google Inc. All Rights Reserved. 6 7 Distributed under MIT license. 8 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT 9 */ 10 11 /* Computes the bit cost reduction by combining out[idx1] and out[idx2] and if 12 it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue. */ 13 func compareAndPushToQueueDistance(out []histogramDistance, cluster_size []uint32, idx1 uint32, idx2 uint32, max_num_pairs uint, pairs []histogramPair, num_pairs *uint) { 14 var is_good_pair bool = false 15 var p histogramPair 16 p.idx2 = 0 17 p.idx1 = p.idx2 18 p.cost_combo = 0 19 p.cost_diff = p.cost_combo 20 if idx1 == idx2 { 21 return 22 } 23 24 if idx2 < idx1 { 25 var t uint32 = idx2 26 idx2 = idx1 27 idx1 = t 28 } 29 30 p.idx1 = idx1 31 p.idx2 = idx2 32 p.cost_diff = 0.5 * clusterCostDiff(uint(cluster_size[idx1]), uint(cluster_size[idx2])) 33 p.cost_diff -= out[idx1].bit_cost_ 34 p.cost_diff -= out[idx2].bit_cost_ 35 36 if out[idx1].total_count_ == 0 { 37 p.cost_combo = out[idx2].bit_cost_ 38 is_good_pair = true 39 } else if out[idx2].total_count_ == 0 { 40 p.cost_combo = out[idx1].bit_cost_ 41 is_good_pair = true 42 } else { 43 var threshold float64 44 if *num_pairs == 0 { 45 threshold = 1e99 46 } else { 47 threshold = brotli_max_double(0.0, pairs[0].cost_diff) 48 } 49 var combo histogramDistance = out[idx1] 50 var cost_combo float64 51 histogramAddHistogramDistance(&combo, &out[idx2]) 52 cost_combo = populationCostDistance(&combo) 53 if cost_combo < threshold-p.cost_diff { 54 p.cost_combo = cost_combo 55 is_good_pair = true 56 } 57 } 58 59 if is_good_pair { 60 p.cost_diff += p.cost_combo 61 if *num_pairs > 0 && histogramPairIsLess(&pairs[0], &p) { 62 /* Replace the top of the queue if needed. */ 63 if *num_pairs < max_num_pairs { 64 pairs[*num_pairs] = pairs[0] 65 (*num_pairs)++ 66 } 67 68 pairs[0] = p 69 } else if *num_pairs < max_num_pairs { 70 pairs[*num_pairs] = p 71 (*num_pairs)++ 72 } 73 } 74 } 75 76 func histogramCombineDistance(out []histogramDistance, cluster_size []uint32, symbols []uint32, clusters []uint32, pairs []histogramPair, num_clusters uint, symbols_size uint, max_clusters uint, max_num_pairs uint) uint { 77 var cost_diff_threshold float64 = 0.0 78 var min_cluster_size uint = 1 79 var num_pairs uint = 0 80 { 81 /* We maintain a vector of histogram pairs, with the property that the pair 82 with the maximum bit cost reduction is the first. */ 83 var idx1 uint 84 for idx1 = 0; idx1 < num_clusters; idx1++ { 85 var idx2 uint 86 for idx2 = idx1 + 1; idx2 < num_clusters; idx2++ { 87 compareAndPushToQueueDistance(out, cluster_size, clusters[idx1], clusters[idx2], max_num_pairs, pairs[0:], &num_pairs) 88 } 89 } 90 } 91 92 for num_clusters > min_cluster_size { 93 var best_idx1 uint32 94 var best_idx2 uint32 95 var i uint 96 if pairs[0].cost_diff >= cost_diff_threshold { 97 cost_diff_threshold = 1e99 98 min_cluster_size = max_clusters 99 continue 100 } 101 102 /* Take the best pair from the top of heap. */ 103 best_idx1 = pairs[0].idx1 104 105 best_idx2 = pairs[0].idx2 106 histogramAddHistogramDistance(&out[best_idx1], &out[best_idx2]) 107 out[best_idx1].bit_cost_ = pairs[0].cost_combo 108 cluster_size[best_idx1] += cluster_size[best_idx2] 109 for i = 0; i < symbols_size; i++ { 110 if symbols[i] == best_idx2 { 111 symbols[i] = best_idx1 112 } 113 } 114 115 for i = 0; i < num_clusters; i++ { 116 if clusters[i] == best_idx2 { 117 copy(clusters[i:], clusters[i+1:][:num_clusters-i-1]) 118 break 119 } 120 } 121 122 num_clusters-- 123 { 124 /* Remove pairs intersecting the just combined best pair. */ 125 var copy_to_idx uint = 0 126 for i = 0; i < num_pairs; i++ { 127 var p *histogramPair = &pairs[i] 128 if p.idx1 == best_idx1 || p.idx2 == best_idx1 || p.idx1 == best_idx2 || p.idx2 == best_idx2 { 129 /* Remove invalid pair from the queue. */ 130 continue 131 } 132 133 if histogramPairIsLess(&pairs[0], p) { 134 /* Replace the top of the queue if needed. */ 135 var front histogramPair = pairs[0] 136 pairs[0] = *p 137 pairs[copy_to_idx] = front 138 } else { 139 pairs[copy_to_idx] = *p 140 } 141 142 copy_to_idx++ 143 } 144 145 num_pairs = copy_to_idx 146 } 147 148 /* Push new pairs formed with the combined histogram to the heap. */ 149 for i = 0; i < num_clusters; i++ { 150 compareAndPushToQueueDistance(out, cluster_size, best_idx1, clusters[i], max_num_pairs, pairs[0:], &num_pairs) 151 } 152 } 153 154 return num_clusters 155 } 156 157 /* What is the bit cost of moving histogram from cur_symbol to candidate. */ 158 func histogramBitCostDistanceDistance(histogram *histogramDistance, candidate *histogramDistance) float64 { 159 if histogram.total_count_ == 0 { 160 return 0.0 161 } else { 162 var tmp histogramDistance = *histogram 163 histogramAddHistogramDistance(&tmp, candidate) 164 return populationCostDistance(&tmp) - candidate.bit_cost_ 165 } 166 } 167 168 /* Find the best 'out' histogram for each of the 'in' histograms. 169 When called, clusters[0..num_clusters) contains the unique values from 170 symbols[0..in_size), but this property is not preserved in this function. 171 Note: we assume that out[]->bit_cost_ is already up-to-date. */ 172 func histogramRemapDistance(in []histogramDistance, in_size uint, clusters []uint32, num_clusters uint, out []histogramDistance, symbols []uint32) { 173 var i uint 174 for i = 0; i < in_size; i++ { 175 var best_out uint32 176 if i == 0 { 177 best_out = symbols[0] 178 } else { 179 best_out = symbols[i-1] 180 } 181 var best_bits float64 = histogramBitCostDistanceDistance(&in[i], &out[best_out]) 182 var j uint 183 for j = 0; j < num_clusters; j++ { 184 var cur_bits float64 = histogramBitCostDistanceDistance(&in[i], &out[clusters[j]]) 185 if cur_bits < best_bits { 186 best_bits = cur_bits 187 best_out = clusters[j] 188 } 189 } 190 191 symbols[i] = best_out 192 } 193 194 /* Recompute each out based on raw and symbols. */ 195 for i = 0; i < num_clusters; i++ { 196 histogramClearDistance(&out[clusters[i]]) 197 } 198 199 for i = 0; i < in_size; i++ { 200 histogramAddHistogramDistance(&out[symbols[i]], &in[i]) 201 } 202 } 203 204 /* Reorders elements of the out[0..length) array and changes values in 205 symbols[0..length) array in the following way: 206 * when called, symbols[] contains indexes into out[], and has N unique 207 values (possibly N < length) 208 * on return, symbols'[i] = f(symbols[i]) and 209 out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length, 210 where f is a bijection between the range of symbols[] and [0..N), and 211 the first occurrences of values in symbols'[i] come in consecutive 212 increasing order. 213 Returns N, the number of unique values in symbols[]. */ 214 215 var histogramReindexDistance_kInvalidIndex uint32 = math.MaxUint32 216 217 func histogramReindexDistance(out []histogramDistance, symbols []uint32, length uint) uint { 218 var new_index []uint32 = make([]uint32, length) 219 var next_index uint32 220 var tmp []histogramDistance 221 var i uint 222 for i = 0; i < length; i++ { 223 new_index[i] = histogramReindexDistance_kInvalidIndex 224 } 225 226 next_index = 0 227 for i = 0; i < length; i++ { 228 if new_index[symbols[i]] == histogramReindexDistance_kInvalidIndex { 229 new_index[symbols[i]] = next_index 230 next_index++ 231 } 232 } 233 234 /* TODO: by using idea of "cycle-sort" we can avoid allocation of 235 tmp and reduce the number of copying by the factor of 2. */ 236 tmp = make([]histogramDistance, next_index) 237 238 next_index = 0 239 for i = 0; i < length; i++ { 240 if new_index[symbols[i]] == next_index { 241 tmp[next_index] = out[symbols[i]] 242 next_index++ 243 } 244 245 symbols[i] = new_index[symbols[i]] 246 } 247 248 new_index = nil 249 for i = 0; uint32(i) < next_index; i++ { 250 out[i] = tmp[i] 251 } 252 253 tmp = nil 254 return uint(next_index) 255 } 256 257 func clusterHistogramsDistance(in []histogramDistance, in_size uint, max_histograms uint, out []histogramDistance, out_size *uint, histogram_symbols []uint32) { 258 var cluster_size []uint32 = make([]uint32, in_size) 259 var clusters []uint32 = make([]uint32, in_size) 260 var num_clusters uint = 0 261 var max_input_histograms uint = 64 262 var pairs_capacity uint = max_input_histograms * max_input_histograms / 2 263 var pairs []histogramPair = make([]histogramPair, (pairs_capacity + 1)) 264 var i uint 265 266 /* For the first pass of clustering, we allow all pairs. */ 267 for i = 0; i < in_size; i++ { 268 cluster_size[i] = 1 269 } 270 271 for i = 0; i < in_size; i++ { 272 out[i] = in[i] 273 out[i].bit_cost_ = populationCostDistance(&in[i]) 274 histogram_symbols[i] = uint32(i) 275 } 276 277 for i = 0; i < in_size; i += max_input_histograms { 278 var num_to_combine uint = brotli_min_size_t(in_size-i, max_input_histograms) 279 var num_new_clusters uint 280 var j uint 281 for j = 0; j < num_to_combine; j++ { 282 clusters[num_clusters+j] = uint32(i + j) 283 } 284 285 num_new_clusters = histogramCombineDistance(out, cluster_size, histogram_symbols[i:], clusters[num_clusters:], pairs, num_to_combine, num_to_combine, max_histograms, pairs_capacity) 286 num_clusters += num_new_clusters 287 } 288 { 289 /* For the second pass, we limit the total number of histogram pairs. 290 After this limit is reached, we only keep searching for the best pair. */ 291 var max_num_pairs uint = brotli_min_size_t(64*num_clusters, (num_clusters/2)*num_clusters) 292 if pairs_capacity < (max_num_pairs + 1) { 293 var _new_size uint 294 if pairs_capacity == 0 { 295 _new_size = max_num_pairs + 1 296 } else { 297 _new_size = pairs_capacity 298 } 299 var new_array []histogramPair 300 for _new_size < (max_num_pairs + 1) { 301 _new_size *= 2 302 } 303 new_array = make([]histogramPair, _new_size) 304 if pairs_capacity != 0 { 305 copy(new_array, pairs[:pairs_capacity]) 306 } 307 308 pairs = new_array 309 pairs_capacity = _new_size 310 } 311 312 /* Collapse similar histograms. */ 313 num_clusters = histogramCombineDistance(out, cluster_size, histogram_symbols, clusters, pairs, num_clusters, in_size, max_histograms, max_num_pairs) 314 } 315 316 pairs = nil 317 cluster_size = nil 318 319 /* Find the optimal map from original histograms to the final ones. */ 320 histogramRemapDistance(in, in_size, clusters, num_clusters, out, histogram_symbols) 321 322 clusters = nil 323 324 /* Convert the context map to a canonical form. */ 325 *out_size = histogramReindexDistance(out, histogram_symbols, in_size) 326 }