github.com/andybalholm/brotli@v1.0.6/block_splitter_distance.go (about) 1 package brotli 2 3 import "math" 4 5 /* Copyright 2013 Google Inc. All Rights Reserved. 6 7 Distributed under MIT license. 8 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT 9 */ 10 11 func initialEntropyCodesDistance(data []uint16, length uint, stride uint, num_histograms uint, histograms []histogramDistance) { 12 var seed uint32 = 7 13 var block_length uint = length / num_histograms 14 var i uint 15 clearHistogramsDistance(histograms, num_histograms) 16 for i = 0; i < num_histograms; i++ { 17 var pos uint = length * i / num_histograms 18 if i != 0 { 19 pos += uint(myRand(&seed) % uint32(block_length)) 20 } 21 22 if pos+stride >= length { 23 pos = length - stride - 1 24 } 25 26 histogramAddVectorDistance(&histograms[i], data[pos:], stride) 27 } 28 } 29 30 func randomSampleDistance(seed *uint32, data []uint16, length uint, stride uint, sample *histogramDistance) { 31 var pos uint = 0 32 if stride >= length { 33 stride = length 34 } else { 35 pos = uint(myRand(seed) % uint32(length-stride+1)) 36 } 37 38 histogramAddVectorDistance(sample, data[pos:], stride) 39 } 40 41 func refineEntropyCodesDistance(data []uint16, length uint, stride uint, num_histograms uint, histograms []histogramDistance) { 42 var iters uint = kIterMulForRefining*length/stride + kMinItersForRefining 43 var seed uint32 = 7 44 var iter uint 45 iters = ((iters + num_histograms - 1) / num_histograms) * num_histograms 46 for iter = 0; iter < iters; iter++ { 47 var sample histogramDistance 48 histogramClearDistance(&sample) 49 randomSampleDistance(&seed, data, length, stride, &sample) 50 histogramAddHistogramDistance(&histograms[iter%num_histograms], &sample) 51 } 52 } 53 54 /* Assigns a block id from the range [0, num_histograms) to each data element 55 in data[0..length) and fills in block_id[0..length) with the assigned values. 56 Returns the number of blocks, i.e. one plus the number of block switches. */ 57 func findBlocksDistance(data []uint16, length uint, block_switch_bitcost float64, num_histograms uint, histograms []histogramDistance, insert_cost []float64, cost []float64, switch_signal []byte, block_id []byte) uint { 58 var data_size uint = histogramDataSizeDistance() 59 var bitmaplen uint = (num_histograms + 7) >> 3 60 var num_blocks uint = 1 61 var i uint 62 var j uint 63 assert(num_histograms <= 256) 64 if num_histograms <= 1 { 65 for i = 0; i < length; i++ { 66 block_id[i] = 0 67 } 68 69 return 1 70 } 71 72 for i := 0; i < int(data_size*num_histograms); i++ { 73 insert_cost[i] = 0 74 } 75 for i = 0; i < num_histograms; i++ { 76 insert_cost[i] = fastLog2(uint(uint32(histograms[i].total_count_))) 77 } 78 79 for i = data_size; i != 0; { 80 i-- 81 for j = 0; j < num_histograms; j++ { 82 insert_cost[i*num_histograms+j] = insert_cost[j] - bitCost(uint(histograms[j].data_[i])) 83 } 84 } 85 86 for i := 0; i < int(num_histograms); i++ { 87 cost[i] = 0 88 } 89 for i := 0; i < int(length*bitmaplen); i++ { 90 switch_signal[i] = 0 91 } 92 93 /* After each iteration of this loop, cost[k] will contain the difference 94 between the minimum cost of arriving at the current byte position using 95 entropy code k, and the minimum cost of arriving at the current byte 96 position. This difference is capped at the block switch cost, and if it 97 reaches block switch cost, it means that when we trace back from the last 98 position, we need to switch here. */ 99 for i = 0; i < length; i++ { 100 var byte_ix uint = i 101 var ix uint = byte_ix * bitmaplen 102 var insert_cost_ix uint = uint(data[byte_ix]) * num_histograms 103 var min_cost float64 = 1e99 104 var block_switch_cost float64 = block_switch_bitcost 105 var k uint 106 for k = 0; k < num_histograms; k++ { 107 /* We are coding the symbol in data[byte_ix] with entropy code k. */ 108 cost[k] += insert_cost[insert_cost_ix+k] 109 110 if cost[k] < min_cost { 111 min_cost = cost[k] 112 block_id[byte_ix] = byte(k) 113 } 114 } 115 116 /* More blocks for the beginning. */ 117 if byte_ix < 2000 { 118 block_switch_cost *= 0.77 + 0.07*float64(byte_ix)/2000 119 } 120 121 for k = 0; k < num_histograms; k++ { 122 cost[k] -= min_cost 123 if cost[k] >= block_switch_cost { 124 var mask byte = byte(1 << (k & 7)) 125 cost[k] = block_switch_cost 126 assert(k>>3 < bitmaplen) 127 switch_signal[ix+(k>>3)] |= mask 128 /* Trace back from the last position and switch at the marked places. */ 129 } 130 } 131 } 132 { 133 var byte_ix uint = length - 1 134 var ix uint = byte_ix * bitmaplen 135 var cur_id byte = block_id[byte_ix] 136 for byte_ix > 0 { 137 var mask byte = byte(1 << (cur_id & 7)) 138 assert(uint(cur_id)>>3 < bitmaplen) 139 byte_ix-- 140 ix -= bitmaplen 141 if switch_signal[ix+uint(cur_id>>3)]&mask != 0 { 142 if cur_id != block_id[byte_ix] { 143 cur_id = block_id[byte_ix] 144 num_blocks++ 145 } 146 } 147 148 block_id[byte_ix] = cur_id 149 } 150 } 151 152 return num_blocks 153 } 154 155 var remapBlockIdsDistance_kInvalidId uint16 = 256 156 157 func remapBlockIdsDistance(block_ids []byte, length uint, new_id []uint16, num_histograms uint) uint { 158 var next_id uint16 = 0 159 var i uint 160 for i = 0; i < num_histograms; i++ { 161 new_id[i] = remapBlockIdsDistance_kInvalidId 162 } 163 164 for i = 0; i < length; i++ { 165 assert(uint(block_ids[i]) < num_histograms) 166 if new_id[block_ids[i]] == remapBlockIdsDistance_kInvalidId { 167 new_id[block_ids[i]] = next_id 168 next_id++ 169 } 170 } 171 172 for i = 0; i < length; i++ { 173 block_ids[i] = byte(new_id[block_ids[i]]) 174 assert(uint(block_ids[i]) < num_histograms) 175 } 176 177 assert(uint(next_id) <= num_histograms) 178 return uint(next_id) 179 } 180 181 func buildBlockHistogramsDistance(data []uint16, length uint, block_ids []byte, num_histograms uint, histograms []histogramDistance) { 182 var i uint 183 clearHistogramsDistance(histograms, num_histograms) 184 for i = 0; i < length; i++ { 185 histogramAddDistance(&histograms[block_ids[i]], uint(data[i])) 186 } 187 } 188 189 var clusterBlocksDistance_kInvalidIndex uint32 = math.MaxUint32 190 191 func clusterBlocksDistance(data []uint16, length uint, num_blocks uint, block_ids []byte, split *blockSplit) { 192 var histogram_symbols []uint32 = make([]uint32, num_blocks) 193 var block_lengths []uint32 = make([]uint32, num_blocks) 194 var expected_num_clusters uint = clustersPerBatch * (num_blocks + histogramsPerBatch - 1) / histogramsPerBatch 195 var all_histograms_size uint = 0 196 var all_histograms_capacity uint = expected_num_clusters 197 var all_histograms []histogramDistance = make([]histogramDistance, all_histograms_capacity) 198 var cluster_size_size uint = 0 199 var cluster_size_capacity uint = expected_num_clusters 200 var cluster_size []uint32 = make([]uint32, cluster_size_capacity) 201 var num_clusters uint = 0 202 var histograms []histogramDistance = make([]histogramDistance, brotli_min_size_t(num_blocks, histogramsPerBatch)) 203 var max_num_pairs uint = histogramsPerBatch * histogramsPerBatch / 2 204 var pairs_capacity uint = max_num_pairs + 1 205 var pairs []histogramPair = make([]histogramPair, pairs_capacity) 206 var pos uint = 0 207 var clusters []uint32 208 var num_final_clusters uint 209 var new_index []uint32 210 var i uint 211 var sizes = [histogramsPerBatch]uint32{0} 212 var new_clusters = [histogramsPerBatch]uint32{0} 213 var symbols = [histogramsPerBatch]uint32{0} 214 var remap = [histogramsPerBatch]uint32{0} 215 216 for i := 0; i < int(num_blocks); i++ { 217 block_lengths[i] = 0 218 } 219 { 220 var block_idx uint = 0 221 for i = 0; i < length; i++ { 222 assert(block_idx < num_blocks) 223 block_lengths[block_idx]++ 224 if i+1 == length || block_ids[i] != block_ids[i+1] { 225 block_idx++ 226 } 227 } 228 229 assert(block_idx == num_blocks) 230 } 231 232 for i = 0; i < num_blocks; i += histogramsPerBatch { 233 var num_to_combine uint = brotli_min_size_t(num_blocks-i, histogramsPerBatch) 234 var num_new_clusters uint 235 var j uint 236 for j = 0; j < num_to_combine; j++ { 237 var k uint 238 histogramClearDistance(&histograms[j]) 239 for k = 0; uint32(k) < block_lengths[i+j]; k++ { 240 histogramAddDistance(&histograms[j], uint(data[pos])) 241 pos++ 242 } 243 244 histograms[j].bit_cost_ = populationCostDistance(&histograms[j]) 245 new_clusters[j] = uint32(j) 246 symbols[j] = uint32(j) 247 sizes[j] = 1 248 } 249 250 num_new_clusters = histogramCombineDistance(histograms, sizes[:], symbols[:], new_clusters[:], []histogramPair(pairs), num_to_combine, num_to_combine, histogramsPerBatch, max_num_pairs) 251 if all_histograms_capacity < (all_histograms_size + num_new_clusters) { 252 var _new_size uint 253 if all_histograms_capacity == 0 { 254 _new_size = all_histograms_size + num_new_clusters 255 } else { 256 _new_size = all_histograms_capacity 257 } 258 var new_array []histogramDistance 259 for _new_size < (all_histograms_size + num_new_clusters) { 260 _new_size *= 2 261 } 262 new_array = make([]histogramDistance, _new_size) 263 if all_histograms_capacity != 0 { 264 copy(new_array, all_histograms[:all_histograms_capacity]) 265 } 266 267 all_histograms = new_array 268 all_histograms_capacity = _new_size 269 } 270 271 brotli_ensure_capacity_uint32_t(&cluster_size, &cluster_size_capacity, cluster_size_size+num_new_clusters) 272 for j = 0; j < num_new_clusters; j++ { 273 all_histograms[all_histograms_size] = histograms[new_clusters[j]] 274 all_histograms_size++ 275 cluster_size[cluster_size_size] = sizes[new_clusters[j]] 276 cluster_size_size++ 277 remap[new_clusters[j]] = uint32(j) 278 } 279 280 for j = 0; j < num_to_combine; j++ { 281 histogram_symbols[i+j] = uint32(num_clusters) + remap[symbols[j]] 282 } 283 284 num_clusters += num_new_clusters 285 assert(num_clusters == cluster_size_size) 286 assert(num_clusters == all_histograms_size) 287 } 288 289 histograms = nil 290 291 max_num_pairs = brotli_min_size_t(64*num_clusters, (num_clusters/2)*num_clusters) 292 if pairs_capacity < max_num_pairs+1 { 293 pairs = nil 294 pairs = make([]histogramPair, (max_num_pairs + 1)) 295 } 296 297 clusters = make([]uint32, num_clusters) 298 for i = 0; i < num_clusters; i++ { 299 clusters[i] = uint32(i) 300 } 301 302 num_final_clusters = histogramCombineDistance(all_histograms, cluster_size, histogram_symbols, clusters, pairs, num_clusters, num_blocks, maxNumberOfBlockTypes, max_num_pairs) 303 pairs = nil 304 cluster_size = nil 305 306 new_index = make([]uint32, num_clusters) 307 for i = 0; i < num_clusters; i++ { 308 new_index[i] = clusterBlocksDistance_kInvalidIndex 309 } 310 pos = 0 311 { 312 var next_index uint32 = 0 313 for i = 0; i < num_blocks; i++ { 314 var histo histogramDistance 315 var j uint 316 var best_out uint32 317 var best_bits float64 318 histogramClearDistance(&histo) 319 for j = 0; uint32(j) < block_lengths[i]; j++ { 320 histogramAddDistance(&histo, uint(data[pos])) 321 pos++ 322 } 323 324 if i == 0 { 325 best_out = histogram_symbols[0] 326 } else { 327 best_out = histogram_symbols[i-1] 328 } 329 best_bits = histogramBitCostDistanceDistance(&histo, &all_histograms[best_out]) 330 for j = 0; j < num_final_clusters; j++ { 331 var cur_bits float64 = histogramBitCostDistanceDistance(&histo, &all_histograms[clusters[j]]) 332 if cur_bits < best_bits { 333 best_bits = cur_bits 334 best_out = clusters[j] 335 } 336 } 337 338 histogram_symbols[i] = best_out 339 if new_index[best_out] == clusterBlocksDistance_kInvalidIndex { 340 new_index[best_out] = next_index 341 next_index++ 342 } 343 } 344 } 345 346 clusters = nil 347 all_histograms = nil 348 brotli_ensure_capacity_uint8_t(&split.types, &split.types_alloc_size, num_blocks) 349 brotli_ensure_capacity_uint32_t(&split.lengths, &split.lengths_alloc_size, num_blocks) 350 { 351 var cur_length uint32 = 0 352 var block_idx uint = 0 353 var max_type byte = 0 354 for i = 0; i < num_blocks; i++ { 355 cur_length += block_lengths[i] 356 if i+1 == num_blocks || histogram_symbols[i] != histogram_symbols[i+1] { 357 var id byte = byte(new_index[histogram_symbols[i]]) 358 split.types[block_idx] = id 359 split.lengths[block_idx] = cur_length 360 max_type = brotli_max_uint8_t(max_type, id) 361 cur_length = 0 362 block_idx++ 363 } 364 } 365 366 split.num_blocks = block_idx 367 split.num_types = uint(max_type) + 1 368 } 369 370 new_index = nil 371 block_lengths = nil 372 histogram_symbols = nil 373 } 374 375 func splitByteVectorDistance(data []uint16, length uint, literals_per_histogram uint, max_histograms uint, sampling_stride_length uint, block_switch_cost float64, params *encoderParams, split *blockSplit) { 376 var data_size uint = histogramDataSizeDistance() 377 var num_histograms uint = length/literals_per_histogram + 1 378 var histograms []histogramDistance 379 if num_histograms > max_histograms { 380 num_histograms = max_histograms 381 } 382 383 if length == 0 { 384 split.num_types = 1 385 return 386 } else if length < kMinLengthForBlockSplitting { 387 brotli_ensure_capacity_uint8_t(&split.types, &split.types_alloc_size, split.num_blocks+1) 388 brotli_ensure_capacity_uint32_t(&split.lengths, &split.lengths_alloc_size, split.num_blocks+1) 389 split.num_types = 1 390 split.types[split.num_blocks] = 0 391 split.lengths[split.num_blocks] = uint32(length) 392 split.num_blocks++ 393 return 394 } 395 396 histograms = make([]histogramDistance, num_histograms) 397 398 /* Find good entropy codes. */ 399 initialEntropyCodesDistance(data, length, sampling_stride_length, num_histograms, histograms) 400 401 refineEntropyCodesDistance(data, length, sampling_stride_length, num_histograms, histograms) 402 { 403 var block_ids []byte = make([]byte, length) 404 var num_blocks uint = 0 405 var bitmaplen uint = (num_histograms + 7) >> 3 406 var insert_cost []float64 = make([]float64, (data_size * num_histograms)) 407 var cost []float64 = make([]float64, num_histograms) 408 var switch_signal []byte = make([]byte, (length * bitmaplen)) 409 var new_id []uint16 = make([]uint16, num_histograms) 410 var iters uint 411 if params.quality < hqZopflificationQuality { 412 iters = 3 413 } else { 414 iters = 10 415 } 416 /* Find a good path through literals with the good entropy codes. */ 417 418 var i uint 419 for i = 0; i < iters; i++ { 420 num_blocks = findBlocksDistance(data, length, block_switch_cost, num_histograms, histograms, insert_cost, cost, switch_signal, block_ids) 421 num_histograms = remapBlockIdsDistance(block_ids, length, new_id, num_histograms) 422 buildBlockHistogramsDistance(data, length, block_ids, num_histograms, histograms) 423 } 424 425 insert_cost = nil 426 cost = nil 427 switch_signal = nil 428 new_id = nil 429 histograms = nil 430 clusterBlocksDistance(data, length, num_blocks, block_ids, split) 431 block_ids = nil 432 } 433 }