github.com/andybalholm/brotli@v1.0.6/block_splitter_distance.go (about)

     1  package brotli
     2  
     3  import "math"
     4  
     5  /* Copyright 2013 Google Inc. All Rights Reserved.
     6  
     7     Distributed under MIT license.
     8     See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
     9  */
    10  
    11  func initialEntropyCodesDistance(data []uint16, length uint, stride uint, num_histograms uint, histograms []histogramDistance) {
    12  	var seed uint32 = 7
    13  	var block_length uint = length / num_histograms
    14  	var i uint
    15  	clearHistogramsDistance(histograms, num_histograms)
    16  	for i = 0; i < num_histograms; i++ {
    17  		var pos uint = length * i / num_histograms
    18  		if i != 0 {
    19  			pos += uint(myRand(&seed) % uint32(block_length))
    20  		}
    21  
    22  		if pos+stride >= length {
    23  			pos = length - stride - 1
    24  		}
    25  
    26  		histogramAddVectorDistance(&histograms[i], data[pos:], stride)
    27  	}
    28  }
    29  
    30  func randomSampleDistance(seed *uint32, data []uint16, length uint, stride uint, sample *histogramDistance) {
    31  	var pos uint = 0
    32  	if stride >= length {
    33  		stride = length
    34  	} else {
    35  		pos = uint(myRand(seed) % uint32(length-stride+1))
    36  	}
    37  
    38  	histogramAddVectorDistance(sample, data[pos:], stride)
    39  }
    40  
    41  func refineEntropyCodesDistance(data []uint16, length uint, stride uint, num_histograms uint, histograms []histogramDistance) {
    42  	var iters uint = kIterMulForRefining*length/stride + kMinItersForRefining
    43  	var seed uint32 = 7
    44  	var iter uint
    45  	iters = ((iters + num_histograms - 1) / num_histograms) * num_histograms
    46  	for iter = 0; iter < iters; iter++ {
    47  		var sample histogramDistance
    48  		histogramClearDistance(&sample)
    49  		randomSampleDistance(&seed, data, length, stride, &sample)
    50  		histogramAddHistogramDistance(&histograms[iter%num_histograms], &sample)
    51  	}
    52  }
    53  
    54  /* Assigns a block id from the range [0, num_histograms) to each data element
    55     in data[0..length) and fills in block_id[0..length) with the assigned values.
    56     Returns the number of blocks, i.e. one plus the number of block switches. */
    57  func findBlocksDistance(data []uint16, length uint, block_switch_bitcost float64, num_histograms uint, histograms []histogramDistance, insert_cost []float64, cost []float64, switch_signal []byte, block_id []byte) uint {
    58  	var data_size uint = histogramDataSizeDistance()
    59  	var bitmaplen uint = (num_histograms + 7) >> 3
    60  	var num_blocks uint = 1
    61  	var i uint
    62  	var j uint
    63  	assert(num_histograms <= 256)
    64  	if num_histograms <= 1 {
    65  		for i = 0; i < length; i++ {
    66  			block_id[i] = 0
    67  		}
    68  
    69  		return 1
    70  	}
    71  
    72  	for i := 0; i < int(data_size*num_histograms); i++ {
    73  		insert_cost[i] = 0
    74  	}
    75  	for i = 0; i < num_histograms; i++ {
    76  		insert_cost[i] = fastLog2(uint(uint32(histograms[i].total_count_)))
    77  	}
    78  
    79  	for i = data_size; i != 0; {
    80  		i--
    81  		for j = 0; j < num_histograms; j++ {
    82  			insert_cost[i*num_histograms+j] = insert_cost[j] - bitCost(uint(histograms[j].data_[i]))
    83  		}
    84  	}
    85  
    86  	for i := 0; i < int(num_histograms); i++ {
    87  		cost[i] = 0
    88  	}
    89  	for i := 0; i < int(length*bitmaplen); i++ {
    90  		switch_signal[i] = 0
    91  	}
    92  
    93  	/* After each iteration of this loop, cost[k] will contain the difference
    94  	   between the minimum cost of arriving at the current byte position using
    95  	   entropy code k, and the minimum cost of arriving at the current byte
    96  	   position. This difference is capped at the block switch cost, and if it
    97  	   reaches block switch cost, it means that when we trace back from the last
    98  	   position, we need to switch here. */
    99  	for i = 0; i < length; i++ {
   100  		var byte_ix uint = i
   101  		var ix uint = byte_ix * bitmaplen
   102  		var insert_cost_ix uint = uint(data[byte_ix]) * num_histograms
   103  		var min_cost float64 = 1e99
   104  		var block_switch_cost float64 = block_switch_bitcost
   105  		var k uint
   106  		for k = 0; k < num_histograms; k++ {
   107  			/* We are coding the symbol in data[byte_ix] with entropy code k. */
   108  			cost[k] += insert_cost[insert_cost_ix+k]
   109  
   110  			if cost[k] < min_cost {
   111  				min_cost = cost[k]
   112  				block_id[byte_ix] = byte(k)
   113  			}
   114  		}
   115  
   116  		/* More blocks for the beginning. */
   117  		if byte_ix < 2000 {
   118  			block_switch_cost *= 0.77 + 0.07*float64(byte_ix)/2000
   119  		}
   120  
   121  		for k = 0; k < num_histograms; k++ {
   122  			cost[k] -= min_cost
   123  			if cost[k] >= block_switch_cost {
   124  				var mask byte = byte(1 << (k & 7))
   125  				cost[k] = block_switch_cost
   126  				assert(k>>3 < bitmaplen)
   127  				switch_signal[ix+(k>>3)] |= mask
   128  				/* Trace back from the last position and switch at the marked places. */
   129  			}
   130  		}
   131  	}
   132  	{
   133  		var byte_ix uint = length - 1
   134  		var ix uint = byte_ix * bitmaplen
   135  		var cur_id byte = block_id[byte_ix]
   136  		for byte_ix > 0 {
   137  			var mask byte = byte(1 << (cur_id & 7))
   138  			assert(uint(cur_id)>>3 < bitmaplen)
   139  			byte_ix--
   140  			ix -= bitmaplen
   141  			if switch_signal[ix+uint(cur_id>>3)]&mask != 0 {
   142  				if cur_id != block_id[byte_ix] {
   143  					cur_id = block_id[byte_ix]
   144  					num_blocks++
   145  				}
   146  			}
   147  
   148  			block_id[byte_ix] = cur_id
   149  		}
   150  	}
   151  
   152  	return num_blocks
   153  }
   154  
   155  var remapBlockIdsDistance_kInvalidId uint16 = 256
   156  
   157  func remapBlockIdsDistance(block_ids []byte, length uint, new_id []uint16, num_histograms uint) uint {
   158  	var next_id uint16 = 0
   159  	var i uint
   160  	for i = 0; i < num_histograms; i++ {
   161  		new_id[i] = remapBlockIdsDistance_kInvalidId
   162  	}
   163  
   164  	for i = 0; i < length; i++ {
   165  		assert(uint(block_ids[i]) < num_histograms)
   166  		if new_id[block_ids[i]] == remapBlockIdsDistance_kInvalidId {
   167  			new_id[block_ids[i]] = next_id
   168  			next_id++
   169  		}
   170  	}
   171  
   172  	for i = 0; i < length; i++ {
   173  		block_ids[i] = byte(new_id[block_ids[i]])
   174  		assert(uint(block_ids[i]) < num_histograms)
   175  	}
   176  
   177  	assert(uint(next_id) <= num_histograms)
   178  	return uint(next_id)
   179  }
   180  
   181  func buildBlockHistogramsDistance(data []uint16, length uint, block_ids []byte, num_histograms uint, histograms []histogramDistance) {
   182  	var i uint
   183  	clearHistogramsDistance(histograms, num_histograms)
   184  	for i = 0; i < length; i++ {
   185  		histogramAddDistance(&histograms[block_ids[i]], uint(data[i]))
   186  	}
   187  }
   188  
/* Marker stored in new_index for clusters that have not been given a compact
   output type yet. */
var clusterBlocksDistance_kInvalidIndex uint32 = math.MaxUint32

/* clusterBlocksDistance turns the num_blocks runs of equal ids found in
   block_ids[0..length) into the block types and block lengths stored in split.

   It works in three stages:
     1. Blocks are processed in batches of histogramsPerBatch; each batch is
        reduced with histogramCombineDistance and the surviving histograms are
        appended to all_histograms.
     2. All surviving histograms are reduced once more with
        histogramCombineDistance, this time with maxNumberOfBlockTypes as the
        limit argument.
     3. Every block is assigned to the candidate histogram with the lowest
        histogramBitCostDistanceDistance, the clusters in use are renumbered in
        order of first use, and adjacent blocks with the same cluster are
        merged into a single entry of split.

   NOTE(review): histogramCombineDistance is defined elsewhere; the comments
   below assume it returns the number of remaining clusters and updates its
   slice arguments in place -- confirm against its definition. */
func clusterBlocksDistance(data []uint16, length uint, num_blocks uint, block_ids []byte, split *blockSplit) {
	var histogram_symbols []uint32 = make([]uint32, num_blocks)
	var block_lengths []uint32 = make([]uint32, num_blocks)
	var expected_num_clusters uint = clustersPerBatch * (num_blocks + histogramsPerBatch - 1) / histogramsPerBatch
	var all_histograms_size uint = 0
	var all_histograms_capacity uint = expected_num_clusters
	var all_histograms []histogramDistance = make([]histogramDistance, all_histograms_capacity)
	var cluster_size_size uint = 0
	var cluster_size_capacity uint = expected_num_clusters
	var cluster_size []uint32 = make([]uint32, cluster_size_capacity)
	var num_clusters uint = 0
	var histograms []histogramDistance = make([]histogramDistance, brotli_min_size_t(num_blocks, histogramsPerBatch))
	var max_num_pairs uint = histogramsPerBatch * histogramsPerBatch / 2
	var pairs_capacity uint = max_num_pairs + 1
	var pairs []histogramPair = make([]histogramPair, pairs_capacity)
	var pos uint = 0
	var clusters []uint32
	var num_final_clusters uint
	var new_index []uint32
	var i uint
	var sizes = [histogramsPerBatch]uint32{0}
	var new_clusters = [histogramsPerBatch]uint32{0}
	var symbols = [histogramsPerBatch]uint32{0}
	var remap = [histogramsPerBatch]uint32{0}

	for i := 0; i < int(num_blocks); i++ {
		block_lengths[i] = 0
	}
	/* Measure the blocks: a block ends where the id changes or the data ends. */
	{
		var block_idx uint = 0
		for i = 0; i < length; i++ {
			assert(block_idx < num_blocks)
			block_lengths[block_idx]++
			if i+1 == length || block_ids[i] != block_ids[i+1] {
				block_idx++
			}
		}

		assert(block_idx == num_blocks)
	}

	/* Stage 1: cluster the blocks batch by batch. */
	for i = 0; i < num_blocks; i += histogramsPerBatch {
		var num_to_combine uint = brotli_min_size_t(num_blocks-i, histogramsPerBatch)
		var num_new_clusters uint
		var j uint
		/* Build one histogram per block of the batch; pos walks through data
		   sequentially across all blocks. Each block starts out as its own
		   cluster of size 1. */
		for j = 0; j < num_to_combine; j++ {
			var k uint
			histogramClearDistance(&histograms[j])
			for k = 0; uint32(k) < block_lengths[i+j]; k++ {
				histogramAddDistance(&histograms[j], uint(data[pos]))
				pos++
			}

			histograms[j].bit_cost_ = populationCostDistance(&histograms[j])
			new_clusters[j] = uint32(j)
			symbols[j] = uint32(j)
			sizes[j] = 1
		}

		num_new_clusters = histogramCombineDistance(histograms, sizes[:], symbols[:], new_clusters[:], []histogramPair(pairs), num_to_combine, num_to_combine, histogramsPerBatch, max_num_pairs)
		/* Make room in all_histograms: start from the current capacity (or
		   from the exact size needed when the capacity is zero), double until
		   it fits, and copy the old contents over. */
		if all_histograms_capacity < (all_histograms_size + num_new_clusters) {
			var _new_size uint
			if all_histograms_capacity == 0 {
				_new_size = all_histograms_size + num_new_clusters
			} else {
				_new_size = all_histograms_capacity
			}
			var new_array []histogramDistance
			for _new_size < (all_histograms_size + num_new_clusters) {
				_new_size *= 2
			}
			new_array = make([]histogramDistance, _new_size)
			if all_histograms_capacity != 0 {
				copy(new_array, all_histograms[:all_histograms_capacity])
			}

			all_histograms = new_array
			all_histograms_capacity = _new_size
		}

		brotli_ensure_capacity_uint32_t(&cluster_size, &cluster_size_capacity, cluster_size_size+num_new_clusters)
		/* Append the batch's clusters to the global lists. remap translates a
		   batch-local histogram index into its rank among the new clusters. */
		for j = 0; j < num_new_clusters; j++ {
			all_histograms[all_histograms_size] = histograms[new_clusters[j]]
			all_histograms_size++
			cluster_size[cluster_size_size] = sizes[new_clusters[j]]
			cluster_size_size++
			remap[new_clusters[j]] = uint32(j)
		}

		/* Record for every block of the batch the global index of its
		   cluster: the clusters of earlier batches come first. */
		for j = 0; j < num_to_combine; j++ {
			histogram_symbols[i+j] = uint32(num_clusters) + remap[symbols[j]]
		}

		num_clusters += num_new_clusters
		assert(num_clusters == cluster_size_size)
		assert(num_clusters == all_histograms_size)
	}

	histograms = nil

	/* Stage 2: combine the clusters of all batches. The pair buffer is
	   reallocated only when the new limit does not fit the existing one. */
	max_num_pairs = brotli_min_size_t(64*num_clusters, (num_clusters/2)*num_clusters)
	if pairs_capacity < max_num_pairs+1 {
		pairs = nil
		pairs = make([]histogramPair, (max_num_pairs + 1))
	}

	clusters = make([]uint32, num_clusters)
	for i = 0; i < num_clusters; i++ {
		clusters[i] = uint32(i)
	}

	num_final_clusters = histogramCombineDistance(all_histograms, cluster_size, histogram_symbols, clusters, pairs, num_clusters, num_blocks, maxNumberOfBlockTypes, max_num_pairs)
	pairs = nil
	cluster_size = nil

	new_index = make([]uint32, num_clusters)
	for i = 0; i < num_clusters; i++ {
		new_index[i] = clusterBlocksDistance_kInvalidIndex
	}
	pos = 0
	/* Stage 3a: pick the cheapest cluster for every block. */
	{
		var next_index uint32 = 0
		for i = 0; i < num_blocks; i++ {
			var histo histogramDistance
			var j uint
			var best_out uint32
			var best_bits float64
			histogramClearDistance(&histo)
			for j = 0; uint32(j) < block_lengths[i]; j++ {
				histogramAddDistance(&histo, uint(data[pos]))
				pos++
			}

			/* Start from the cluster chosen for the previous block (or the
			   first block's own cluster); the strict comparison below means
			   this starting candidate wins ties. */
			if i == 0 {
				best_out = histogram_symbols[0]
			} else {
				best_out = histogram_symbols[i-1]
			}
			best_bits = histogramBitCostDistanceDistance(&histo, &all_histograms[best_out])
			for j = 0; j < num_final_clusters; j++ {
				var cur_bits float64 = histogramBitCostDistanceDistance(&histo, &all_histograms[clusters[j]])
				if cur_bits < best_bits {
					best_bits = cur_bits
					best_out = clusters[j]
				}
			}

			histogram_symbols[i] = best_out
			/* Number the clusters that are actually used in order of first
			   use. */
			if new_index[best_out] == clusterBlocksDistance_kInvalidIndex {
				new_index[best_out] = next_index
				next_index++
			}
		}
	}

	clusters = nil
	all_histograms = nil
	brotli_ensure_capacity_uint8_t(&split.types, &split.types_alloc_size, num_blocks)
	brotli_ensure_capacity_uint32_t(&split.lengths, &split.lengths_alloc_size, num_blocks)
	/* Stage 3b: write the split, merging runs of adjacent blocks that share a
	   cluster into one entry whose length is the sum of the run. */
	{
		var cur_length uint32 = 0
		var block_idx uint = 0
		var max_type byte = 0
		for i = 0; i < num_blocks; i++ {
			cur_length += block_lengths[i]
			if i+1 == num_blocks || histogram_symbols[i] != histogram_symbols[i+1] {
				var id byte = byte(new_index[histogram_symbols[i]])
				split.types[block_idx] = id
				split.lengths[block_idx] = cur_length
				max_type = brotli_max_uint8_t(max_type, id)
				cur_length = 0
				block_idx++
			}
		}

		split.num_blocks = block_idx
		split.num_types = uint(max_type) + 1
	}

	new_index = nil
	block_lengths = nil
	histogram_symbols = nil
}
   374  
   375  func splitByteVectorDistance(data []uint16, length uint, literals_per_histogram uint, max_histograms uint, sampling_stride_length uint, block_switch_cost float64, params *encoderParams, split *blockSplit) {
   376  	var data_size uint = histogramDataSizeDistance()
   377  	var num_histograms uint = length/literals_per_histogram + 1
   378  	var histograms []histogramDistance
   379  	if num_histograms > max_histograms {
   380  		num_histograms = max_histograms
   381  	}
   382  
   383  	if length == 0 {
   384  		split.num_types = 1
   385  		return
   386  	} else if length < kMinLengthForBlockSplitting {
   387  		brotli_ensure_capacity_uint8_t(&split.types, &split.types_alloc_size, split.num_blocks+1)
   388  		brotli_ensure_capacity_uint32_t(&split.lengths, &split.lengths_alloc_size, split.num_blocks+1)
   389  		split.num_types = 1
   390  		split.types[split.num_blocks] = 0
   391  		split.lengths[split.num_blocks] = uint32(length)
   392  		split.num_blocks++
   393  		return
   394  	}
   395  
   396  	histograms = make([]histogramDistance, num_histograms)
   397  
   398  	/* Find good entropy codes. */
   399  	initialEntropyCodesDistance(data, length, sampling_stride_length, num_histograms, histograms)
   400  
   401  	refineEntropyCodesDistance(data, length, sampling_stride_length, num_histograms, histograms)
   402  	{
   403  		var block_ids []byte = make([]byte, length)
   404  		var num_blocks uint = 0
   405  		var bitmaplen uint = (num_histograms + 7) >> 3
   406  		var insert_cost []float64 = make([]float64, (data_size * num_histograms))
   407  		var cost []float64 = make([]float64, num_histograms)
   408  		var switch_signal []byte = make([]byte, (length * bitmaplen))
   409  		var new_id []uint16 = make([]uint16, num_histograms)
   410  		var iters uint
   411  		if params.quality < hqZopflificationQuality {
   412  			iters = 3
   413  		} else {
   414  			iters = 10
   415  		}
   416  		/* Find a good path through literals with the good entropy codes. */
   417  
   418  		var i uint
   419  		for i = 0; i < iters; i++ {
   420  			num_blocks = findBlocksDistance(data, length, block_switch_cost, num_histograms, histograms, insert_cost, cost, switch_signal, block_ids)
   421  			num_histograms = remapBlockIdsDistance(block_ids, length, new_id, num_histograms)
   422  			buildBlockHistogramsDistance(data, length, block_ids, num_histograms, histograms)
   423  		}
   424  
   425  		insert_cost = nil
   426  		cost = nil
   427  		switch_signal = nil
   428  		new_id = nil
   429  		histograms = nil
   430  		clusterBlocksDistance(data, length, num_blocks, block_ids, split)
   431  		block_ids = nil
   432  	}
   433  }