github.com/andybalholm/brotli@v1.0.6/cluster_distance.go (about)

     1  package brotli
     2  
     3  import "math"
     4  
     5  /* Copyright 2013 Google Inc. All Rights Reserved.
     6  
     7     Distributed under MIT license.
     8     See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
     9  */
    10  
    11  /* Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
    12     it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue. */
    13  func compareAndPushToQueueDistance(out []histogramDistance, cluster_size []uint32, idx1 uint32, idx2 uint32, max_num_pairs uint, pairs []histogramPair, num_pairs *uint) {
    14  	var is_good_pair bool = false
    15  	var p histogramPair
    16  	p.idx2 = 0
    17  	p.idx1 = p.idx2
    18  	p.cost_combo = 0
    19  	p.cost_diff = p.cost_combo
    20  	if idx1 == idx2 {
    21  		return
    22  	}
    23  
    24  	if idx2 < idx1 {
    25  		var t uint32 = idx2
    26  		idx2 = idx1
    27  		idx1 = t
    28  	}
    29  
    30  	p.idx1 = idx1
    31  	p.idx2 = idx2
    32  	p.cost_diff = 0.5 * clusterCostDiff(uint(cluster_size[idx1]), uint(cluster_size[idx2]))
    33  	p.cost_diff -= out[idx1].bit_cost_
    34  	p.cost_diff -= out[idx2].bit_cost_
    35  
    36  	if out[idx1].total_count_ == 0 {
    37  		p.cost_combo = out[idx2].bit_cost_
    38  		is_good_pair = true
    39  	} else if out[idx2].total_count_ == 0 {
    40  		p.cost_combo = out[idx1].bit_cost_
    41  		is_good_pair = true
    42  	} else {
    43  		var threshold float64
    44  		if *num_pairs == 0 {
    45  			threshold = 1e99
    46  		} else {
    47  			threshold = brotli_max_double(0.0, pairs[0].cost_diff)
    48  		}
    49  		var combo histogramDistance = out[idx1]
    50  		var cost_combo float64
    51  		histogramAddHistogramDistance(&combo, &out[idx2])
    52  		cost_combo = populationCostDistance(&combo)
    53  		if cost_combo < threshold-p.cost_diff {
    54  			p.cost_combo = cost_combo
    55  			is_good_pair = true
    56  		}
    57  	}
    58  
    59  	if is_good_pair {
    60  		p.cost_diff += p.cost_combo
    61  		if *num_pairs > 0 && histogramPairIsLess(&pairs[0], &p) {
    62  			/* Replace the top of the queue if needed. */
    63  			if *num_pairs < max_num_pairs {
    64  				pairs[*num_pairs] = pairs[0]
    65  				(*num_pairs)++
    66  			}
    67  
    68  			pairs[0] = p
    69  		} else if *num_pairs < max_num_pairs {
    70  			pairs[*num_pairs] = p
    71  			(*num_pairs)++
    72  		}
    73  	}
    74  }
    75  
    76  func histogramCombineDistance(out []histogramDistance, cluster_size []uint32, symbols []uint32, clusters []uint32, pairs []histogramPair, num_clusters uint, symbols_size uint, max_clusters uint, max_num_pairs uint) uint {
    77  	var cost_diff_threshold float64 = 0.0
    78  	var min_cluster_size uint = 1
    79  	var num_pairs uint = 0
    80  	{
    81  		/* We maintain a vector of histogram pairs, with the property that the pair
    82  		   with the maximum bit cost reduction is the first. */
    83  		var idx1 uint
    84  		for idx1 = 0; idx1 < num_clusters; idx1++ {
    85  			var idx2 uint
    86  			for idx2 = idx1 + 1; idx2 < num_clusters; idx2++ {
    87  				compareAndPushToQueueDistance(out, cluster_size, clusters[idx1], clusters[idx2], max_num_pairs, pairs[0:], &num_pairs)
    88  			}
    89  		}
    90  	}
    91  
    92  	for num_clusters > min_cluster_size {
    93  		var best_idx1 uint32
    94  		var best_idx2 uint32
    95  		var i uint
    96  		if pairs[0].cost_diff >= cost_diff_threshold {
    97  			cost_diff_threshold = 1e99
    98  			min_cluster_size = max_clusters
    99  			continue
   100  		}
   101  
   102  		/* Take the best pair from the top of heap. */
   103  		best_idx1 = pairs[0].idx1
   104  
   105  		best_idx2 = pairs[0].idx2
   106  		histogramAddHistogramDistance(&out[best_idx1], &out[best_idx2])
   107  		out[best_idx1].bit_cost_ = pairs[0].cost_combo
   108  		cluster_size[best_idx1] += cluster_size[best_idx2]
   109  		for i = 0; i < symbols_size; i++ {
   110  			if symbols[i] == best_idx2 {
   111  				symbols[i] = best_idx1
   112  			}
   113  		}
   114  
   115  		for i = 0; i < num_clusters; i++ {
   116  			if clusters[i] == best_idx2 {
   117  				copy(clusters[i:], clusters[i+1:][:num_clusters-i-1])
   118  				break
   119  			}
   120  		}
   121  
   122  		num_clusters--
   123  		{
   124  			/* Remove pairs intersecting the just combined best pair. */
   125  			var copy_to_idx uint = 0
   126  			for i = 0; i < num_pairs; i++ {
   127  				var p *histogramPair = &pairs[i]
   128  				if p.idx1 == best_idx1 || p.idx2 == best_idx1 || p.idx1 == best_idx2 || p.idx2 == best_idx2 {
   129  					/* Remove invalid pair from the queue. */
   130  					continue
   131  				}
   132  
   133  				if histogramPairIsLess(&pairs[0], p) {
   134  					/* Replace the top of the queue if needed. */
   135  					var front histogramPair = pairs[0]
   136  					pairs[0] = *p
   137  					pairs[copy_to_idx] = front
   138  				} else {
   139  					pairs[copy_to_idx] = *p
   140  				}
   141  
   142  				copy_to_idx++
   143  			}
   144  
   145  			num_pairs = copy_to_idx
   146  		}
   147  
   148  		/* Push new pairs formed with the combined histogram to the heap. */
   149  		for i = 0; i < num_clusters; i++ {
   150  			compareAndPushToQueueDistance(out, cluster_size, best_idx1, clusters[i], max_num_pairs, pairs[0:], &num_pairs)
   151  		}
   152  	}
   153  
   154  	return num_clusters
   155  }
   156  
   157  /* What is the bit cost of moving histogram from cur_symbol to candidate. */
   158  func histogramBitCostDistanceDistance(histogram *histogramDistance, candidate *histogramDistance) float64 {
   159  	if histogram.total_count_ == 0 {
   160  		return 0.0
   161  	} else {
   162  		var tmp histogramDistance = *histogram
   163  		histogramAddHistogramDistance(&tmp, candidate)
   164  		return populationCostDistance(&tmp) - candidate.bit_cost_
   165  	}
   166  }
   167  
   168  /* Find the best 'out' histogram for each of the 'in' histograms.
   169     When called, clusters[0..num_clusters) contains the unique values from
   170     symbols[0..in_size), but this property is not preserved in this function.
   171     Note: we assume that out[]->bit_cost_ is already up-to-date. */
   172  func histogramRemapDistance(in []histogramDistance, in_size uint, clusters []uint32, num_clusters uint, out []histogramDistance, symbols []uint32) {
   173  	var i uint
   174  	for i = 0; i < in_size; i++ {
   175  		var best_out uint32
   176  		if i == 0 {
   177  			best_out = symbols[0]
   178  		} else {
   179  			best_out = symbols[i-1]
   180  		}
   181  		var best_bits float64 = histogramBitCostDistanceDistance(&in[i], &out[best_out])
   182  		var j uint
   183  		for j = 0; j < num_clusters; j++ {
   184  			var cur_bits float64 = histogramBitCostDistanceDistance(&in[i], &out[clusters[j]])
   185  			if cur_bits < best_bits {
   186  				best_bits = cur_bits
   187  				best_out = clusters[j]
   188  			}
   189  		}
   190  
   191  		symbols[i] = best_out
   192  	}
   193  
   194  	/* Recompute each out based on raw and symbols. */
   195  	for i = 0; i < num_clusters; i++ {
   196  		histogramClearDistance(&out[clusters[i]])
   197  	}
   198  
   199  	for i = 0; i < in_size; i++ {
   200  		histogramAddHistogramDistance(&out[symbols[i]], &in[i])
   201  	}
   202  }
   203  
   204  /* Reorders elements of the out[0..length) array and changes values in
   205     symbols[0..length) array in the following way:
   206       * when called, symbols[] contains indexes into out[], and has N unique
   207         values (possibly N < length)
   208       * on return, symbols'[i] = f(symbols[i]) and
   209                    out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
   210         where f is a bijection between the range of symbols[] and [0..N), and
   211         the first occurrences of values in symbols'[i] come in consecutive
   212         increasing order.
   213     Returns N, the number of unique values in symbols[]. */
   214  
   215  var histogramReindexDistance_kInvalidIndex uint32 = math.MaxUint32
   216  
   217  func histogramReindexDistance(out []histogramDistance, symbols []uint32, length uint) uint {
   218  	var new_index []uint32 = make([]uint32, length)
   219  	var next_index uint32
   220  	var tmp []histogramDistance
   221  	var i uint
   222  	for i = 0; i < length; i++ {
   223  		new_index[i] = histogramReindexDistance_kInvalidIndex
   224  	}
   225  
   226  	next_index = 0
   227  	for i = 0; i < length; i++ {
   228  		if new_index[symbols[i]] == histogramReindexDistance_kInvalidIndex {
   229  			new_index[symbols[i]] = next_index
   230  			next_index++
   231  		}
   232  	}
   233  
   234  	/* TODO: by using idea of "cycle-sort" we can avoid allocation of
   235  	   tmp and reduce the number of copying by the factor of 2. */
   236  	tmp = make([]histogramDistance, next_index)
   237  
   238  	next_index = 0
   239  	for i = 0; i < length; i++ {
   240  		if new_index[symbols[i]] == next_index {
   241  			tmp[next_index] = out[symbols[i]]
   242  			next_index++
   243  		}
   244  
   245  		symbols[i] = new_index[symbols[i]]
   246  	}
   247  
   248  	new_index = nil
   249  	for i = 0; uint32(i) < next_index; i++ {
   250  		out[i] = tmp[i]
   251  	}
   252  
   253  	tmp = nil
   254  	return uint(next_index)
   255  }
   256  
   257  func clusterHistogramsDistance(in []histogramDistance, in_size uint, max_histograms uint, out []histogramDistance, out_size *uint, histogram_symbols []uint32) {
   258  	var cluster_size []uint32 = make([]uint32, in_size)
   259  	var clusters []uint32 = make([]uint32, in_size)
   260  	var num_clusters uint = 0
   261  	var max_input_histograms uint = 64
   262  	var pairs_capacity uint = max_input_histograms * max_input_histograms / 2
   263  	var pairs []histogramPair = make([]histogramPair, (pairs_capacity + 1))
   264  	var i uint
   265  
   266  	/* For the first pass of clustering, we allow all pairs. */
   267  	for i = 0; i < in_size; i++ {
   268  		cluster_size[i] = 1
   269  	}
   270  
   271  	for i = 0; i < in_size; i++ {
   272  		out[i] = in[i]
   273  		out[i].bit_cost_ = populationCostDistance(&in[i])
   274  		histogram_symbols[i] = uint32(i)
   275  	}
   276  
   277  	for i = 0; i < in_size; i += max_input_histograms {
   278  		var num_to_combine uint = brotli_min_size_t(in_size-i, max_input_histograms)
   279  		var num_new_clusters uint
   280  		var j uint
   281  		for j = 0; j < num_to_combine; j++ {
   282  			clusters[num_clusters+j] = uint32(i + j)
   283  		}
   284  
   285  		num_new_clusters = histogramCombineDistance(out, cluster_size, histogram_symbols[i:], clusters[num_clusters:], pairs, num_to_combine, num_to_combine, max_histograms, pairs_capacity)
   286  		num_clusters += num_new_clusters
   287  	}
   288  	{
   289  		/* For the second pass, we limit the total number of histogram pairs.
   290  		   After this limit is reached, we only keep searching for the best pair. */
   291  		var max_num_pairs uint = brotli_min_size_t(64*num_clusters, (num_clusters/2)*num_clusters)
   292  		if pairs_capacity < (max_num_pairs + 1) {
   293  			var _new_size uint
   294  			if pairs_capacity == 0 {
   295  				_new_size = max_num_pairs + 1
   296  			} else {
   297  				_new_size = pairs_capacity
   298  			}
   299  			var new_array []histogramPair
   300  			for _new_size < (max_num_pairs + 1) {
   301  				_new_size *= 2
   302  			}
   303  			new_array = make([]histogramPair, _new_size)
   304  			if pairs_capacity != 0 {
   305  				copy(new_array, pairs[:pairs_capacity])
   306  			}
   307  
   308  			pairs = new_array
   309  			pairs_capacity = _new_size
   310  		}
   311  
   312  		/* Collapse similar histograms. */
   313  		num_clusters = histogramCombineDistance(out, cluster_size, histogram_symbols, clusters, pairs, num_clusters, in_size, max_histograms, max_num_pairs)
   314  	}
   315  
   316  	pairs = nil
   317  	cluster_size = nil
   318  
   319  	/* Find the optimal map from original histograms to the final ones. */
   320  	histogramRemapDistance(in, in_size, clusters, num_clusters, out, histogram_symbols)
   321  
   322  	clusters = nil
   323  
   324  	/* Convert the context map to a canonical form. */
   325  	*out_size = histogramReindexDistance(out, histogram_symbols, in_size)
   326  }