github.com/andybalholm/brotli@v1.0.6/literal_cost.go (about)

     1  package brotli
     2  
     3  func utf8Position(last uint, c uint, clamp uint) uint {
     4  	if c < 128 {
     5  		return 0 /* Next one is the 'Byte 1' again. */
     6  	} else if c >= 192 { /* Next one is the 'Byte 2' of utf-8 encoding. */
     7  		return brotli_min_size_t(1, clamp)
     8  	} else {
     9  		/* Let's decide over the last byte if this ends the sequence. */
    10  		if last < 0xE0 {
    11  			return 0 /* Completed two or three byte coding. */ /* Next one is the 'Byte 3' of utf-8 encoding. */
    12  		} else {
    13  			return brotli_min_size_t(2, clamp)
    14  		}
    15  	}
    16  }
    17  
    18  func decideMultiByteStatsLevel(pos uint, len uint, mask uint, data []byte) uint {
    19  	var counts = [3]uint{0} /* should be 2, but 1 compresses better. */
    20  	var max_utf8 uint = 1
    21  	var last_c uint = 0
    22  	var i uint
    23  	for i = 0; i < len; i++ {
    24  		var c uint = uint(data[(pos+i)&mask])
    25  		counts[utf8Position(last_c, c, 2)]++
    26  		last_c = c
    27  	}
    28  
    29  	if counts[2] < 500 {
    30  		max_utf8 = 1
    31  	}
    32  
    33  	if counts[1]+counts[2] < 25 {
    34  		max_utf8 = 0
    35  	}
    36  
    37  	return max_utf8
    38  }
    39  
    40  func estimateBitCostsForLiteralsUTF8(pos uint, len uint, mask uint, data []byte, cost []float32) {
    41  	var max_utf8 uint = decideMultiByteStatsLevel(pos, uint(len), mask, data)
    42  	/* Bootstrap histograms. */
    43  	var histogram = [3][256]uint{[256]uint{0}}
    44  	var window_half uint = 495
    45  	var in_window uint = brotli_min_size_t(window_half, uint(len))
    46  	var in_window_utf8 = [3]uint{0}
    47  	/* max_utf8 is 0 (normal ASCII single byte modeling),
    48  	   1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). */
    49  
    50  	var i uint
    51  	{
    52  		var last_c uint = 0
    53  		var utf8_pos uint = 0
    54  		for i = 0; i < in_window; i++ {
    55  			var c uint = uint(data[(pos+i)&mask])
    56  			histogram[utf8_pos][c]++
    57  			in_window_utf8[utf8_pos]++
    58  			utf8_pos = utf8Position(last_c, c, max_utf8)
    59  			last_c = c
    60  		}
    61  	}
    62  
    63  	/* Compute bit costs with sliding window. */
    64  	for i = 0; i < len; i++ {
    65  		if i >= window_half {
    66  			var c uint
    67  			var last_c uint
    68  			if i < window_half+1 {
    69  				c = 0
    70  			} else {
    71  				c = uint(data[(pos+i-window_half-1)&mask])
    72  			}
    73  			if i < window_half+2 {
    74  				last_c = 0
    75  			} else {
    76  				last_c = uint(data[(pos+i-window_half-2)&mask])
    77  			}
    78  			/* Remove a byte in the past. */
    79  
    80  			var utf8_pos2 uint = utf8Position(last_c, c, max_utf8)
    81  			histogram[utf8_pos2][data[(pos+i-window_half)&mask]]--
    82  			in_window_utf8[utf8_pos2]--
    83  		}
    84  
    85  		if i+window_half < len {
    86  			var c uint = uint(data[(pos+i+window_half-1)&mask])
    87  			var last_c uint = uint(data[(pos+i+window_half-2)&mask])
    88  			/* Add a byte in the future. */
    89  
    90  			var utf8_pos2 uint = utf8Position(last_c, c, max_utf8)
    91  			histogram[utf8_pos2][data[(pos+i+window_half)&mask]]++
    92  			in_window_utf8[utf8_pos2]++
    93  		}
    94  		{
    95  			var c uint
    96  			var last_c uint
    97  			if i < 1 {
    98  				c = 0
    99  			} else {
   100  				c = uint(data[(pos+i-1)&mask])
   101  			}
   102  			if i < 2 {
   103  				last_c = 0
   104  			} else {
   105  				last_c = uint(data[(pos+i-2)&mask])
   106  			}
   107  			var utf8_pos uint = utf8Position(last_c, c, max_utf8)
   108  			var masked_pos uint = (pos + i) & mask
   109  			var histo uint = histogram[utf8_pos][data[masked_pos]]
   110  			var lit_cost float64
   111  			if histo == 0 {
   112  				histo = 1
   113  			}
   114  
   115  			lit_cost = fastLog2(in_window_utf8[utf8_pos]) - fastLog2(histo)
   116  			lit_cost += 0.02905
   117  			if lit_cost < 1.0 {
   118  				lit_cost *= 0.5
   119  				lit_cost += 0.5
   120  			}
   121  
   122  			/* Make the first bytes more expensive -- seems to help, not sure why.
   123  			   Perhaps because the entropy source is changing its properties
   124  			   rapidly in the beginning of the file, perhaps because the beginning
   125  			   of the data is a statistical "anomaly". */
   126  			if i < 2000 {
   127  				lit_cost += 0.7 - (float64(2000-i) / 2000.0 * 0.35)
   128  			}
   129  
   130  			cost[i] = float32(lit_cost)
   131  		}
   132  	}
   133  }
   134  
   135  func estimateBitCostsForLiterals(pos uint, len uint, mask uint, data []byte, cost []float32) {
   136  	if isMostlyUTF8(data, pos, mask, uint(len), kMinUTF8Ratio) {
   137  		estimateBitCostsForLiteralsUTF8(pos, uint(len), mask, data, cost)
   138  		return
   139  	} else {
   140  		var histogram = [256]uint{0}
   141  		var window_half uint = 2000
   142  		var in_window uint = brotli_min_size_t(window_half, uint(len))
   143  		var i uint
   144  		/* Bootstrap histogram. */
   145  		for i = 0; i < in_window; i++ {
   146  			histogram[data[(pos+i)&mask]]++
   147  		}
   148  
   149  		/* Compute bit costs with sliding window. */
   150  		for i = 0; i < len; i++ {
   151  			var histo uint
   152  			if i >= window_half {
   153  				/* Remove a byte in the past. */
   154  				histogram[data[(pos+i-window_half)&mask]]--
   155  
   156  				in_window--
   157  			}
   158  
   159  			if i+window_half < len {
   160  				/* Add a byte in the future. */
   161  				histogram[data[(pos+i+window_half)&mask]]++
   162  
   163  				in_window++
   164  			}
   165  
   166  			histo = histogram[data[(pos+i)&mask]]
   167  			if histo == 0 {
   168  				histo = 1
   169  			}
   170  			{
   171  				var lit_cost float64 = fastLog2(in_window) - fastLog2(histo)
   172  				lit_cost += 0.029
   173  				if lit_cost < 1.0 {
   174  					lit_cost *= 0.5
   175  					lit_cost += 0.5
   176  				}
   177  
   178  				cost[i] = float32(lit_cost)
   179  			}
   180  		}
   181  	}
   182  }