github.com/andybalholm/brotli@v1.0.6/literal_cost.go (about) 1 package brotli 2 3 func utf8Position(last uint, c uint, clamp uint) uint { 4 if c < 128 { 5 return 0 /* Next one is the 'Byte 1' again. */ 6 } else if c >= 192 { /* Next one is the 'Byte 2' of utf-8 encoding. */ 7 return brotli_min_size_t(1, clamp) 8 } else { 9 /* Let's decide over the last byte if this ends the sequence. */ 10 if last < 0xE0 { 11 return 0 /* Completed two or three byte coding. */ /* Next one is the 'Byte 3' of utf-8 encoding. */ 12 } else { 13 return brotli_min_size_t(2, clamp) 14 } 15 } 16 } 17 18 func decideMultiByteStatsLevel(pos uint, len uint, mask uint, data []byte) uint { 19 var counts = [3]uint{0} /* should be 2, but 1 compresses better. */ 20 var max_utf8 uint = 1 21 var last_c uint = 0 22 var i uint 23 for i = 0; i < len; i++ { 24 var c uint = uint(data[(pos+i)&mask]) 25 counts[utf8Position(last_c, c, 2)]++ 26 last_c = c 27 } 28 29 if counts[2] < 500 { 30 max_utf8 = 1 31 } 32 33 if counts[1]+counts[2] < 25 { 34 max_utf8 = 0 35 } 36 37 return max_utf8 38 } 39 40 func estimateBitCostsForLiteralsUTF8(pos uint, len uint, mask uint, data []byte, cost []float32) { 41 var max_utf8 uint = decideMultiByteStatsLevel(pos, uint(len), mask, data) 42 /* Bootstrap histograms. */ 43 var histogram = [3][256]uint{[256]uint{0}} 44 var window_half uint = 495 45 var in_window uint = brotli_min_size_t(window_half, uint(len)) 46 var in_window_utf8 = [3]uint{0} 47 /* max_utf8 is 0 (normal ASCII single byte modeling), 48 1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). */ 49 50 var i uint 51 { 52 var last_c uint = 0 53 var utf8_pos uint = 0 54 for i = 0; i < in_window; i++ { 55 var c uint = uint(data[(pos+i)&mask]) 56 histogram[utf8_pos][c]++ 57 in_window_utf8[utf8_pos]++ 58 utf8_pos = utf8Position(last_c, c, max_utf8) 59 last_c = c 60 } 61 } 62 63 /* Compute bit costs with sliding window. */ 64 for i = 0; i < len; i++ { 65 if i >= window_half { 66 var c uint 67 var last_c uint 68 if i < window_half+1 { 69 c = 0 70 } else { 71 c = uint(data[(pos+i-window_half-1)&mask]) 72 } 73 if i < window_half+2 { 74 last_c = 0 75 } else { 76 last_c = uint(data[(pos+i-window_half-2)&mask]) 77 } 78 /* Remove a byte in the past. */ 79 80 var utf8_pos2 uint = utf8Position(last_c, c, max_utf8) 81 histogram[utf8_pos2][data[(pos+i-window_half)&mask]]-- 82 in_window_utf8[utf8_pos2]-- 83 } 84 85 if i+window_half < len { 86 var c uint = uint(data[(pos+i+window_half-1)&mask]) 87 var last_c uint = uint(data[(pos+i+window_half-2)&mask]) 88 /* Add a byte in the future. */ 89 90 var utf8_pos2 uint = utf8Position(last_c, c, max_utf8) 91 histogram[utf8_pos2][data[(pos+i+window_half)&mask]]++ 92 in_window_utf8[utf8_pos2]++ 93 } 94 { 95 var c uint 96 var last_c uint 97 if i < 1 { 98 c = 0 99 } else { 100 c = uint(data[(pos+i-1)&mask]) 101 } 102 if i < 2 { 103 last_c = 0 104 } else { 105 last_c = uint(data[(pos+i-2)&mask]) 106 } 107 var utf8_pos uint = utf8Position(last_c, c, max_utf8) 108 var masked_pos uint = (pos + i) & mask 109 var histo uint = histogram[utf8_pos][data[masked_pos]] 110 var lit_cost float64 111 if histo == 0 { 112 histo = 1 113 } 114 115 lit_cost = fastLog2(in_window_utf8[utf8_pos]) - fastLog2(histo) 116 lit_cost += 0.02905 117 if lit_cost < 1.0 { 118 lit_cost *= 0.5 119 lit_cost += 0.5 120 } 121 122 /* Make the first bytes more expensive -- seems to help, not sure why. 123 Perhaps because the entropy source is changing its properties 124 rapidly in the beginning of the file, perhaps because the beginning 125 of the data is a statistical "anomaly". */ 126 if i < 2000 { 127 lit_cost += 0.7 - (float64(2000-i) / 2000.0 * 0.35) 128 } 129 130 cost[i] = float32(lit_cost) 131 } 132 } 133 } 134 135 func estimateBitCostsForLiterals(pos uint, len uint, mask uint, data []byte, cost []float32) { 136 if isMostlyUTF8(data, pos, mask, uint(len), kMinUTF8Ratio) { 137 estimateBitCostsForLiteralsUTF8(pos, uint(len), mask, data, cost) 138 return 139 } else { 140 var histogram = [256]uint{0} 141 var window_half uint = 2000 142 var in_window uint = brotli_min_size_t(window_half, uint(len)) 143 var i uint 144 /* Bootstrap histogram. */ 145 for i = 0; i < in_window; i++ { 146 histogram[data[(pos+i)&mask]]++ 147 } 148 149 /* Compute bit costs with sliding window. */ 150 for i = 0; i < len; i++ { 151 var histo uint 152 if i >= window_half { 153 /* Remove a byte in the past. */ 154 histogram[data[(pos+i-window_half)&mask]]-- 155 156 in_window-- 157 } 158 159 if i+window_half < len { 160 /* Add a byte in the future. */ 161 histogram[data[(pos+i+window_half)&mask]]++ 162 163 in_window++ 164 } 165 166 histo = histogram[data[(pos+i)&mask]] 167 if histo == 0 { 168 histo = 1 169 } 170 { 171 var lit_cost float64 = fastLog2(in_window) - fastLog2(histo) 172 lit_cost += 0.029 173 if lit_cost < 1.0 { 174 lit_cost *= 0.5 175 lit_cost += 0.5 176 } 177 178 cost[i] = float32(lit_cost) 179 } 180 } 181 } 182 }