vitess.io/vitess@v0.16.2/go/mysql/collations/internal/uca/iter_fast_900.go

/*
Copyright 2021 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package uca

import (
	"math/bits"
	"unicode/utf8"
	"unsafe"
)

type FastIterator900 struct {
	iterator900
	fastTable *[256]uint32
	unicode   int
}

func (it *FastIterator900) Done() {
	it.original = nil
	it.input = nil
	it.iterpool.Put(it)
}

func (it *FastIterator900) reset(input []byte) {
	it.fastTable = &fastweightTable_uca900_page000L0
	it.unicode = 0
	it.iterator900.reset(input)
}

func (it *FastIterator900) SkipLevel() int {
	it.codepoint.ce = 0
	it.level++
	it.resetForNextLevel()
	return it.level
}

const maxUnicodeBlocks = 3

// FastForward32 fast-forwards this iterator and the given it2 in parallel until
// there is a mismatch in their weights, and returns their difference.
// This function is similar to NextWeightBlock64 in that it only succeeds if the
// iterators are composed of (mostly) ASCII characters. See the docs for NextWeightBlock64
// for documentation on how these fast comparisons work.
func (it *FastIterator900) FastForward32(it2 *FastIterator900) int {
	// We use a heuristic to detect when we should stop using the FastForward32
	// iterator: every time we encounter a 4-byte block that is fully Unicode
	// (i.e. without any ASCII characters), we increase the `it.unicode` counter.
	// Encountering a block that is fully ASCII decreases the counter. If the
	// counter ever gets to 4, further calls to FastForward32 are disabled.
	if it.unicode > maxUnicodeBlocks || it.codepoint.ce != 0 || it2.codepoint.ce != 0 {
		return 0
	}

	p1 := it.input
	p2 := it2.input
	var w1, w2 uint16

	for len(p1) >= 4 && len(p2) >= 4 {
		dword1 := *(*uint32)(unsafe.Pointer(&p1[0]))
		dword2 := *(*uint32)(unsafe.Pointer(&p2[0]))
		nonascii := (dword1 | dword2) & 0x80808080

		if nonascii == 0 {
			if dword1 != dword2 {
				// Use the weight string fast tables for quick weight comparisons;
				// see (*FastIterator900).NextWeightBlock64 for a description of
				// the table format
				table := it.fastTable
				if w1, w2 = uint16(table[p1[0]]), uint16(table[p2[0]]); w1 != w2 {
					goto mismatch
				}
				if w1, w2 = uint16(table[p1[1]]), uint16(table[p2[1]]); w1 != w2 {
					goto mismatch
				}
				if w1, w2 = uint16(table[p1[2]]), uint16(table[p2[2]]); w1 != w2 {
					goto mismatch
				}
				if w1, w2 = uint16(table[p1[3]]), uint16(table[p2[3]]); w1 != w2 {
					goto mismatch
				}
			}
			p1 = p1[4:]
			p2 = p2[4:]
			it.unicode--
			continue
		} else if bits.OnesCount32(nonascii) == 4 {
			it.unicode++
		}
		break
	}
	it.input = p1
	it2.input = p2
	return 0

mismatch:
	// If either of the weights was an ignorable, this is not really a mismatch;
	// return 0 so we fall back to the slow path and increase `it.unicode`. Although
	// these are _not_ unicode codepoints, if we find too many ignorable ASCII
	// characters in an iterator we want to skip further calls to FastForward32
	// because they won't be able to optimize the comparisons at all
	if w1 == 0 || w2 == 0 {
		it.input = p1
		it2.input = p2
		it.unicode++
		return 0
	}
	// The weights must be byte-swapped before comparison because they're stored in big endian
	return int(bits.ReverseBytes16(w1)) - int(bits.ReverseBytes16(w2))
}

// NextWeightBlock64 takes a byte slice of 16 bytes and fills it with the next
// chunk of weights from this iterator. If the input slice is smaller than
// 16 bytes, the function will panic.
//
// The function returns the weights in Big Endian ordering: this is the
// same ordering that MySQL uses when generating weight strings, so the return
// of this function can be inserted directly into a weight string and the
// result will be compatible with MySQL. Likewise, the resulting slice
// can be compared byte-wise (bytes.Compare) to obtain a proper collation
// ordering against another string.
//
// Returns the number of bytes written to `dstbytes`. If 0, this iterator
// has been fully consumed.
//
// Implementation notes:
// This is a fast-path algorithm that can only work for UCA900 collations
// that do not have reorderings, contractions or any weight patches. The idea
// is detecting runs of 8 ASCII characters in a row, which are very frequent
// in most UTF8 code, particularly in English, and generating the weights
// for these 8 characters directly from an optimized table, instead of going
// through the whole Unicode Collation Algorithm. This is feasible because
// in UCA900, all characters in the ASCII range have either 0 or 1 weight
// triplets, so their weight can be calculated with a single lookup in a 128-entry
// table for each level (0, 1, 2).
func (it *FastIterator900) NextWeightBlock64(dstbytes []byte) int {
	// Ensure the destination slice has at least 16 bytes; this bounds check
	// removes all the other bound checks for the rest of the function.
	_ = dstbytes[15]

	// Unsafe cast the destination byte slice into a slice of uint16, so the
	// individual weights can be written directly to it.
	dst := (*[8]uint16)(unsafe.Pointer(&dstbytes[0]))
	p := it.input

	// The fast path works on 8-byte chunks from the original input.
	// If the underlying slow iterator is in the middle of processing the
	// weights for a codepoint, we cannot go through the fast path.
	if it.codepoint.ce == 0 && len(p) >= 8 {
		// Read 8 bytes from the input string. This would ideally be implemented
		// as a `binary.LittleEndian.Uint64` read, but the compiler doesn't seem to be
		// optimizing it properly, and it generates the individual byte shifts :(
		dword := *(*uint64)(unsafe.Pointer(&p[0]))

		// Check if any of the next 8 bytes have their highest bit set.
		// If they're all clear, these are 8 ASCII bytes which can go through
		// the fast path
		if dword&0x8080808080808080 == 0 {
			// Load the fast table from the iterator. There are three fast tables hardcoded
			// for this implementation, for ASCII levels 0, 1 and 2. The slow iterator replaces
			// the table stored on `it.fastTable` every time we skip a level.
			// Note that the table has 256 entries although only the first 128 are used. We
			// want a 256 entry table because it forces the Go compiler to disable all bound
			// checks when accessing the table IF our index variable is a byte (since a byte
			// cannot be larger than 255).
			table := it.fastTable

			// All ASCII codepoints (0 <= cp <= 127) have either 0 or 1 weights to yield.
			// This is a problem for our fast path, because 0-weights must NOT appear in the
			// resulting weight string. The codepoints with 0 weights are, however, exceedingly
			// rare (they're mostly non-print codepoints), so based on this we can choose to
			// perform either an optimistic or pessimistic optimization, which is toggled at
			// compile time with this flag. For now, we're going with optimistic because it
			// provides better results in realistic benchmarks.
			const optimisticFastWrites = true

			if optimisticFastWrites {
				// For the optimistic optimization, we're going to assume that none of the
				// ASCII codepoints in this chunk have zero weights, and check only once at
				// the end of the chunk if that was the case. If we found a zero-weight (a
				// rare occurrence), we discard the whole chunk and fall back to the slow
				// iterator, which handles zero weights just fine.
				// To perform the check for zero weights efficiently, we've designed fast tables
				// where every entry is 32 bits, even though the actual weights are in the
				// bottom 16 bits. The upper 16 bits contain 0x2 if this is a valid weight,
				// or are zeroed out for 0-weights.
				// Because of this, we can look up the fast table, directly insert the lowest
				// 16 bits as the weight, and then `and` the whole entry against a 32-bit mask
				// that starts as 0x20000. For any weights that are valid, that will leave the mask
				// with the same value, because only the higher bits will match, while any 0-weight
				// will fully clear the mask.
				// At the end of this block, we check if the mask has been cleared by any of the
				// writes, and if it has, we scrap this work (boo) and fall back to the slow iterator.
				var mask uint32 = 0x20000
				weight := table[p[0]]
				mask &= weight
				dst[0] = uint16(weight)

				weight = table[p[1]]
				mask &= weight
				dst[1] = uint16(weight)

				weight = table[p[2]]
				mask &= weight
				dst[2] = uint16(weight)

				weight = table[p[3]]
				mask &= weight
				dst[3] = uint16(weight)

				weight = table[p[4]]
				mask &= weight
				dst[4] = uint16(weight)

				weight = table[p[5]]
				mask &= weight
				dst[5] = uint16(weight)

				weight = table[p[6]]
				mask &= weight
				dst[6] = uint16(weight)

				weight = table[p[7]]
				mask &= weight
				dst[7] = uint16(weight)

				if mask != 0 {
					it.input = it.input[8:]
					return 16
				}
			} else {
				// For the pessimistic optimization, we're going to assume that any 8-byte chunk
				// can contain 0-weights (something rather rare in practice). We're writing
				// the lower 16 bits of the weight into the target buffer (just like in the optimistic
				// optimization), but then we're increasing our target buffer pointer by
				// the high 16 bits of the weight (weight >> 16). For valid weights, the high
				// bits will equal 0x2, which is exactly the offset we want to move our target
				// pointer so we can write the next weight afterwards, and for 0-weights, the
				// high bits will be 0x0, so the pointer will not advance and the next weight
				// we write will replace the 0-weight we've just written. This ensures that the
				// resulting byte output doesn't have any 0-weights in it, but it also causes
				// small stalls in the CPU because the writes are not necessarily linear and they
				// have an ordering dependency with the value we've loaded from the fast table.
				// Regardless of the stalls, this algorithm is obviously branch-less and very
				// efficient; it just happens that in real-world scenarios, the optimistic
				// approach is even faster because 0-weights are very rare in practice.
				// For now, this algorithm is disabled.
				dstptr := (*uint16)(unsafe.Pointer(&dstbytes[0]))
				weight := table[p[0]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[1]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[2]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[3]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[4]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[5]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[6]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[7]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				written := uintptr(unsafe.Pointer(dstptr)) - uintptr(unsafe.Pointer(&dstbytes[0]))
				if written != 0 {
					it.input = it.input[written>>1:]
					return int(written)
				}
			}
		}
	}

	// Slow path: just loop up to 8 times to fill the buffer and bail
	// early if we exhaust the iterator.
	for i := 0; i < 8; i++ {
		w, ok := it.Next()
		if !ok {
			return i * 2
		}
		dst[i] = bits.ReverseBytes16(w)
	}
	return 16
}

func (it *FastIterator900) resetForNextLevel() {
	it.input = it.original
	switch it.level {
	case 1:
		it.fastTable = &fastweightTable_uca900_page000L1
	case 2:
		it.fastTable = &fastweightTable_uca900_page000L2
	}
}

func (it *FastIterator900) Next() (uint16, bool) {
	for {
		if w, ok := it.codepoint.next(); ok {
			return w, true
		}

		cp, width := utf8.DecodeRune(it.input)
		if cp == utf8.RuneError && width < 3 {
			it.level++
			if it.level < it.maxLevel {
				it.resetForNextLevel()
				return 0, true
			}
			return 0, false
		}

		it.input = it.input[width:]
		it.codepoint.init(&it.iterator900, cp)
	}
}
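
// weightStringSketch is an illustrative helper, not part of the original
// iterator API: the doc comment on NextWeightBlock64 states that its output
// can be concatenated into a MySQL-compatible weight string and compared
// byte-wise (e.g. with bytes.Compare). This is a minimal sketch of how that
// could look, assuming `it` has already been reset with the input string;
// the function name and the append-based buffer are assumptions made for
// illustration only.
func weightStringSketch(it *FastIterator900) []byte {
	var out []byte
	var block [16]byte
	for {
		// Each call fills up to 16 bytes of big-endian weights; a return
		// of 0 means the iterator (all weight levels) has been consumed.
		n := it.NextWeightBlock64(block[:])
		if n == 0 {
			return out
		}
		out = append(out, block[:n]...)
	}
}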