vitess.io/vitess@v0.16.2/go/mysql/collations/internal/uca/iter_fast_900.go

/*
Copyright 2021 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package uca

import (
	"math/bits"
	"unicode/utf8"
	"unsafe"
)

type FastIterator900 struct {
	iterator900
	fastTable *[256]uint32
	unicode   int
}

func (it *FastIterator900) Done() {
	it.original = nil
	it.input = nil
	it.iterpool.Put(it)
}

func (it *FastIterator900) reset(input []byte) {
	it.fastTable = &fastweightTable_uca900_page000L0
	it.unicode = 0
	it.iterator900.reset(input)
}

func (it *FastIterator900) SkipLevel() int {
	it.codepoint.ce = 0
	it.level++
	it.resetForNextLevel()
	return it.level
}

const maxUnicodeBlocks = 3

// FastForward32 fast-forwards this iterator and the given it2 in parallel until
// there is a mismatch in their weights, and returns their difference.
// This function is similar to NextWeightBlock64 in that it only succeeds if the
// iterators are composed of (mostly) ASCII characters. See the docs for NextWeightBlock64
// for documentation on how these fast comparisons work.
func (it *FastIterator900) FastForward32(it2 *FastIterator900) int {
	// We use a heuristic to detect when we should stop using the FastForward32
	// iterator: every time we encounter a 4-byte block that is fully Unicode
	// (i.e. without any ASCII characters), we increase the `it.unicode` counter.
	// Encountering a block that is fully ASCII decreases the counter. If the
	// counter ever exceeds maxUnicodeBlocks (3), further calls to FastForward32
	// are disabled.
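	// As a hypothetical example, comparing "naïve" against "naive" stops at the
	// first block: the OR of the two dwords has only two high bits set (from the
	// 2-byte "ï" sequence), so the block neither increments nor decrements the
	// counter, while two all-CJK inputs would increment it on every call until
	// the fast path shuts off.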
	if it.unicode > maxUnicodeBlocks || it.codepoint.ce != 0 || it2.codepoint.ce != 0 {
		return 0
	}

	p1 := it.input
	p2 := it2.input
	var w1, w2 uint16

	for len(p1) >= 4 && len(p2) >= 4 {
		dword1 := *(*uint32)(unsafe.Pointer(&p1[0]))
		dword2 := *(*uint32)(unsafe.Pointer(&p2[0]))
		nonascii := (dword1 | dword2) & 0x80808080

		if nonascii == 0 {
			if dword1 != dword2 {
				// Use the weight string fast tables for quick weight comparisons;
				// see (*FastIterator900).NextWeightBlock64 for a description of
				// the table format
				table := it.fastTable
				if w1, w2 = uint16(table[p1[0]]), uint16(table[p2[0]]); w1 != w2 {
					goto mismatch
				}
				if w1, w2 = uint16(table[p1[1]]), uint16(table[p2[1]]); w1 != w2 {
					goto mismatch
				}
				if w1, w2 = uint16(table[p1[2]]), uint16(table[p2[2]]); w1 != w2 {
					goto mismatch
				}
				if w1, w2 = uint16(table[p1[3]]), uint16(table[p2[3]]); w1 != w2 {
					goto mismatch
				}
			}
			p1 = p1[4:]
			p2 = p2[4:]
			it.unicode--
			continue
		} else if bits.OnesCount32(nonascii) == 4 {
			it.unicode++
		}
		break
	}
	it.input = p1
	it2.input = p2
	return 0

mismatch:
	// If either of the weights was an ignorable, this is not really a mismatch;
	// return 0 so we fall back to the slow path and increase `it.unicode`. Although
	// these are _not_ Unicode codepoints, if we find too many ignorable ASCII
	// characters in an iterator, we want to skip further calls to FastForward32
	// because they won't be able to optimize the comparisons at all.
	if w1 == 0 || w2 == 0 {
		it.input = p1
		it2.input = p2
		it.unicode++
		return 0
	}
	// The weights must be byte-swapped before comparison because they are
	// stored in big-endian order.
	return int(bits.ReverseBytes16(w1)) - int(bits.ReverseBytes16(w2))
}
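
// weightCompareSketch is an illustrative sketch (hypothetical, not part of the
// package): weights are stored big-endian so that a byte-wise comparison of two
// weight strings yields collation order; a numeric comparison of two uint16
// weights loaded on a little-endian machine must therefore byte-swap them
// first. The weight values below are invented for illustration.
func weightCompareSketch() int {
	w1 := uint16(0x4A1C) // the big-endian bytes {0x1C, 0x4A} loaded as little-endian
	w2 := uint16(0x4B1C) // the big-endian bytes {0x1C, 0x4B} loaded as little-endian
	// ReverseBytes16 recovers the numeric weights 0x1C4A and 0x1C4B, so the
	// difference has the correct sign.
	return int(bits.ReverseBytes16(w1)) - int(bits.ReverseBytes16(w2))
}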

// NextWeightBlock64 takes a byte slice of 16 bytes and fills it with the next
// chunk of weights from this iterator. If the input slice is smaller than
// 16 bytes, the function will panic.
//
// The function returns the weights in Big Endian ordering: this is the
// same ordering that MySQL uses when generating weight strings, so the return
// of this function can be inserted directly into a weight string and the
// result will be compatible with MySQL. Likewise, the resulting slice
// can be compared byte-wise (bytes.Compare) to obtain a proper collation
// ordering against another string.
//
// Returns the number of bytes written to `dstbytes`. If 0, this iterator
// has been fully consumed.
//
// Implementation notes:
// This is a fast-path algorithm that can only work for UCA900 collations
// that do not have reorderings, contractions or any weight patches. The idea
// is to detect runs of 8 ASCII characters in a row, which are very frequent
// in most UTF-8 text, particularly in English, and to generate the weights
// for these 8 characters directly from an optimized table, instead of going
// through the whole Unicode Collation Algorithm. This is feasible because
// in UCA900, all characters in the ASCII range have either 0 or 1 weight
// triplets, so their weight can be calculated with a single lookup in a 128-entry
// table for each level (0, 1, 2).
func (it *FastIterator900) NextWeightBlock64(dstbytes []byte) int {
	// Ensure the destination slice has at least 16 bytes; this bounds check
	// removes all the other bound checks for the rest of the function.
	_ = dstbytes[15]

	// Unsafe cast the destination byte slice into a slice of uint16, so the
	// individual weights can be written directly to it.
	dst := (*[8]uint16)(unsafe.Pointer(&dstbytes[0]))
	p := it.input

	// The fast path works on 8-byte chunks from the original input.
	// If the underlying slow iterator is in the middle of processing the
	// weights for a codepoint, we cannot go through the fast path.
	if it.codepoint.ce == 0 && len(p) >= 8 {
		// Read 8 bytes from the input string. This would ideally be implemented
		// as a `binary.LittleEndian.Uint64` read, but the compiler doesn't seem to be
		// optimizing it properly, and it generates the individual byte shifts :(
		dword := *(*uint64)(unsafe.Pointer(&p[0]))

		// Check if any of the next 8 bytes have their highest bit set. If
		// they're all clear, these are 8 ASCII bytes which can go through
		// the fast path
		if dword&0x8080808080808080 == 0 {
			// Load the fast table from the iterator. There are three fast tables hardcoded
			// for this implementation, for ASCII levels 0, 1 and 2. The slow iterator replaces
			// the table stored on `it.fastTable` every time we skip a level.
			// Note that the table has 256 entries although only the first 128 are used. We
			// want a 256-entry table because it forces the Go compiler to disable all bound
			// checks when accessing the table IF our index variable is a byte (since a byte
			// cannot be larger than 255).
			table := it.fastTable

			// All ASCII codepoints (0 <= cp <= 127) have either 0 or 1 weights to yield.
			// This is a problem for our fast path, because 0-weights must NOT appear in the
			// resulting weight string. The codepoints with 0 weights are, however, exceedingly
			// rare (they're mostly non-print codepoints), so based on this we can choose to
			// perform either an optimistic or pessimistic optimization, which is toggled at
			// compile time with this flag. For now, we're going with optimistic because it
			// provides better results in realistic benchmarks.
			const optimisticFastWrites = true

			if optimisticFastWrites {
				// For the optimistic optimization, we're going to assume that none of the
				// ASCII codepoints in this chunk have zero weights, and check only once at
				// the end of the chunk if that was the case. If we found a zero weight (a
				// rare occurrence), we discard the whole chunk and fall back to the slow
				// iterator, which handles zero weights just fine.
				// To perform the check for zero weights efficiently, we've designed fast tables
				// where every entry is 32 bits, even though the actual weights are in the
				// bottom 16 bits. The upper 16 bits contain 0x2 if this is a valid weight,
				// or are zeroed out for 0-weights.
				// Because of this, we can look up from the fast table and directly insert the
				// lowest 16 bits as the weight, and then `and` the whole table entry against a
				// 32-bit mask that starts as 0x20000. For any valid weight, that leaves the mask
				// with the same value, because only the higher bits will match, while any 0-weight
				// will fully clear the mask.
				// At the end of this block, we check if the mask has been cleared by any of the
				// writes, and if it has, we scrap this work (boo) and fall back to the slow iterator.
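				// A worked example with hypothetical table entries: a valid entry
				// such as 0x21C47 leaves the mask intact (0x20000 & 0x21C47 ==
				// 0x20000), while a 0-weight entry of 0x0 clears it for the rest
				// of the chunk (0x20000 & 0x0 == 0).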
				var mask uint32 = 0x20000
				weight := table[p[0]]
				mask &= weight
				dst[0] = uint16(weight)

				weight = table[p[1]]
				mask &= weight
				dst[1] = uint16(weight)

				weight = table[p[2]]
				mask &= weight
				dst[2] = uint16(weight)

				weight = table[p[3]]
				mask &= weight
				dst[3] = uint16(weight)

				weight = table[p[4]]
				mask &= weight
				dst[4] = uint16(weight)

				weight = table[p[5]]
				mask &= weight
				dst[5] = uint16(weight)

				weight = table[p[6]]
				mask &= weight
				dst[6] = uint16(weight)

				weight = table[p[7]]
				mask &= weight
				dst[7] = uint16(weight)

				if mask != 0 {
					it.input = it.input[8:]
					return 16
				}
			} else {
				// For the pessimistic optimization, we're going to assume that any 8-byte chunk
				// can contain 0-weights (something rather rare in practice). We write
				// the lower 16 bits of the weight into the target buffer (just like in the
				// optimistic optimization), but then we advance our target buffer pointer by
				// the high 16 bits of the weight (weight >> 16). For valid weights, the high
				// bits will equal 0x2, which is exactly the offset we want to move our target
				// pointer so we can write the next weight afterwards, and for 0-weights, the
				// high bits will be 0x0, so the pointer will not advance and the next weight
				// we write will replace the 0-weight we've just written. This ensures that the
				// resulting byte output doesn't have any 0-weights in it, but it also causes
				// small stalls in the CPU because the writes are not necessarily linear and they
				// have an ordering dependency with the value we've loaded from the fast table.
				// Regardless of the stalls, this algorithm is branchless and very efficient;
				// it just happens that in real-world scenarios, the optimistic approach is
				// even faster because 0-weights are very rare in practice.
				// For now, this algorithm is disabled.
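				// A worked example with hypothetical table entries: after writing
				// the low 16 bits of a valid entry such as 0x21C47, dstptr advances
				// by 0x21C47>>16 = 2 bytes, while a 0-weight entry advances it by
				// 0x0>>16 = 0 bytes, so the next write overwrites the zero in place.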
				dstptr := (*uint16)(unsafe.Pointer(&dstbytes[0]))
				weight := table[p[0]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[1]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[2]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[3]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[4]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[5]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[6]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				weight = table[p[7]]
				*dstptr = uint16(weight)
				dstptr = (*uint16)(unsafe.Add(unsafe.Pointer(dstptr), weight>>16))

				written := uintptr(unsafe.Pointer(dstptr)) - uintptr(unsafe.Pointer(&dstbytes[0]))
				if written != 0 {
					it.input = it.input[written>>1:]
					return int(written)
				}
			}
		}
	}

	// Slow path: just loop up to 8 times to fill the buffer and bail
	// early if we exhaust the iterator.
	for i := 0; i < 8; i++ {
		w, ok := it.Next()
		if !ok {
			return i * 2
		}
		dst[i] = bits.ReverseBytes16(w)
	}
	return 16
}
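
// zeroWeightMaskSketch is a self-contained, hypothetical sketch of the
// optimistic fast path above: every valid entry in a fast table carries 0x2
// in its upper 16 bits, so AND-ing a running mask against each looked-up
// entry detects whether the chunk contained any zero weights. The table
// below is invented for illustration; the real tables are code-generated.
func zeroWeightMaskSketch(chunk [8]byte) (weights [8]uint16, ok bool) {
	var table [256]uint32
	for b := byte('a'); b <= 'z'; b++ {
		table[b] = 0x20000 | uint32(0x1C47+int(b-'a')) // hypothetical valid weights
	}
	// All other entries stay 0, modeling zero-weight (ignorable) codepoints.
	var mask uint32 = 0x20000
	for i, b := range chunk {
		w := table[b]
		mask &= w
		weights[i] = uint16(w)
	}
	// ok == false means the chunk held a zero weight and the caller should
	// discard the chunk and fall back to the slow iterator.
	return weights, mask != 0
}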

func (it *FastIterator900) resetForNextLevel() {
	it.input = it.original
	switch it.level {
	case 1:
		it.fastTable = &fastweightTable_uca900_page000L1
	case 2:
		it.fastTable = &fastweightTable_uca900_page000L2
	}
}

func (it *FastIterator900) Next() (uint16, bool) {
	for {
		if w, ok := it.codepoint.next(); ok {
			return w, true
		}

		cp, width := utf8.DecodeRune(it.input)
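		// utf8.DecodeRune returns RuneError with width 0 for an empty slice and
		// width 1 for invalid UTF-8; a genuine U+FFFD in the input decodes with
		// width 3, so `width < 3` distinguishes exhaustion (or broken bytes)
		// from a literal replacement character.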
		if cp == utf8.RuneError && width < 3 {
			it.level++
			if it.level < it.maxLevel {
				it.resetForNextLevel()
				return 0, true
			}
			return 0, false
		}

		it.input = it.input[width:]
		it.codepoint.init(&it.iterator900, cp)
	}
}
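
// drainWeightBlocks is an illustrative helper (hypothetical, not part of the
// package API): it shows how a caller could drain NextWeightBlock64 into a
// full weight string, 16 bytes at a time, stopping once the iterator reports
// that no more bytes were written.
func drainWeightBlocks(it *FastIterator900) []byte {
	var out []byte
	var block [16]byte
	for {
		n := it.NextWeightBlock64(block[:])
		if n == 0 {
			return out
		}
		out = append(out, block[:n]...)
	}
}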