vitess.io/vitess@v0.16.2/go/mysql/collations/unicode.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package collations
    18  
    19  import (
    20  	"bytes"
    21  	"math"
    22  	"math/bits"
    23  
    24  	"vitess.io/vitess/go/mysql/collations/internal/charset"
    25  )
    26  
// Collation_unicode_general_ci is a case-insensitive collation that compares
// text codepoint by codepoint using the per-codepoint weights supplied by its
// UnicaseInfo table.
type Collation_unicode_general_ci struct {
	// id is returned by ID and also seeds the value computed by Hash.
	id ID
	// name is returned by Name.
	name string
	// unicase supplies unicodeSort, which maps a decoded codepoint to the
	// weight used for comparing, hashing and building weight strings.
	unicase *UnicaseInfo
	// charset decodes the input bytes into codepoints.
	charset charset.Charset
}
    33  
// Init is a no-op: this collation needs no setup beyond its struct fields.
// NOTE(review): presumably present to satisfy a shared collation interface — confirm.
func (c *Collation_unicode_general_ci) Init() {}
    35  
// ID returns the identifier stored in this collation.
func (c *Collation_unicode_general_ci) ID() ID {
	return c.id
}
    39  
// Name returns the name stored in this collation.
func (c *Collation_unicode_general_ci) Name() string {
	return c.name
}
    43  
// Charset returns the character set this collation uses to decode its input.
func (c *Collation_unicode_general_ci) Charset() charset.Charset {
	return c.charset
}
    47  
// IsBinary always reports false: this collation compares weights derived from
// the codepoints, not the raw bytes.
func (c *Collation_unicode_general_ci) IsBinary() bool {
	return false
}
    51  
    52  func (c *Collation_unicode_general_ci) Collate(left, right []byte, isPrefix bool) int {
    53  	unicaseInfo := c.unicase
    54  	cs := c.charset
    55  
    56  	for len(left) > 0 && len(right) > 0 {
    57  		l, lWidth := cs.DecodeRune(left)
    58  		r, rWidth := cs.DecodeRune(right)
    59  
    60  		if (l == charset.RuneError && lWidth < 3) || (r == charset.RuneError && rWidth < 3) {
    61  			return bytes.Compare(left, right)
    62  		}
    63  
    64  		lRune := unicaseInfo.unicodeSort(l)
    65  		rRune := unicaseInfo.unicodeSort(r)
    66  
    67  		if lRune > rRune {
    68  			return 1
    69  		} else if lRune < rRune {
    70  			return -1
    71  		}
    72  
    73  		left = left[lWidth:]
    74  		right = right[rWidth:]
    75  	}
    76  	if isPrefix {
    77  		return len(right)
    78  	}
    79  	return len(left) - len(right)
    80  }
    81  
    82  func (c *Collation_unicode_general_ci) WeightString(dst, src []byte, numCodepoints int) []byte {
    83  	unicaseInfo := c.unicase
    84  	cs := c.charset
    85  
    86  	if numCodepoints == 0 || numCodepoints == PadToMax {
    87  		for {
    88  			r, width := cs.DecodeRune(src)
    89  			if r == charset.RuneError && width < 3 {
    90  				break
    91  			}
    92  
    93  			src = src[width:]
    94  			sorted := unicaseInfo.unicodeSort(r)
    95  			dst = append(dst, byte(sorted>>8), byte(sorted))
    96  		}
    97  
    98  		if numCodepoints == PadToMax {
    99  			for len(dst)+1 < cap(dst) {
   100  				dst = append(dst, 0x00, 0x20)
   101  			}
   102  			if len(dst) < cap(dst) {
   103  				dst = append(dst, 0x00)
   104  			}
   105  		}
   106  	} else {
   107  		for numCodepoints > 0 {
   108  			r, width := cs.DecodeRune(src)
   109  			if r == charset.RuneError && width < 3 {
   110  				break
   111  			}
   112  
   113  			src = src[width:]
   114  			sorted := unicaseInfo.unicodeSort(r)
   115  			dst = append(dst, byte(sorted>>8), byte(sorted))
   116  			numCodepoints--
   117  		}
   118  		for numCodepoints > 0 {
   119  			dst = append(dst, 0x00, 0x20)
   120  			numCodepoints--
   121  		}
   122  	}
   123  
   124  	return dst
   125  }
   126  
   127  func (c *Collation_unicode_general_ci) Hash(src []byte, numCodepoints int) HashCode {
   128  	unicaseInfo := c.unicase
   129  	cs := c.charset
   130  
   131  	var hash = uintptr(c.id)
   132  	var left = numCodepoints
   133  	if left == 0 {
   134  		left = math.MaxInt32
   135  	}
   136  
   137  	for left > 0 {
   138  		r, width := cs.DecodeRune(src)
   139  		if r == charset.RuneError && width < 3 {
   140  			break
   141  		}
   142  		src = src[width:]
   143  		hash = memhash16(bits.ReverseBytes16(uint16(unicaseInfo.unicodeSort(r))), hash)
   144  		left--
   145  	}
   146  
   147  	if numCodepoints > 0 {
   148  		for left > 0 {
   149  			hash = memhash16(bits.ReverseBytes16(0x0020), hash)
   150  			left--
   151  		}
   152  	}
   153  	return hash
   154  }
   155  
   156  func (c *Collation_unicode_general_ci) WeightStringLen(numBytes int) int {
   157  	return ((numBytes + 3) / 4) * 2
   158  }
   159  
   160  func (c *Collation_unicode_general_ci) Wildcard(pat []byte, matchOne rune, matchMany rune, escape rune) WildcardPattern {
   161  	var sort = c.unicase.unicodeSort
   162  	var equals = func(a, b rune) bool {
   163  		return sort(a) == sort(b)
   164  	}
   165  	return newUnicodeWildcardMatcher(c.charset, equals, c.Collate, pat, matchOne, matchMany, escape)
   166  }
   167  
// Collation_unicode_bin is a binary collation: comparison works on the raw
// bytes, while weight strings and hashes are built from the decoded codepoint
// values themselves.
type Collation_unicode_bin struct {
	// id is returned by ID and also seeds the value computed by Hash.
	id ID
	// name is returned by Name.
	name string
	// charset decodes the input bytes and reports whether supplementary
	// characters are supported, which selects the 2-byte or 3-byte weight form.
	charset charset.Charset
}
   173  
// Init is a no-op: this collation needs no setup beyond its struct fields.
// NOTE(review): presumably present to satisfy a shared collation interface — confirm.
func (c *Collation_unicode_bin) Init() {}
   175  
// ID returns the identifier stored in this collation.
func (c *Collation_unicode_bin) ID() ID {
	return c.id
}
   179  
// Name returns the name stored in this collation.
func (c *Collation_unicode_bin) Name() string {
	return c.name
}
   183  
// Charset returns the character set this collation uses to decode its input.
func (c *Collation_unicode_bin) Charset() charset.Charset {
	return c.charset
}
   187  
// IsBinary always reports true for this collation.
func (c *Collation_unicode_bin) IsBinary() bool {
	return true
}
   191  
// Collate compares left and right byte by byte by delegating to
// collationBinary; isPrefix is forwarded as its rightPrefix argument.
func (c *Collation_unicode_bin) Collate(left, right []byte, isPrefix bool) int {
	return collationBinary(left, right, isPrefix)
}
   195  
   196  func (c *Collation_unicode_bin) WeightString(dst, src []byte, numCodepoints int) []byte {
   197  	if c.charset.SupportsSupplementaryChars() {
   198  		return c.weightStringUnicode(dst, src, numCodepoints)
   199  	}
   200  	return c.weightStringBMP(dst, src, numCodepoints)
   201  }
   202  
   203  func (c *Collation_unicode_bin) weightStringBMP(dst, src []byte, numCodepoints int) []byte {
   204  	cs := c.charset
   205  	if numCodepoints == 0 || numCodepoints == PadToMax {
   206  		for {
   207  			r, width := cs.DecodeRune(src)
   208  			if r == charset.RuneError && width < 3 {
   209  				break
   210  			}
   211  			src = src[width:]
   212  			dst = append(dst, byte(r>>8), byte(r))
   213  		}
   214  
   215  		if numCodepoints == PadToMax {
   216  			for len(dst)+1 < cap(dst) {
   217  				dst = append(dst, 0x00, 0x20)
   218  			}
   219  			if len(dst) < cap(dst) {
   220  				dst = append(dst, 0x00)
   221  			}
   222  		}
   223  	} else {
   224  		for numCodepoints > 0 {
   225  			r, width := cs.DecodeRune(src)
   226  			if r == charset.RuneError && width < 3 {
   227  				break
   228  			}
   229  			src = src[width:]
   230  			dst = append(dst, byte(r>>8), byte(r))
   231  			numCodepoints--
   232  		}
   233  		for numCodepoints > 0 {
   234  			dst = append(dst, 0x00, 0x20)
   235  			numCodepoints--
   236  		}
   237  	}
   238  
   239  	return dst
   240  }
   241  
   242  func (c *Collation_unicode_bin) weightStringUnicode(dst, src []byte, numCodepoints int) []byte {
   243  	cs := c.charset
   244  	if numCodepoints == 0 || numCodepoints == PadToMax {
   245  		for {
   246  			r, width := cs.DecodeRune(src)
   247  			if r == charset.RuneError && width < 3 {
   248  				break
   249  			}
   250  
   251  			src = src[width:]
   252  			dst = append(dst, byte((r>>16)&0xFF), byte((r>>8)&0xFF), byte(r&0xFF))
   253  		}
   254  
   255  		if numCodepoints == PadToMax {
   256  			for len(dst)+2 < cap(dst) {
   257  				dst = append(dst, 0x00, 0x00, 0x20)
   258  			}
   259  			switch cap(dst) - len(dst) {
   260  			case 0:
   261  			case 1:
   262  				dst = append(dst, 0x00)
   263  			case 2:
   264  				dst = append(dst, 0x00, 0x00)
   265  			default:
   266  				panic("unreachable")
   267  			}
   268  		}
   269  	} else {
   270  		for numCodepoints > 0 {
   271  			r, width := cs.DecodeRune(src)
   272  			if r == charset.RuneError && width < 3 {
   273  				break
   274  			}
   275  
   276  			src = src[width:]
   277  			dst = append(dst, byte((r>>16)&0xFF), byte((r>>8)&0xFF), byte(r&0xFF))
   278  			numCodepoints--
   279  		}
   280  		for numCodepoints > 0 {
   281  			dst = append(dst, 0x00, 0x00, 0x20)
   282  			numCodepoints--
   283  		}
   284  	}
   285  
   286  	return dst
   287  }
   288  
   289  func (c *Collation_unicode_bin) Hash(src []byte, numCodepoints int) HashCode {
   290  	if c.charset.SupportsSupplementaryChars() {
   291  		return c.hashUnicode(src, numCodepoints)
   292  	}
   293  	return c.hashBMP(src, numCodepoints)
   294  }
   295  
   296  func (c *Collation_unicode_bin) hashUnicode(src []byte, numCodepoints int) uintptr {
   297  	cs := c.charset
   298  
   299  	var hash = uintptr(c.id)
   300  	var left = numCodepoints
   301  	if left == 0 {
   302  		left = math.MaxInt32
   303  	}
   304  	for left > 0 {
   305  		r, width := cs.DecodeRune(src)
   306  		if r == charset.RuneError && width < 3 {
   307  			break
   308  		}
   309  		src = src[width:]
   310  		hash = memhash32(bits.ReverseBytes32(uint32(r)), hash)
   311  		left--
   312  	}
   313  	if numCodepoints > 0 {
   314  		for left > 0 {
   315  			hash = memhash32(bits.ReverseBytes32(0x20), hash)
   316  			left--
   317  		}
   318  	}
   319  	return hash
   320  }
   321  
   322  func (c *Collation_unicode_bin) hashBMP(src []byte, numCodepoints int) uintptr {
   323  	cs := c.charset
   324  
   325  	var hash = uintptr(c.id)
   326  	var left = numCodepoints
   327  	if left == 0 {
   328  		left = math.MaxInt32
   329  	}
   330  	for left > 0 {
   331  		r, width := cs.DecodeRune(src)
   332  		if r == charset.RuneError && width < 3 {
   333  			break
   334  		}
   335  		src = src[width:]
   336  		hash = memhash16(bits.ReverseBytes16(uint16(r)), hash)
   337  		left--
   338  	}
   339  	if numCodepoints > 0 {
   340  		for left > 0 {
   341  			hash = memhash16(bits.ReverseBytes16(0x20), hash)
   342  			left--
   343  		}
   344  	}
   345  	return hash
   346  }
   347  
   348  func (c *Collation_unicode_bin) WeightStringLen(numBytes int) int {
   349  	return ((numBytes + 3) / 4) * 3
   350  }
   351  
   352  func (c *Collation_unicode_bin) Wildcard(pat []byte, matchOne rune, matchMany rune, escape rune) WildcardPattern {
   353  	equals := func(a, b rune) bool {
   354  		return a == b
   355  	}
   356  	return newUnicodeWildcardMatcher(c.charset, equals, c.Collate, pat, matchOne, matchMany, escape)
   357  }
   358  
   359  func collationBinary(left, right []byte, rightPrefix bool) int {
   360  	minLen := minInt(len(left), len(right))
   361  	if diff := bytes.Compare(left[:minLen], right[:minLen]); diff != 0 {
   362  		return diff
   363  	}
   364  	if rightPrefix {
   365  		left = left[:minLen]
   366  	}
   367  	return len(left) - len(right)
   368  }