vitess.io/vitess@v0.16.2/go/mysql/collations/collation.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package collations
    18  
    19  import (
    20  	"math"
    21  
    22  	"vitess.io/vitess/go/mysql/collations/internal/charset"
    23  )
    24  
    25  // CaseAwareCollation implements lowercase and uppercase conventions for collations.
    26  type CaseAwareCollation interface {
    27  	Collation
    28  	ToUpper(dst []byte, src []byte) []byte
    29  	ToLower(dst []byte, src []byte) []byte
    30  }
    31  
    32  // ID is a numeric identifier for a collation. These identifiers are defined by MySQL, not by Vitess.
    33  type ID uint16
    34  
    35  // Unknown is the default ID for an unknown collation.
    36  const Unknown ID = 0
    37  
    38  // Collation implements a MySQL-compatible collation. It defines how to compare
    39  // for sorting order and equality two strings with the same encoding.
    40  type Collation interface {
    41  	// Init initializes the internal state for the collation the first time it is used
    42  	Init()
    43  
    44  	// ID returns the numerical identifier for this collation. This is the same
    45  	// value that is returned by MySQL in a query's headers to identify the collation
    46  	// for a given column
    47  	ID() ID
    48  
    49  	// Name is the full name of this collation, in the form of "ENCODING_LANG_SENSITIVITY"
    50  	Name() string
    51  
    52  	// Collate compares two strings using this collation. `left` and `right` must be the
    53  	// two strings encoded in the proper encoding for this collation. If `isPrefix` is true,
    54  	// the function instead behaves equivalently to `strings.HasPrefix(left, right)`, but
    55  	// being collation-aware.
    56  	// It returns a numeric value like a normal comparison function: <0 if left < right,
    57  	// 0 if left == right, >0 if left > right
    58  	Collate(left, right []byte, isPrefix bool) int
    59  
    60  	// WeightString returns a weight string for the given `src` string. A weight string
    61  	// is a binary representation of the weights for the given string, that can be
    62  	// compared byte-wise to return identical results to collating this string.
    63  	//
    64  	// This means:
    65  	//		bytes.Compare(WeightString(left), WeightString(right)) == Collate(left, right)
    66  	//
    67  	// The semantics of this API have been carefully designed to match MySQL's behavior
    68  	// in its `strnxfrm` API. Most notably, the `numCodepoints` argument implies different
    69  	// behaviors depending on the collation's padding mode:
    70  	//
    71  	// - For collations that pad WITH SPACE (this is, all legacy collations in MySQL except
    72  	//	for the newly introduced UCA v9.0.0 utf8mb4 collations in MySQL 8.0), `numCodepoints`
    73  	// 	can have the following values:
    74  	//
    75  	//		- if `numCodepoints` is any integer greater than zero, this treats the `src` string
    76  	//		as if it were in a `CHAR(numCodepoints)` column in MySQL, meaning that the resulting
    77  	//		weight string will be padded with the weight for the SPACE character until it becomes
    78  	//		wide enough to fill the `CHAR` column. This is necessary to perform weight comparisons
    79  	//		in fixed-`CHAR` columns. If `numCodepoints` is smaller than the actual amount of
    80  	//		codepoints stored in `src`, the result is unspecified.
    81  	//
    82  	//		- if `numCodepoints` is zero, this is equivalent to `numCodepoints = RuneCount(src)`,
    83  	//		meaning that the resulting weight string will have no padding at the end: it'll only have
    84  	//		the weight values for the exact amount of codepoints contained in `src`. This is the
    85  	//		behavior required to sort `VARCHAR` columns.
    86  	//
    87  	//		- if `numCodepoints` is the special constant PadToMax, then the `dst` slice must be
    88  	//		pre-allocated to a zero-length slice with enough capacity to hold the complete weight
    89  	//		string, and any remaining capacity in `dst` will be filled by the weights for the
    90  	//		padding character, repeatedly. This is a special flag used by MySQL when performing
    91  	//		filesorts, where all the sorting keys must have identical sizes, even for `VARCHAR`
    92  	//		columns.
    93  	//
    94  	//	- For collations that have NO PAD (this is, the newly introduced UCA v9.0.0 utf8mb4 collations
    95  	//	in MySQL 8.0), `numCodepoints` can only have the special constant `PadToMax`, which will make
    96  	//	the weight string padding equivalent to a PAD SPACE collation (as explained in the previous
    97  	//	section). All other values for `numCodepoints` are ignored, because NO PAD collations always
    98  	//	return the weights for the codepoints in their strings, with no further padding at the end.
    99  	//
   100  	// The resulting weight string is written to `dst`, which can be pre-allocated to
   101  	// WeightStringLen() bytes to prevent growing the slice. `dst` can also be nil, in which
   102  	// case it will grow dynamically. If `numCodepoints` has the special PadToMax value explained
   103  	// earlier, `dst` MUST be pre-allocated to the target size or the function will return an
   104  	// empty slice.
   105  	WeightString(dst, src []byte, numCodepoints int) []byte
   106  
   107  	// WeightStringLen returns a size (in bytes) that would fit any weight strings for a string
   108  	// with `numCodepoints` using this collation. Note that this is a higher bound for the size
   109  	// of the string, and in practice weight strings can be significantly smaller than the
   110  	// returned value.
   111  	WeightStringLen(numCodepoints int) int
   112  
   113  	// Hash returns a 32 or 64 bit identifier (depending on the platform) that uniquely identifies
   114  	// the given string based on this collation. It is functionally equivalent to calling WeightString
   115  	// and then hashing the result.
   116  	//
   117  	// Consequently, if the hashes for two strings are different, then the two strings are considered
   118  	// different according to this collation. If the hashes for two strings are equal, the two strings
   119  	// may or may not be considered equal according to this collation, because hashes can collide unlike
   120  	// weight strings.
   121  	//
   122  	// The numCodepoints argument has the same behavior as in WeightString: if this collation uses PAD SPACE,
   123  	// the hash will interpret the source string as if it were stored in a `CHAR(n)` column. If the value of
   124  	// numCodepoints is 0, this is equivalent to setting `numCodepoints = RuneCount(src)`.
   125  	// For collations with NO PAD, the numCodepoint argument is ignored.
   126  	Hash(src []byte, numCodepoints int) HashCode
   127  
   128  	// Wildcard returns a matcher for the given wildcard pattern. The matcher can be used to repeatedly
   129  	// test different strings to check if they match the pattern. The pattern must be a traditional wildcard
   130  	// pattern, which may contain the provided special characters for matching one character or several characters.
   131  	// The provided `escape` character will be used as an escape sequence in front of the other special characters.
   132  	//
   133  	// This method is fully collation aware; the matching will be performed according to the underlying collation.
   134  	// I.e. if this is a case-insensitive collation, matching will be case-insensitive.
   135  	//
   136  	// The returned WildcardPattern is always valid, but if the provided special characters do not exist in this
   137  	// collation's repertoire, the returned pattern will not match any strings. Likewise, if the provided pattern
   138  	// has invalid syntax, the returned pattern will not match any strings.
   139  	//
   140  	// If the provided special characters are 0, the defaults to parse an SQL 'LIKE' statement will be used.
   141  	// This is, '_' for matching one character, '%' for matching many and '\\' for escape.
   142  	//
   143  	// This method can also be used for Shell-like matching with '?', '*' and '\\' as their respective special
   144  	// characters.
   145  	Wildcard(pat []byte, matchOne, matchMany, escape rune) WildcardPattern
   146  
   147  	// Charset returns the Charset with which this collation is encoded
   148  	Charset() charset.Charset
   149  
   150  	// IsBinary returns whether this collation is a binary collation
   151  	IsBinary() bool
   152  }
   153  
   154  type HashCode = uintptr
   155  
   156  // WildcardPattern is a matcher for a wildcard pattern, constructed from a given collation
   157  type WildcardPattern interface {
   158  	// Match returns whether the given string matches this pattern
   159  	Match(in []byte) bool
   160  }
   161  
   162  const PadToMax = math.MaxInt32
   163  
   164  func minInt(i1, i2 int) int {
   165  	if i1 < i2 {
   166  		return i1
   167  	}
   168  	return i2
   169  }
   170  
   171  var globalAllCollations = make(map[ID]Collation)
   172  
   173  func register(c Collation) {
   174  	if _, found := globalAllCollations[c.ID()]; found {
   175  		panic("duplicated collation registered")
   176  	}
   177  	globalAllCollations[c.ID()] = c
   178  }
   179  
   180  // Slice returns the substring in `input[from:to]`, where `from` and `to`
   181  // are collation-aware character indices instead of bytes.
   182  func Slice(collation Collation, input []byte, from, to int) []byte {
   183  	return charset.Slice(collation.Charset(), input, from, to)
   184  }
   185  
   186  // Validate returns whether the given `input` is properly encoded with the
   187  // character set for the given collation.
   188  func Validate(collation Collation, input []byte) bool {
   189  	return charset.Validate(collation.Charset(), input)
   190  }
   191  
   192  // Convert converts the bytes in `src`, which are encoded in `srcCollation`'s charset,
   193  // into a byte slice encoded in `dstCollation`'s charset. The resulting byte slice is
   194  // appended to `dst` and returned.
   195  func Convert(dst []byte, dstCollation Collation, src []byte, srcCollation Collation) ([]byte, error) {
   196  	return charset.Convert(dst, dstCollation.Charset(), src, srcCollation.Charset())
   197  }
   198  
   199  // Length returns the number of codepoints in the input based on the given collation
   200  func Length(collation Collation, input []byte) int {
   201  	return charset.Length(collation.Charset(), input)
   202  }