vitess.io/vitess@v0.16.2/go/mysql/collations/collation.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package collations 18 19 import ( 20 "math" 21 22 "vitess.io/vitess/go/mysql/collations/internal/charset" 23 ) 24 25 // CaseAwareCollation implements lowercase and uppercase conventions for collations. 26 type CaseAwareCollation interface { 27 Collation 28 ToUpper(dst []byte, src []byte) []byte 29 ToLower(dst []byte, src []byte) []byte 30 } 31 32 // ID is a numeric identifier for a collation. These identifiers are defined by MySQL, not by Vitess. 33 type ID uint16 34 35 // Unknown is the default ID for an unknown collation. 36 const Unknown ID = 0 37 38 // Collation implements a MySQL-compatible collation. It defines how to compare 39 // for sorting order and equality two strings with the same encoding. 40 type Collation interface { 41 // Init initializes the internal state for the collation the first time it is used 42 Init() 43 44 // ID returns the numerical identifier for this collation. This is the same 45 // value that is returned by MySQL in a query's headers to identify the collation 46 // for a given column 47 ID() ID 48 49 // Name is the full name of this collation, in the form of "ENCODING_LANG_SENSITIVITY" 50 Name() string 51 52 // Collate compares two strings using this collation. `left` and `right` must be the 53 // two strings encoded in the proper encoding for this collation. If `isPrefix` is true, 54 // the function instead behaves equivalently to `strings.HasPrefix(left, right)`, but 55 // being collation-aware. 56 // It returns a numeric value like a normal comparison function: <0 if left < right, 57 // 0 if left == right, >0 if left > right 58 Collate(left, right []byte, isPrefix bool) int 59 60 // WeightString returns a weight string for the given `src` string. A weight string 61 // is a binary representation of the weights for the given string, that can be 62 // compared byte-wise to return identical results to collating this string. 63 // 64 // This means: 65 // bytes.Compare(WeightString(left), WeightString(right)) == Collate(left, right) 66 // 67 // The semantics of this API have been carefully designed to match MySQL's behavior 68 // in its `strnxfrm` API. Most notably, the `numCodepoints` argument implies different 69 // behaviors depending on the collation's padding mode: 70 // 71 // - For collations that pad WITH SPACE (this is, all legacy collations in MySQL except 72 // for the newly introduced UCA v9.0.0 utf8mb4 collations in MySQL 8.0), `numCodepoints` 73 // can have the following values: 74 // 75 // - if `numCodepoints` is any integer greater than zero, this treats the `src` string 76 // as if it were in a `CHAR(numCodepoints)` column in MySQL, meaning that the resulting 77 // weight string will be padded with the weight for the SPACE character until it becomes 78 // wide enough to fill the `CHAR` column. This is necessary to perform weight comparisons 79 // in fixed-`CHAR` columns. If `numCodepoints` is smaller than the actual amount of 80 // codepoints stored in `src`, the result is unspecified. 81 // 82 // - if `numCodepoints` is zero, this is equivalent to `numCodepoints = RuneCount(src)`, 83 // meaning that the resulting weight string will have no padding at the end: it'll only have 84 // the weight values for the exact amount of codepoints contained in `src`. This is the 85 // behavior required to sort `VARCHAR` columns. 86 // 87 // - if `numCodepoints` is the special constant PadToMax, then the `dst` slice must be 88 // pre-allocated to a zero-length slice with enough capacity to hold the complete weight 89 // string, and any remaining capacity in `dst` will be filled by the weights for the 90 // padding character, repeatedly. This is a special flag used by MySQL when performing 91 // filesorts, where all the sorting keys must have identical sizes, even for `VARCHAR` 92 // columns. 93 // 94 // - For collations that have NO PAD (this is, the newly introduced UCA v9.0.0 utf8mb4 collations 95 // in MySQL 8.0), `numCodepoints` can only have the special constant `PadToMax`, which will make 96 // the weight string padding equivalent to a PAD SPACE collation (as explained in the previous 97 // section). All other values for `numCodepoints` are ignored, because NO PAD collations always 98 // return the weights for the codepoints in their strings, with no further padding at the end. 99 // 100 // The resulting weight string is written to `dst`, which can be pre-allocated to 101 // WeightStringLen() bytes to prevent growing the slice. `dst` can also be nil, in which 102 // case it will grow dynamically. If `numCodepoints` has the special PadToMax value explained 103 // earlier, `dst` MUST be pre-allocated to the target size or the function will return an 104 // empty slice. 105 WeightString(dst, src []byte, numCodepoints int) []byte 106 107 // WeightStringLen returns a size (in bytes) that would fit any weight strings for a string 108 // with `numCodepoints` using this collation. Note that this is a higher bound for the size 109 // of the string, and in practice weight strings can be significantly smaller than the 110 // returned value. 111 WeightStringLen(numCodepoints int) int 112 113 // Hash returns a 32 or 64 bit identifier (depending on the platform) that uniquely identifies 114 // the given string based on this collation. It is functionally equivalent to calling WeightString 115 // and then hashing the result. 116 // 117 // Consequently, if the hashes for two strings are different, then the two strings are considered 118 // different according to this collation. If the hashes for two strings are equal, the two strings 119 // may or may not be considered equal according to this collation, because hashes can collide unlike 120 // weight strings. 121 // 122 // The numCodepoints argument has the same behavior as in WeightString: if this collation uses PAD SPACE, 123 // the hash will interpret the source string as if it were stored in a `CHAR(n)` column. If the value of 124 // numCodepoints is 0, this is equivalent to setting `numCodepoints = RuneCount(src)`. 125 // For collations with NO PAD, the numCodepoint argument is ignored. 126 Hash(src []byte, numCodepoints int) HashCode 127 128 // Wildcard returns a matcher for the given wildcard pattern. The matcher can be used to repeatedly 129 // test different strings to check if they match the pattern. The pattern must be a traditional wildcard 130 // pattern, which may contain the provided special characters for matching one character or several characters. 131 // The provided `escape` character will be used as an escape sequence in front of the other special characters. 132 // 133 // This method is fully collation aware; the matching will be performed according to the underlying collation. 134 // I.e. if this is a case-insensitive collation, matching will be case-insensitive. 135 // 136 // The returned WildcardPattern is always valid, but if the provided special characters do not exist in this 137 // collation's repertoire, the returned pattern will not match any strings. Likewise, if the provided pattern 138 // has invalid syntax, the returned pattern will not match any strings. 139 // 140 // If the provided special characters are 0, the defaults to parse an SQL 'LIKE' statement will be used. 141 // This is, '_' for matching one character, '%' for matching many and '\\' for escape. 142 // 143 // This method can also be used for Shell-like matching with '?', '*' and '\\' as their respective special 144 // characters. 145 Wildcard(pat []byte, matchOne, matchMany, escape rune) WildcardPattern 146 147 // Charset returns the Charset with which this collation is encoded 148 Charset() charset.Charset 149 150 // IsBinary returns whether this collation is a binary collation 151 IsBinary() bool 152 } 153 154 type HashCode = uintptr 155 156 // WildcardPattern is a matcher for a wildcard pattern, constructed from a given collation 157 type WildcardPattern interface { 158 // Match returns whether the given string matches this pattern 159 Match(in []byte) bool 160 } 161 162 const PadToMax = math.MaxInt32 163 164 func minInt(i1, i2 int) int { 165 if i1 < i2 { 166 return i1 167 } 168 return i2 169 } 170 171 var globalAllCollations = make(map[ID]Collation) 172 173 func register(c Collation) { 174 if _, found := globalAllCollations[c.ID()]; found { 175 panic("duplicated collation registered") 176 } 177 globalAllCollations[c.ID()] = c 178 } 179 180 // Slice returns the substring in `input[from:to]`, where `from` and `to` 181 // are collation-aware character indices instead of bytes. 182 func Slice(collation Collation, input []byte, from, to int) []byte { 183 return charset.Slice(collation.Charset(), input, from, to) 184 } 185 186 // Validate returns whether the given `input` is properly encoded with the 187 // character set for the given collation. 188 func Validate(collation Collation, input []byte) bool { 189 return charset.Validate(collation.Charset(), input) 190 } 191 192 // Convert converts the bytes in `src`, which are encoded in `srcCollation`'s charset, 193 // into a byte slice encoded in `dstCollation`'s charset. The resulting byte slice is 194 // appended to `dst` and returned. 195 func Convert(dst []byte, dstCollation Collation, src []byte, srcCollation Collation) ([]byte, error) { 196 return charset.Convert(dst, dstCollation.Charset(), src, srcCollation.Charset()) 197 } 198 199 // Length returns the number of codepoints in the input based on the given collation 200 func Length(collation Collation, input []byte) int { 201 return charset.Length(collation.Charset(), input) 202 }