vitess.io/vitess@v0.16.2/go/mysql/collations/coercion.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package collations 18 19 import ( 20 "fmt" 21 "unsafe" 22 23 "vitess.io/vitess/go/mysql/collations/internal/charset" 24 ) 25 26 func init() { 27 if unsafe.Sizeof(TypedCollation{}) != 4 { 28 panic("TypedCollation should fit in an int32") 29 } 30 } 31 32 // Coercibility is a numeric value that represents the precedence of a collation 33 // when applied to a SQL expression. When trying to coerce the collations 34 // of two different expressions so that they can be compared, the expression 35 // with the lowest coercibility value will win and its collation will be forced 36 // upon the other expression. 37 // 38 // The rules for assigning a Coercibility value to an expression are as follows: 39 // 40 // - An explicit COLLATE clause has a coercibility of 0 (not coercible at all). 41 // - The concatenation of two strings with different collations has a coercibility of 1. 42 // - The collation of a column or a stored routine parameter or local variable has a coercibility of 2. 43 // - A “system constant” (the string returned by functions such as USER() or VERSION()) has a coercibility of 3. 44 // - The collation of a literal has a coercibility of 4. 45 // - The collation of a numeric or temporal value has a coercibility of 5. 46 // - NULL or an expression that is derived from NULL has a coercibility of 6. 
//
// According to the MySQL documentation, Coercibility is an actual word of the English
// language, although the Vitess maintainers disagree with this assessment.
//
// See: https://dev.mysql.com/doc/refman/8.0/en/charset-collation-coercibility.html
type Coercibility byte

const (
	// CoerceExplicit (0) is the coercibility of an explicit COLLATE clause.
	CoerceExplicit Coercibility = iota
	// CoerceNone (1) is the coercibility of the concatenation of two strings
	// with different collations.
	CoerceNone
	// CoerceImplicit (2) is the coercibility of a column, a stored routine
	// parameter or a local variable.
	CoerceImplicit
	// CoerceSysconst (3) is the coercibility of a "system constant".
	CoerceSysconst
	// CoerceCoercible (4) is the coercibility of a literal.
	CoerceCoercible
	// CoerceNumeric (5) is the coercibility of a numeric or temporal value.
	CoerceNumeric
	// CoerceIgnorable (6) is the coercibility of NULL and of expressions
	// derived from NULL.
	CoerceIgnorable
)

// String returns the upper-case name of the coercibility level.
// It panics when ci is not one of the Coerce* constants.
func (ci Coercibility) String() string {
	switch ci {
	case CoerceExplicit:
		return "EXPLICIT"
	case CoerceNone:
		return "NONE"
	case CoerceImplicit:
		return "IMPLICIT"
	case CoerceSysconst:
		return "SYSCONST"
	case CoerceCoercible:
		return "COERCIBLE"
	case CoerceNumeric:
		return "NUMERIC"
	case CoerceIgnorable:
		return "IGNORABLE"
	}
	panic("invalid Coercibility value")
}

// Repertoire is a constant that defines the collection of characters in an expression.
// MySQL only distinguishes between an ASCII repertoire (i.e. an expression where all
// the contained codepoints are < 128), or a Unicode repertoire (an expression that
// can contain any possible codepoint).
//
// See: https://dev.mysql.com/doc/refman/8.0/en/charset-repertoire.html
type Repertoire byte

const (
	// RepertoireASCII (0) marks an expression that only contains codepoints < 128.
	RepertoireASCII Repertoire = iota
	// RepertoireUnicode (1) marks an expression that can contain any codepoint.
	RepertoireUnicode
)

// Coercion is a function that transcodes its `in` argument into a specific
// character set. The `dst` argument will be used as the destination of the
// coerced bytes, but it can be nil.
type Coercion func(dst, in []byte) ([]byte, error)

// TypedCollation is the Collation of a SQL expression, including its coercibility
// and repertoire.
105 type TypedCollation struct { 106 Collation ID 107 Coercibility Coercibility 108 Repertoire Repertoire 109 } 110 111 func (tc TypedCollation) Valid() bool { 112 return tc.Collation != Unknown 113 } 114 115 func checkCompatibleCollations( 116 left Collation, leftCoercibility Coercibility, leftRepertoire Repertoire, 117 right Collation, rightCoercibility Coercibility, rightRepertoire Repertoire, 118 ) bool { 119 leftCS := left.Charset() 120 rightCS := right.Charset() 121 122 switch leftCS.(type) { 123 case charset.Charset_utf8mb4: 124 if leftCoercibility <= rightCoercibility { 125 return true 126 } 127 128 case charset.Charset_utf32: 129 switch { 130 case leftCoercibility < rightCoercibility: 131 return true 132 case leftCoercibility == rightCoercibility: 133 if !charset.IsUnicode(rightCS) { 134 return true 135 } 136 if !left.IsBinary() { 137 return true 138 } 139 } 140 141 case charset.Charset_utf8mb3, charset.Charset_ucs2, charset.Charset_utf16, charset.Charset_utf16le: 142 switch { 143 case leftCoercibility < rightCoercibility: 144 return true 145 case leftCoercibility == rightCoercibility: 146 if !charset.IsUnicode(rightCS) { 147 return true 148 } 149 } 150 } 151 152 if rightRepertoire == RepertoireASCII { 153 switch { 154 case leftCoercibility < rightCoercibility: 155 return true 156 case leftCoercibility == rightCoercibility: 157 if leftRepertoire == RepertoireUnicode { 158 return true 159 } 160 } 161 } 162 163 return false 164 } 165 166 // CoercionOptions is used to configure how aggressive the algorithm can be 167 // when merging two different collations by transcoding them. 168 type CoercionOptions struct { 169 // ConvertToSuperset allows merging two different collations as long 170 // as the charset of one of them is a strict superset of the other. In 171 // order to operate on the two expressions, one of them will need to 172 // be transcoded. 
This transcoding will always be safe because the string 173 // with the smallest repertoire will be transcoded to its superset, which 174 // cannot fail. 175 ConvertToSuperset bool 176 177 // ConvertWithCoercion allows merging two different collations by forcing 178 // a coercion as long as the coercibility of the two sides is lax enough. 179 // This will force a transcoding of one of the expressions even if their 180 // respective charsets are not a strict superset, so the resulting transcoding 181 // CAN fail depending on the content of their strings. 182 ConvertWithCoercion bool 183 } 184 185 // MergeCollations returns a Coercion function for a pair of TypedCollation based 186 // on their coercibility. 187 // 188 // The function takes the typed collations for the two sides of a text operation 189 // (namely, a comparison or concatenation of two textual expressions). These typed 190 // collations includes the actual collation for the expression on each size, their 191 // coercibility values (see: Coercibility) and their respective repertoires, 192 // and returns the target collation (i.e. the collation into which the two expressions 193 // must be coerced, and a Coercion function. The Coercion function can be called repeatedly 194 // with the different values for the two expressions and will transcode either 195 // the left-hand or right-hand value to the appropriate charset so it can be 196 // collated against the other value. 197 // 198 // If the collations for both sides of the expressions are the same, the returned 199 // Coercion function will be a no-op. Likewise, if the two collations are not the same, 200 // but they are compatible and have the same charset, the Coercion function will also 201 // be a no-op. 202 // 203 // If the collations for both sides of the expression are not compatible, an error 204 // will be returned and the returned TypedCollation and Coercion will be nil. 
205 func (env *Environment) MergeCollations(left, right TypedCollation, opt CoercionOptions) (TypedCollation, Coercion, Coercion, error) { 206 leftColl := env.LookupByID(left.Collation) 207 rightColl := env.LookupByID(right.Collation) 208 if leftColl == nil || rightColl == nil { 209 return TypedCollation{}, nil, nil, fmt.Errorf("unsupported TypeCollationID: %v / %v", left.Collation, right.Collation) 210 } 211 212 leftCS := leftColl.Charset() 213 rightCS := rightColl.Charset() 214 215 if left.Coercibility == CoerceExplicit && right.Coercibility == CoerceExplicit { 216 if left.Collation != right.Collation { 217 goto cannotCoerce 218 } 219 } 220 221 if leftCS.Name() == rightCS.Name() { 222 switch { 223 case left.Coercibility < right.Coercibility: 224 left.Repertoire |= right.Repertoire 225 return left, nil, nil, nil 226 227 case left.Coercibility > right.Coercibility: 228 right.Repertoire |= left.Repertoire 229 return right, nil, nil, nil 230 231 case left.Collation == right.Collation: 232 left.Repertoire |= right.Repertoire 233 return left, nil, nil, nil 234 } 235 236 if left.Coercibility == CoerceExplicit { 237 goto cannotCoerce 238 } 239 240 leftCsBin := leftColl.IsBinary() 241 rightCsBin := rightColl.IsBinary() 242 243 switch { 244 case leftCsBin && rightCsBin: 245 left.Coercibility = CoerceNone 246 return left, nil, nil, nil 247 248 case leftCsBin: 249 return left, nil, nil, nil 250 251 case rightCsBin: 252 return right, nil, nil, nil 253 } 254 255 defaults := env.byCharset[leftCS.Name()] 256 return TypedCollation{ 257 Collation: defaults.Binary.ID(), 258 Coercibility: CoerceNone, 259 Repertoire: left.Repertoire | right.Repertoire, 260 }, nil, nil, nil 261 } 262 263 if _, leftIsBinary := leftColl.(*Collation_binary); leftIsBinary { 264 if left.Coercibility <= right.Coercibility { 265 return left, nil, nil, nil 266 } 267 goto coerceToRight 268 } 269 if _, rightIsBinary := rightColl.(*Collation_binary); rightIsBinary { 270 if left.Coercibility >= right.Coercibility 
{ 271 return right, nil, nil, nil 272 } 273 goto coerceToLeft 274 } 275 276 if opt.ConvertToSuperset { 277 if checkCompatibleCollations(leftColl, left.Coercibility, left.Repertoire, rightColl, right.Coercibility, right.Repertoire) { 278 goto coerceToLeft 279 } 280 if checkCompatibleCollations(rightColl, right.Coercibility, right.Repertoire, leftColl, left.Coercibility, left.Repertoire) { 281 goto coerceToRight 282 } 283 } 284 285 if opt.ConvertWithCoercion { 286 if left.Coercibility < right.Coercibility && right.Coercibility > CoerceImplicit { 287 goto coerceToLeft 288 } 289 if right.Coercibility < left.Coercibility && left.Coercibility > CoerceImplicit { 290 goto coerceToRight 291 } 292 } 293 294 cannotCoerce: 295 return TypedCollation{}, nil, nil, fmt.Errorf("Illegal mix of collations (%s,%s) and (%s,%s)", 296 leftColl.Name(), left.Coercibility, rightColl.Name(), right.Coercibility) 297 298 coerceToLeft: 299 return left, nil, 300 func(dst, in []byte) ([]byte, error) { 301 return charset.Convert(dst, leftCS, in, rightCS) 302 }, nil 303 304 coerceToRight: 305 return right, 306 func(dst, in []byte) ([]byte, error) { 307 return charset.Convert(dst, rightCS, in, leftCS) 308 }, nil, nil 309 } 310 311 func (env *Environment) EnsureCollate(fromID, toID ID) error { 312 // these two lookups should never fail 313 from := env.LookupByID(fromID) 314 to := env.LookupByID(toID) 315 if from.Charset().Name() != to.Charset().Name() { 316 return fmt.Errorf("COLLATION '%s' is not valid for CHARACTER SET '%s'", to.Name(), from.Charset().Name()) 317 } 318 return nil 319 }