vitess.io/vitess@v0.16.2/go/mysql/collations/coercion.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package collations
    18  
    19  import (
    20  	"fmt"
    21  	"unsafe"
    22  
    23  	"vitess.io/vitess/go/mysql/collations/internal/charset"
    24  )
    25  
    26  func init() {
    27  	if unsafe.Sizeof(TypedCollation{}) != 4 {
    28  		panic("TypedCollation should fit in an int32")
    29  	}
    30  }
    31  
    32  // Coercibility is a numeric value that represents the precedence of a collation
    33  // when applied to a SQL expression. When trying to coerce the collations
    34  // of two different expressions so that they can be compared, the expression
    35  // with the lowest coercibility value will win and its collation will be forced
    36  // upon the other expression.
    37  //
    38  // The rules for assigning a Coercibility value to an expression are as follows:
    39  //
    40  //   - An explicit COLLATE clause has a coercibility of 0 (not coercible at all).
    41  //   - The concatenation of two strings with different collations has a coercibility of 1.
    42  //   - The collation of a column or a stored routine parameter or local variable has a coercibility of 2.
    43  //   - A “system constant” (the string returned by functions such as USER() or VERSION()) has a coercibility of 3.
    44  //   - The collation of a literal has a coercibility of 4.
    45  //   - The collation of a numeric or temporal value has a coercibility of 5.
    46  //   - NULL or an expression that is derived from NULL has a coercibility of 6.
    47  //
    48  // According to the MySQL documentation, Coercibility is an actual word of the English
    49  // language, although the Vitess maintainers disagree with this assessment.
    50  //
    51  // See: https://dev.mysql.com/doc/refman/8.0/en/charset-collation-coercibility.html
    52  type Coercibility byte
    53  
    54  const (
    55  	CoerceExplicit Coercibility = iota
    56  	CoerceNone
    57  	CoerceImplicit
    58  	CoerceSysconst
    59  	CoerceCoercible
    60  	CoerceNumeric
    61  	CoerceIgnorable
    62  )
    63  
    64  func (ci Coercibility) String() string {
    65  	switch ci {
    66  	case 0:
    67  		return "EXPLICIT"
    68  	case 1:
    69  		return "NONE"
    70  	case 2:
    71  		return "IMPLICIT"
    72  	case 3:
    73  		return "SYSCONST"
    74  	case 4:
    75  		return "COERCIBLE"
    76  	case 5:
    77  		return "NUMERIC"
    78  	case 6:
    79  		return "IGNORABLE"
    80  	default:
    81  		panic("invalid Coercibility value")
    82  	}
    83  }
    84  
    85  // Repertoire is a constant that defines the collection of characters in an expression.
    86  // MySQL only distinguishes between an ASCII repertoire (i.e. an expression where all
    87  // the contained codepoints are < 128), or an Unicode repertoire (an expression that
    88  // can contain any possible codepoint).
    89  //
    90  // See: https://dev.mysql.com/doc/refman/8.0/en/charset-repertoire.html
    91  type Repertoire byte
    92  
    93  const (
    94  	RepertoireASCII Repertoire = iota
    95  	RepertoireUnicode
    96  )
    97  
// Coercion is a function that transforms its `in` argument into a specific
// character set and returns the transcoded bytes, or an error if the
// transformation fails. The `dst` argument will be used as the destination
// of the coerced argument, but it can be nil.
type Coercion func(dst, in []byte) ([]byte, error)
   102  
// TypedCollation is the Collation of a SQL expression, including its coercibility
// and repertoire.
//
// The struct is kept at exactly 4 bytes; the package's init function panics
// if its size ever changes.
type TypedCollation struct {
	// Collation is the ID of the collation of the expression.
	Collation ID
	// Coercibility is the precedence of the collation when it has to be
	// merged with the collation of another expression (see: Coercibility).
	Coercibility Coercibility
	// Repertoire is the set of characters the expression can contain
	// (see: Repertoire).
	Repertoire Repertoire
}
   110  
   111  func (tc TypedCollation) Valid() bool {
   112  	return tc.Collation != Unknown
   113  }
   114  
// checkCompatibleCollations reports whether the `left` collation can act as the
// target when merging it with the `right` collation. MergeCollations calls it
// once in each direction and coerces towards whichever side is passed as `left`
// when the result is true.
//
// Two independent rules can make the function return true:
//
//   - The charset of `left` is one of the Unicode charsets handled by the type
//     switch below and the coercibility of `left` is low enough compared to
//     the one of `right` (the exact condition depends on the charset).
//   - The repertoire of `right` is ASCII and either `left` has a strictly lower
//     coercibility, or both coercibilities are equal and the repertoire of
//     `left` is Unicode.
//
// In every other case the function returns false.
func checkCompatibleCollations(
	left Collation, leftCoercibility Coercibility, leftRepertoire Repertoire,
	right Collation, rightCoercibility Coercibility, rightRepertoire Repertoire,
) bool {
	leftCS := left.Charset()
	rightCS := right.Charset()

	// Rule 1: the left charset is a Unicode charset. A case that does not
	// return simply leaves the switch and continues with the ASCII rule below.
	switch leftCS.(type) {
	case charset.Charset_utf8mb4:
		// utf8mb4 wins whenever its coercibility is not higher than the other side's.
		if leftCoercibility <= rightCoercibility {
			return true
		}

	case charset.Charset_utf32:
		switch {
		case leftCoercibility < rightCoercibility:
			return true
		case leftCoercibility == rightCoercibility:
			// On a coercibility tie, utf32 wins against any non-Unicode charset...
			if !charset.IsUnicode(rightCS) {
				return true
			}
			// ...and also against another Unicode charset, unless the left
			// collation reports itself as binary.
			if !left.IsBinary() {
				return true
			}
		}

	case charset.Charset_utf8mb3, charset.Charset_ucs2, charset.Charset_utf16, charset.Charset_utf16le:
		switch {
		case leftCoercibility < rightCoercibility:
			return true
		case leftCoercibility == rightCoercibility:
			// On a coercibility tie, these charsets only win against
			// non-Unicode charsets.
			if !charset.IsUnicode(rightCS) {
				return true
			}
		}
	}

	// Rule 2: the right-hand side only contains ASCII.
	if rightRepertoire == RepertoireASCII {
		switch {
		case leftCoercibility < rightCoercibility:
			return true
		case leftCoercibility == rightCoercibility:
			// On a tie, the left side only wins if its own repertoire is Unicode.
			if leftRepertoire == RepertoireUnicode {
				return true
			}
		}
	}

	return false
}
   165  
// CoercionOptions is used to configure how aggressive the algorithm can be
// when merging two different collations by transcoding them. Both options
// are only consulted by MergeCollations when the two collations belong to
// different charsets and neither of them is the binary collation.
type CoercionOptions struct {
	// ConvertToSuperset allows merging two different collations as long
	// as the charset of one of them is a strict superset of the other. In
	// order to operate on the two expressions, one of them will need to
	// be transcoded. This transcoding will always be safe because the string
	// with the smallest repertoire will be transcoded to its superset, which
	// cannot fail.
	ConvertToSuperset bool

	// ConvertWithCoercion allows merging two different collations by forcing
	// a coercion as long as the coercibility of the two sides is lax enough:
	// the side with the strictly lower coercibility wins, provided that the
	// coercibility of the other side is higher than CoerceImplicit.
	// This will force a transcoding of one of the expressions even if their
	// respective charsets are not a strict superset, so the resulting transcoding
	// CAN fail depending on the content of their strings.
	ConvertWithCoercion bool
}
   184  
// MergeCollations resolves the collation to use for an operation between a pair
// of TypedCollation, based on their coercibility.
//
// The function takes the typed collations for the two sides of a text operation
// (namely, a comparison or concatenation of two textual expressions). These typed
// collations include the actual collation for the expression on each side, their
// coercibility values (see: Coercibility) and their respective repertoires.
//
// It returns the target collation (i.e. the collation into which the two
// expressions must be coerced) and two Coercion functions: the first one applies
// to the left-hand value and the second one to the right-hand value. At most one
// of them is non-nil; a nil Coercion means that the values of that side do not
// need to be transcoded. A non-nil Coercion can be called repeatedly with the
// different values for its expression and will transcode them to the appropriate
// charset so they can be collated against the other value.
//
// If the collations for both sides of the expressions are the same, both returned
// Coercion functions will be nil. Likewise, if the two collations are not the same,
// but they are compatible and have the same charset, both Coercion functions will
// also be nil.
//
// If either collation ID cannot be resolved, or if the collations for both sides
// of the expression are not compatible, an error will be returned, the returned
// TypedCollation will be the zero value and both Coercion functions will be nil.
func (env *Environment) MergeCollations(left, right TypedCollation, opt CoercionOptions) (TypedCollation, Coercion, Coercion, error) {
	leftColl := env.LookupByID(left.Collation)
	rightColl := env.LookupByID(right.Collation)
	if leftColl == nil || rightColl == nil {
		return TypedCollation{}, nil, nil, fmt.Errorf("unsupported TypeCollationID: %v / %v", left.Collation, right.Collation)
	}

	leftCS := leftColl.Charset()
	rightCS := rightColl.Charset()

	// Two explicit collations can only be merged if they are the very same collation.
	if left.Coercibility == CoerceExplicit && right.Coercibility == CoerceExplicit {
		if left.Collation != right.Collation {
			goto cannotCoerce
		}
	}

	// Same charset on both sides: no transcoding is ever needed, we only have
	// to pick which collation wins.
	if leftCS.Name() == rightCS.Name() {
		switch {
		case left.Coercibility < right.Coercibility:
			// The lowest coercibility wins; the repertoires are OR-ed so the
			// result is Unicode if either side is.
			left.Repertoire |= right.Repertoire
			return left, nil, nil, nil

		case left.Coercibility > right.Coercibility:
			right.Repertoire |= left.Repertoire
			return right, nil, nil, nil

		case left.Collation == right.Collation:
			left.Repertoire |= right.Repertoire
			return left, nil, nil, nil
		}

		// From here on: same charset, same coercibility, different collations.
		if left.Coercibility == CoerceExplicit {
			goto cannotCoerce
		}

		leftCsBin := leftColl.IsBinary()
		rightCsBin := rightColl.IsBinary()

		// If at least one side reports a binary collation, that side is
		// returned (the left one when both do).
		switch {
		case leftCsBin && rightCsBin:
			left.Coercibility = CoerceNone
			return left, nil, nil, nil

		case leftCsBin:
			return left, nil, nil, nil

		case rightCsBin:
			return right, nil, nil, nil
		}

		// Neither collation is binary: fall back to the binary collation
		// registered for the shared charset, with a coercibility of CoerceNone.
		defaults := env.byCharset[leftCS.Name()]
		return TypedCollation{
			Collation:    defaults.Binary.ID(),
			Coercibility: CoerceNone,
			Repertoire:   left.Repertoire | right.Repertoire,
		}, nil, nil, nil
	}

	// Different charsets where one side is the `binary` collation itself: the
	// binary side wins without transcoding unless the other side has a strictly
	// lower coercibility, in which case we coerce towards that other side.
	if _, leftIsBinary := leftColl.(*Collation_binary); leftIsBinary {
		if left.Coercibility <= right.Coercibility {
			return left, nil, nil, nil
		}
		goto coerceToRight
	}
	if _, rightIsBinary := rightColl.(*Collation_binary); rightIsBinary {
		if left.Coercibility >= right.Coercibility {
			return right, nil, nil, nil
		}
		goto coerceToLeft
	}

	// Try each side in turn as the conversion target; the left side is
	// checked first.
	if opt.ConvertToSuperset {
		if checkCompatibleCollations(leftColl, left.Coercibility, left.Repertoire, rightColl, right.Coercibility, right.Repertoire) {
			goto coerceToLeft
		}
		if checkCompatibleCollations(rightColl, right.Coercibility, right.Repertoire, leftColl, left.Coercibility, left.Repertoire) {
			goto coerceToRight
		}
	}

	// Forced coercion: the side with the strictly lower coercibility wins, but
	// only if the losing side has a coercibility above CoerceImplicit.
	if opt.ConvertWithCoercion {
		if left.Coercibility < right.Coercibility && right.Coercibility > CoerceImplicit {
			goto coerceToLeft
		}
		if right.Coercibility < left.Coercibility && left.Coercibility > CoerceImplicit {
			goto coerceToRight
		}
	}

cannotCoerce:
	return TypedCollation{}, nil, nil, fmt.Errorf("Illegal mix of collations (%s,%s) and (%s,%s)",
		leftColl.Name(), left.Coercibility, rightColl.Name(), right.Coercibility)

coerceToLeft:
	// The left collation is the target: only the right-hand Coercion is set.
	// NOTE(review): presumably charset.Convert transcodes `in` from rightCS
	// into leftCS; confirm against its signature.
	return left, nil,
		func(dst, in []byte) ([]byte, error) {
			return charset.Convert(dst, leftCS, in, rightCS)
		}, nil

coerceToRight:
	// The right collation is the target: only the left-hand Coercion is set.
	return right,
		func(dst, in []byte) ([]byte, error) {
			return charset.Convert(dst, rightCS, in, leftCS)
		}, nil, nil
}
   310  
   311  func (env *Environment) EnsureCollate(fromID, toID ID) error {
   312  	// these two lookups should never fail
   313  	from := env.LookupByID(fromID)
   314  	to := env.LookupByID(toID)
   315  	if from.Charset().Name() != to.Charset().Name() {
   316  		return fmt.Errorf("COLLATION '%s' is not valid for CHARACTER SET '%s'", to.Name(), from.Charset().Name())
   317  	}
   318  	return nil
   319  }