vitess.io/vitess@v0.16.2/go/vt/vtgate/vindexes/unicode.go (about)

     1  /*
     2  Copyright 2020 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package vindexes
    18  
    19  import (
    20  	"bytes"
    21  	"fmt"
    22  	"sync"
    23  	"unicode/utf8"
    24  
    25  	"vitess.io/vitess/go/sqltypes"
    26  
    27  	"golang.org/x/text/collate"
    28  	"golang.org/x/text/language"
    29  )
    30  
    31  // Shared functions for Unicode string normalization
    32  // for Vindexes.
    33  
    34  func unicodeHash(hashFunc func([]byte) []byte, key sqltypes.Value) ([]byte, error) {
    35  	collator := collatorPool.Get().(*pooledCollator)
    36  	defer collatorPool.Put(collator)
    37  
    38  	keyBytes, err := key.ToBytes()
    39  	if err != nil {
    40  		return nil, err
    41  	}
    42  	norm, err := normalize(collator.col, collator.buf, keyBytes)
    43  	if err != nil {
    44  		return nil, err
    45  	}
    46  	return hashFunc(norm), nil
    47  }
    48  
    49  func normalize(col *collate.Collator, buf *collate.Buffer, in []byte) ([]byte, error) {
    50  	// We cannot pass invalid UTF-8 to the collator.
    51  	if !utf8.Valid(in) {
    52  		return nil, fmt.Errorf("cannot normalize string containing invalid UTF-8: %q", string(in))
    53  	}
    54  
    55  	// Ref: http://dev.mysql.com/doc/refman/5.6/en/char.html.
    56  	// Trailing spaces are ignored by MySQL.
    57  	in = bytes.TrimRight(in, " ")
    58  
    59  	// We use the collation key which can be used to
    60  	// perform lexical comparisons.
    61  	return col.Key(buf, in), nil
    62  }
    63  
// pooledCollator pairs a Collator and a Buffer.
// These pairs are pooled to avoid reallocating for every request,
// which would otherwise be required because they can't be used concurrently.
//
// Note that you must ensure no active references into the buffer remain
// before you return this pair back to the pool.
// That is, either do your processing on the result first, or make a copy.
type pooledCollator struct {
	// col computes the collation keys.
	col *collate.Collator
	// buf is the scratch buffer handed to col when generating a key;
	// generated keys point into it.
	buf *collate.Buffer
}
    75  
    76  var collatorPool = sync.Pool{New: newPooledCollator}
    77  
    78  func newPooledCollator() any {
    79  	// Ref: http://www.unicode.org/reports/tr10/#Introduction.
    80  	// Unicode seems to define a universal (or default) order.
    81  	// But various locales have conflicting order,
    82  	// which they have the right to override.
    83  	// Unfortunately, the Go library requires you to specify a locale.
    84  	// So, I chose English assuming that it won't override
    85  	// the Unicode universal order. But I couldn't find an easy
    86  	// way to verify this.
    87  	// Also, the locale differences are not an issue for level 1,
    88  	// because the conservative comparison makes them all equal.
    89  	return &pooledCollator{
    90  		col: collate.New(language.English, collate.Loose),
    91  		buf: new(collate.Buffer),
    92  	}
    93  }