vitess.io/vitess@v0.16.2/go/mysql/collations/internal/uca/contractions.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package uca
    18  
    19  import (
    20  	"fmt"
    21  	"unicode/utf8"
    22  
    23  	"vitess.io/vitess/go/mysql/collations/internal/charset"
    24  )
    25  
// trie is one node of a prefix tree keyed by codepoint. A path of codepoints
// from the root to a node identifies a sequence registered through insert.
type trie struct {
	// children maps the next codepoint to the subtree below it. It stays nil
	// until the first child is inserted; lookups on the nil map are safe.
	children map[rune]*trie
	// weights holds the weights registered for the path that ends at this
	// node, or nil when nothing was inserted for exactly that path.
	weights []uint16
}
    30  
    31  func (t *trie) walkUTF8(remainder []byte) ([]uint16, []byte) {
    32  	if len(remainder) > 0 {
    33  		cp, width := utf8.DecodeRune(remainder)
    34  		if cp == utf8.RuneError && width < 3 {
    35  			return nil, nil
    36  		}
    37  		if ch := t.children[cp]; ch != nil {
    38  			return ch.walkUTF8(remainder[width:])
    39  		}
    40  	}
    41  	return t.weights, remainder
    42  }
    43  
    44  func (t *trie) walkCharset(cs charset.Charset, remainder []byte, depth int) ([]uint16, []byte, int) {
    45  	if len(remainder) > 0 {
    46  		cp, width := cs.DecodeRune(remainder)
    47  		if cp == charset.RuneError && width < 3 {
    48  			return nil, nil, 0
    49  		}
    50  		if ch := t.children[cp]; ch != nil {
    51  			return ch.walkCharset(cs, remainder[width:], depth+1)
    52  		}
    53  	}
    54  	return t.weights, remainder, depth + 1
    55  }
    56  
    57  func (t *trie) insert(path []rune, weights []uint16) {
    58  	if len(path) == 0 {
    59  		if t.weights != nil {
    60  			panic("duplicate contraction")
    61  		}
    62  		t.weights = weights
    63  		return
    64  	}
    65  
    66  	if t.children == nil {
    67  		t.children = make(map[rune]*trie)
    68  	}
    69  	ch := t.children[path[0]]
    70  	if ch == nil {
    71  		ch = &trie{}
    72  		t.children[path[0]] = ch
    73  	}
    74  	ch.insert(path[1:], weights)
    75  }
    76  
// trieContractor answers contraction lookups from a trie of codepoint paths.
type trieContractor struct {
	// tr is the root of the trie; its children are keyed by the first
	// codepoint of every inserted contraction path.
	tr trie
}
    80  
    81  func (ctr *trieContractor) insert(c *Contraction) {
    82  	if len(c.Path) < 2 {
    83  		panic("contraction is too short")
    84  	}
    85  	if len(c.Weights)%3 != 0 {
    86  		panic(fmt.Sprintf("weights are not well-formed: %#v has len=%d", c.Weights, len(c.Weights)))
    87  	}
    88  	if c.Contextual && len(c.Path) != 2 {
    89  		panic("contextual contractions can only span 2 codepoints")
    90  	}
    91  	ctr.tr.insert(c.Path, c.Weights)
    92  }
    93  
    94  func (ctr *trieContractor) Find(cs charset.Charset, cp rune, remainder []byte) ([]uint16, []byte, int) {
    95  	if tr := ctr.tr.children[cp]; tr != nil {
    96  		return tr.walkCharset(cs, remainder, 0)
    97  	}
    98  	return nil, nil, 0
    99  }
   100  
   101  func (ctr *trieContractor) FindContextual(cp, prev rune) []uint16 {
   102  	if tr := ctr.tr.children[cp]; tr != nil {
   103  		if trc := tr.children[prev]; trc != nil {
   104  			return trc.weights
   105  		}
   106  	}
   107  	return nil
   108  }
   109  
   110  func NewTrieContractor(all []Contraction) Contractor {
   111  	if len(all) == 0 {
   112  		return nil
   113  	}
   114  	ctr := &trieContractor{}
   115  	for _, c := range all {
   116  		ctr.insert(&c)
   117  	}
   118  	return ctr
   119  }
   120  
// Contraction associates a sequence of codepoints with the weights to use for
// that sequence as a whole.
type Contraction struct {
	// Path is the codepoint sequence. The trie contractor rejects paths
	// shorter than two codepoints.
	Path []rune
	// Weights are the weights for Path. The trie contractor requires the
	// length to be a multiple of three.
	Weights []uint16
	// Contextual marks a contraction that the trie contractor only accepts
	// when Path has exactly two codepoints.
	// NOTE(review): FindContextual looks up the current codepoint first and
	// the previous one second, so Path is presumably stored in that order —
	// confirm against the data that populates it.
	Contextual bool
}
   126  
// Contractor looks up the weights registered for contractions.
type Contractor interface {
	// Find looks for a contraction that starts with cp and continues with
	// the codepoints decoded from remainder using cs. It returns the weights
	// found, the input left unconsumed, and an integer count.
	// NOTE(review): in the trie implementation the count is one more than
	// the number of codepoints consumed from remainder, and all three
	// results are zero values when nothing starts with cp — confirm the
	// contract callers rely on.
	Find(cs charset.Charset, cp rune, remainder []byte) ([]uint16, []byte, int)
	// FindContextual returns the weights registered for the pair of
	// codepoints, or nil when there are none.
	// NOTE(review): the trie implementation names these parameters cp and
	// prev, i.e. cp1 is the current codepoint and cp0 the one before it.
	FindContextual(cp1, cp0 rune) []uint16
}