vitess.io/vitess@v0.16.2/go/mysql/collations/internal/uca/iter_ja.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package uca
    18  
    19  import "unicode/utf8"
    20  
// jaIterator900 is a weight iterator that layers Japanese-specific handling
// on top of the embedded iterator900: primary weights are remapped by
// adjustJapaneseWeights, and Kana codepoints are tracked so that a dedicated
// Kana level (level 3) can be produced by Next.
type jaIterator900 struct {
	iterator900
	// queuedWeight is a weight that Next must yield before doing anything
	// else; 0x0 means that nothing is queued.
	queuedWeight  uint16
	// prevCodepoint is the codepoint decoded just before the current one. It
	// is passed as context to the contextual contraction lookup and is reset
	// to 0 once a contraction has matched.
	prevCodepoint rune
	// kanas maps every Kana codepoint seen so far to its Kana weight (0x2 for
	// Hiragana, 0x8 for Katakana). It is allocated lazily, so it stays nil
	// until the first Kana is cached.
	kanas         map[rune]byte
}
    27  
    28  func (it *jaIterator900) adjustJapaneseWeights(weight uint16) uint16 {
    29  	// based on the following weights dumped from MySQL:
    30  	// {0x1C47, 0x1FB5, 0x1C47, 0x1FB5}, // ?? this is a no-op
    31  	// {0x3D5A, 0x3D8B, 0x1FB6, 0x1FE7},
    32  	// {0x1FB6, 0x3D59, 0x0000, 0x0000},
    33  	// {0x3D8C, 0x54A3, 0x0000, 0x0000},
    34  	if it.level == 0 && weight >= 0x1FB6 && weight <= 0x54A3 {
    35  		switch {
    36  		// FIXME: this weight adjustment seems like a no-op, but it comes from the MySQL dump
    37  		// case weight >= 0x1C47 && weight <= 0x1FB5:
    38  		// 	return weight
    39  		case weight >= 0x3D5A && weight <= 0x3D8B:
    40  			return weight - 0x3D5A + 0x1FB6
    41  		case weight >= 0x1FB6 && weight <= 0x3D59 || weight >= 0x3D8C && weight <= 0x54A3:
    42  			it.queuedWeight = weight
    43  			return 0xFB86
    44  		}
    45  	}
    46  	return weight
    47  }
    48  
    49  func (it *jaIterator900) cacheKana(cp rune) {
    50  	if unicodeIsHiragana(cp) {
    51  		if it.kanas == nil {
    52  			it.kanas = make(map[rune]byte)
    53  		}
    54  		it.kanas[cp] = 0x2
    55  	} else if unicodeIsKatakana(cp) {
    56  		if it.kanas == nil {
    57  			it.kanas = make(map[rune]byte)
    58  		}
    59  		it.kanas[cp] = 0x8
    60  	}
    61  }
    62  
    63  func (it *jaIterator900) Done() {
    64  	it.queuedWeight = 0x0
    65  	it.prevCodepoint = 0
    66  	it.kanas = nil
    67  	it.original = nil
    68  	it.input = nil
    69  	it.iterpool.Put(it)
    70  }
    71  
// Next yields the next collation weight of the input.
//
// The boolean result is false once the iteration is over. A (0, true) result
// is returned each time a level has been fully consumed and another level
// remains; at that point the input has been rewound to the original bytes.
//
// On every call, sources are tried in this order:
//  1. a weight previously queued by adjustJapaneseWeights;
//  2. the next weight of the codepoint/contraction currently being iterated,
//     passed through adjustJapaneseWeights;
//  3. a freshly decoded codepoint from the remaining input.
func (it *jaIterator900) Next() (uint16, bool) {
	for {
		// a queued weight always takes precedence and is consumed exactly once
		if it.queuedWeight != 0x0 {
			var w uint16
			w, it.queuedWeight = it.queuedWeight, 0x0
			return w, true
		}
		// keep draining the weights of the current codepoint before decoding more input
		if w, ok := it.codepoint.next(); ok {
			return it.adjustJapaneseWeights(w), true
		}

		// decodeNext is jumped to directly from the level 3 case below in order
		// to skip a codepoint without going through the two checks above
	decodeNext:
		cp, width := utf8.DecodeRune(it.input)
		// utf8.DecodeRune reports RuneError with a width of 0 for empty input and
		// a width of 1 for an invalid encoding; a U+FFFD that is really present
		// in the input has a width of 3. Both short cases end the current level.
		if cp == utf8.RuneError && width < 3 {
			it.level++
			// if we're at level 3 (Kana-sensitive) and we haven't seen
			// any Kanas in the previous levels, there's nothing to yield
			if it.level == 3 && it.kanas == nil {
				return 0, false
			}
			// more levels to go: rewind the input and emit a 0 weight for the
			// level that has just ended
			if it.level < it.maxLevel {
				it.input = it.original
				return 0, true
			}
			return 0, false
		}

		it.input = it.input[width:]
		if weights := it.contract.FindContextual(cp, it.prevCodepoint); weights != nil {
			// if this is a Kana-sensitive iterator and we're at level 3 (the Kana level),
			// we cannot return the contraction's weight here, we need the actual weights in
			// our Kana cache.
			if it.level == 3 {
				if w, ok := it.kanas[it.prevCodepoint]; ok {
					it.prevCodepoint = 0
					return uint16(w), true
				}
			}
			it.codepoint.initContraction(weights, it.level)
			// the context has been consumed by this contraction, so it is cleared
			// before the next lookup
			it.prevCodepoint = 0
			// loop around; presumably the contraction's weights are then yielded by
			// it.codepoint.next() at the top of the loop
			continue
		}
		it.prevCodepoint = cp

		// if this is a Kana-sensitive iterator, we want to keep track of any
		// kanas we've seen in a cache, so that when we reach level 3, we can
		// quickly skip over codepoints that are not Kanas, as level 3 will
		// only yield Kana-weights
		if it.maxLevel == 4 {
			switch it.level {
			case 0:
				// first pass over the input: remember the Kanas that are not cached yet
				if _, ok := it.kanas[cp]; !ok {
					it.cacheKana(cp)
				}
			case 3:
				// Kana level: a cached Kana yields its Kana weight, anything else
				// is skipped by decoding the next codepoint right away
				if w, ok := it.kanas[cp]; ok {
					return uint16(w), true
				}
				goto decodeNext
			}
		}

		// regular codepoint: set up it.codepoint for cp and loop around
		it.codepoint.init(&it.iterator900, cp)
	}
}