vitess.io/vitess@v0.16.2/go/mysql/collations/internal/uca/unicode.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package uca
    18  
    // UnicodeDecomposeHangulSyllable splits a precomposed Korean Hangul syllable into
    // the codepoints it is composed of: a leading jamo, a vowel jamo and, when the
    // syllable has one, a trailing jamo.
    // The arithmetic follows the decomposition algorithm described in
    // http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf
    //
    // Runes outside of 0xAC00-0xD7AF yield nil; any other rune yields a slice of
    // 2 or 3 codepoints.
    //
    // NOTE(review): the Unicode algorithm bounds its input by the syllable count
    // (11172, i.e. up to 0xD7A3) while this check accepts everything up to 0xD7AF.
    // Presumably this mirrors MySQL's behavior; confirm before tightening it.
    func UnicodeDecomposeHangulSyllable(syl rune) []rune {
    	const (
    		syllableFirst = 0xAC00
    		syllableLast  = 0xD7AF

    		leadingBase  = 0x1100
    		vowelBase    = 0x1161
    		trailingBase = 0x11A7

    		vowelCount    = 21
    		trailingCount = 28
    		// Number of consecutive syllables that share the same leading jamo.
    		perLeading = vowelCount * trailingCount
    	)

    	if syl < syllableFirst || syl > syllableLast {
    		return nil
    	}

    	offset := syl - syllableFirst
    	leading := leadingBase + offset/perLeading
    	vowel := vowelBase + (offset%perLeading)/trailingCount
    	trailing := offset % trailingCount

    	jamos := []rune{leading, vowel}
    	if trailing == 0 {
    		// A trailing index of zero means the syllable has no trailing jamo.
    		return jamos
    	}
    	return append(jamos, trailingBase+trailing)
    }
    45  
    // UnicodeImplicitWeights900 writes the implicit collation weights for codepoint
    // into the first 6 entries of weights, laid out as
    //
    //	[.AAAA.0020.0002.][.BBBB.0000.0000.]
    //
    // The computation follows https://www.unicode.org/reports/tr10/tr10-34.html#Implicit_Weights
    // and only applies to the UCA Standard v9.0.0.
    //
    // weights must hold at least 6 entries.
    func UnicodeImplicitWeights900(weights []uint16, codepoint rune) {
    	// Generic form: AAAA is a range-specific base plus the bits above the low
    	// 15, BBBB is the low 15 bits with the top bit forced on.
    	page := uint16(codepoint >> 15)
    	low := uint16(codepoint&0x7FFF) | 0x8000

    	switch {
    	case 0x17000 <= codepoint && codepoint <= 0x18AFF:
    		// This range uses a fixed page and an offset relative to its start
    		// instead of the generic shift/mask split.
    		page = 0xFB00
    		low = uint16(codepoint-0x17000) | 0x8000

    	case 0x4E00 <= codepoint && codepoint <= 0x9FD5,
    		0xFA0E <= codepoint && codepoint <= 0xFA29:
    		page += 0xFB40

    	case 0x3400 <= codepoint && codepoint <= 0x4DB5,
    		0x20000 <= codepoint && codepoint <= 0x2A6D6,
    		0x2A700 <= codepoint && codepoint <= 0x2B734,
    		0x2B740 <= codepoint && codepoint <= 0x2B81D,
    		0x2B820 <= codepoint && codepoint <= 0x2CEA1:
    		page += 0xFB80

    	default:
    		page += 0xFBC0
    	}

    	// First collation element: [.AAAA.0020.0002.]
    	weights[0] = page
    	weights[1] = 0x0020
    	weights[2] = 0x0002
    	// Second collation element: [.BBBB.0000.0000.]
    	weights[3] = low
    	weights[4] = 0x0000
    	weights[5] = 0x0000
    }
    82  
    // UnicodeImplicitWeightsLegacy writes the two level 0 implicit weights (AAAA and
    // BBBB) for codepoint into weights[0] and weights[1].
    // The computation follows https://www.unicode.org/reports/tr10/tr10-20.html#Implicit_Weights
    // and only applies to the UCA Standard v4.0.0 and v5.2.0.
    //
    // weights must hold at least 2 entries.
    func UnicodeImplicitWeightsLegacy(weights []uint16, codepoint rune) {
    	/*
    		The collation elements are derived from two numbers obtained from the code
    		point by bit shifting and bit masking, chosen so that both land in the
    		ranges needed to build implicit weights.

    		The first number takes the code point, expressed as a 32-bit binary
    		integer CP, shifted right by 15 bits. Because code points range from
    		U+0000 to U+10FFFF, the result is a number in the range 0 to 0x21
    		(= 33 decimal). It is then added to the special value BASE.

    			AAAA = BASE + (CP >> 15);

    		The second number keeps the bottom 15 bits of CP and ORs a 1 into bit 15,
    		so that the resulting value is never zero.

    			BBBB = (CP & 0x7FFF) | 0x8000;

    		AAAA and BBBB are interpreted as unsigned 16-bit integers, and the implicit
    		weight mapping for the code point is:

    			[.AAAA.0020.0002.][.BBBB.0000.0000.]

    		However, for the legacy iterator, to match MySQL's behavior, we only
    		iterate through the level 0 weights, so only AAAA and BBBB are yielded.
    	*/
    	var base uint16
    	if codepoint >= 0x3400 && codepoint <= 0x4DB5 {
    		base = 0xFB80
    	} else if codepoint >= 0x4E00 && codepoint <= 0x9FA5 {
    		base = 0xFB40
    	} else {
    		base = 0xFBC0
    	}

    	weights[0] = base + uint16(codepoint>>15)
    	weights[1] = uint16(codepoint&0x7FFF) | 0x8000
    }
   120  
   // unicodeIsKatakana reports whether cp lies in one of the codepoint ranges
   // this package treats as Katakana: 0x30A1-0x30FA, 0x30FC-0x30FE and
   // 0xFF66-0xFF9D.
   func unicodeIsKatakana(cp rune) bool {
   	if cp >= 0x30A1 && cp <= 0x30FA {
   		return true
   	}
   	if cp >= 0xFF66 && cp <= 0xFF9D {
   		return true
   	}
   	// 0x30FB is deliberately left out; only 0x30FC, 0x30FD and 0x30FE match.
   	return cp >= 0x30FC && cp <= 0x30FE
   }
   131  
   // unicodeIsHiragana reports whether cp lies in one of the codepoint ranges
   // this package treats as Hiragana: 0x3041-0x3096, or one of the two
   // codepoints 0x309D and 0x309E.
   func unicodeIsHiragana(cp rune) bool {
   	inMainRange := cp >= 0x3041 && cp <= 0x3096
   	return inMainRange || cp == 0x309D || cp == 0x309E
   }
   142  
   143  // unicodeImplicitChineseWeights adjusts the ordering weights for implicit
   144  // codepoints in Chinese collations. It is not clear what is the rationale
   145  // behind these adjustments to the AAAA page for the weights, but these
   146  // page offsets have been reverse engineered to pass the WEIGHT_STRING tests
   147  // for all codepoints in the ZH range
   148  //
   149  // TODO: is this the right level to perform the adjustment?
   150  func unicodeImplicitChineseWeights(weights []uint16, codepoint rune) {
   151  	UnicodeImplicitWeights900(weights, codepoint)
   152  
   153  	switch weights[0] {
   154  	case 0xFB00:
   155  		weights[0] -= 1247
   156  	case 0xFB40, 0xFB41:
   157  		weights[0] -= 15745
   158  	case 0xFB80:
   159  		weights[0] -= 15807
   160  	case 0xFB84, 0xFB85:
   161  		weights[0] -= 15810
   162  	default:
   163  		weights[0] -= 1438
   164  	}
   165  }