vitess.io/vitess@v0.16.2/go/mysql/collations/internal/uca/unicode.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package uca 18 19 // UnicodeDecomposeHangulSyllable breaks down a Korean Hangul rune into its 2 or 3 composited 20 // codepoints. 21 // This is a straight port of the algorithm in http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf 22 func UnicodeDecomposeHangulSyllable(syl rune) []rune { 23 const baseSyllabe = 0xAC00 24 const baseLeadingJamo = 0x1100 25 const baseVowelJamo = 0x1161 26 const baseTrailingJamo = 0x11A7 27 const countVowelJamo = 21 28 const countTrailingJamo = 28 29 const countJamoCombinations = countVowelJamo * countTrailingJamo 30 31 if syl < 0xAC00 || syl > 0xD7AF { 32 return nil 33 } 34 35 sylIndex := syl - baseSyllabe 36 indexLeadingJamo := sylIndex / countJamoCombinations 37 indexVowelJamo := (sylIndex % countJamoCombinations) / countTrailingJamo 38 39 result := []rune{baseLeadingJamo + indexLeadingJamo, baseVowelJamo + indexVowelJamo} 40 if indexTrailingJamo := sylIndex % countTrailingJamo; indexTrailingJamo != 0 { 41 result = append(result, baseTrailingJamo+indexTrailingJamo) 42 } 43 return result 44 } 45 46 // UnicodeImplicitWeights900 generates the implicit weights for this codepoint. 
// This is a straight port of the algorithm in https://www.unicode.org/reports/tr10/tr10-34.html#Implicit_Weights
// It only applies to the UCA Standard v9.0.0
//
// The result is written into weights[0..5] as the two collation elements
// [.AAAA.0020.0002.][.BBBB.0000.0000.]. weights must hold at least six
// elements; a shorter slice causes an index-out-of-range panic before any
// element has been modified.
func UnicodeImplicitWeights900(weights []uint16, codepoint rune) {
	// Check the length once, before the first store, so that a short slice is
	// never left half-written.
	_ = weights[5]

	// Every range except the first one derives BBBB from the low 15 bits.
	bbbb := uint16(codepoint&0x7FFF) | 0x8000

	var aaaa uint16
	switch {
	case codepoint >= 0x17000 && codepoint <= 0x18AFF:
		// Fixed page; BBBB is the offset from the start of the range.
		aaaa = 0xFB00
		bbbb = uint16(codepoint-0x17000) | 0x8000

	case (codepoint >= 0x4E00 && codepoint <= 0x9FD5) ||
		(codepoint >= 0xFA0E && codepoint <= 0xFA29):
		aaaa = 0xFB40 + uint16(codepoint>>15)

	case (codepoint >= 0x3400 && codepoint <= 0x4DB5) ||
		(codepoint >= 0x20000 && codepoint <= 0x2A6D6) ||
		(codepoint >= 0x2A700 && codepoint <= 0x2B734) ||
		(codepoint >= 0x2B740 && codepoint <= 0x2B81D) ||
		(codepoint >= 0x2B820 && codepoint <= 0x2CEA1):
		aaaa = 0xFB80 + uint16(codepoint>>15)

	default:
		aaaa = 0xFBC0 + uint16(codepoint>>15)
	}

	weights[0] = aaaa
	weights[1] = 0x0020
	weights[2] = 0x0002
	weights[3] = bbbb
	weights[4] = 0x0000
	weights[5] = 0x0000
}

// UnicodeImplicitWeightsLegacy generates the implicit weights for this codepoint.
// This is a straight port of the algorithm in https://www.unicode.org/reports/tr10/tr10-20.html#Implicit_Weights
// It only applies to the UCA Standard v4.0.0 and v5.2.0
//
// Only the two level 0 weights are produced: AAAA is stored in weights[0] and
// BBBB in weights[1]. weights must hold at least two elements; a shorter slice
// causes an index-out-of-range panic before any element has been modified.
func UnicodeImplicitWeightsLegacy(weights []uint16, codepoint rune) {
	/*
		To derive the collation elements, the value of the code point is used to calculate two numbers,
		by bit shifting and bit masking. The bit operations are chosen so that the resultant numbers have
		the desired ranges for constructing implicit weights. The first number is calculated by taking the
		code point expressed as a 32-bit binary integer CP and bit shifting it right by 15 bits.
		Because code points range from U+0000 to U+10FFFF, the result will be a number in the range
		0 to 0x21 (= 33 decimal).
		This number is then added to the special value BASE.

			AAAA = BASE + (CP >> 15);

		Now mask off the bottom 15 bits of CP. OR a 1 into bit 15, so that the resultant value is non-zero.

			BBBB = (CP & 0x7FFF) | 0x8000;

		AAAA and BBBB are interpreted as unsigned 16-bit integers. The implicit weight mapping given to
		the code point is then constructed as:

			[.AAAA.0020.0002.][.BBBB.0000.0000.]

		However, note that for the legacy iterator, to match MySQL's behavior, we're only
		iterating through the level 0 weights, so we only have to yield AAAA and BBBB.
	*/

	// Check the length once, before the first store, so that a short slice is
	// never left half-written.
	_ = weights[1]

	var base uint16
	switch {
	case codepoint >= 0x3400 && codepoint <= 0x4DB5:
		base = 0xFB80
	case codepoint >= 0x4E00 && codepoint <= 0x9FA5:
		base = 0xFB40
	default:
		base = 0xFBC0
	}

	weights[0] = base + uint16(codepoint>>15)
	weights[1] = uint16(codepoint&0x7FFF) | 0x8000
}

// unicodeIsKatakana reports whether cp is one of the codepoints this package
// treats as Katakana: U+30A1..U+30FA, U+30FC..U+30FE and U+FF66..U+FF9D.
func unicodeIsKatakana(cp rune) bool {
	if cp >= 0x30A1 && cp <= 0x30FA {
		return true
	}
	if cp >= 0x30FC && cp <= 0x30FE {
		return true
	}
	return cp >= 0xFF66 && cp <= 0xFF9D
}

// unicodeIsHiragana reports whether cp is one of the codepoints this package
// treats as Hiragana: U+3041..U+3096, U+309D and U+309E.
func unicodeIsHiragana(cp rune) bool {
	if cp >= 0x3041 && cp <= 0x3096 {
		return true
	}
	return cp == 0x309D || cp == 0x309E
}

// unicodeImplicitChineseWeights adjusts the ordering weights for implicit
// codepoints in Chinese collations. It is not clear what is the rationale
// behind these adjustments to the AAAA page for the weights, but these
// page offsets have been reverse engineered to pass the WEIGHT_STRING tests
// for all codepoints in the ZH range
//
// TODO: is this the right level to perform the adjustment?
150 func unicodeImplicitChineseWeights(weights []uint16, codepoint rune) { 151 UnicodeImplicitWeights900(weights, codepoint) 152 153 switch weights[0] { 154 case 0xFB00: 155 weights[0] -= 1247 156 case 0xFB40, 0xFB41: 157 weights[0] -= 15745 158 case 0xFB80: 159 weights[0] -= 15807 160 case 0xFB84, 0xFB85: 161 weights[0] -= 15810 162 default: 163 weights[0] -= 1438 164 } 165 }