github.com/dolthub/go-mysql-server@v0.18.0/sql/encodings/rangemap.go (about) 1 // Copyright 2022 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package encodings 16 17 import "unicode/utf8" 18 19 // RangeMap is an implementation of Encoder. Almost all files that make use of RangeMap have been generated by the 20 // collation-extractor project: https://github.com/dolthub/collation-extractor 21 type RangeMap struct { 22 inputEntries [][]rangeMapEntry 23 outputEntries [][]rangeMapEntry 24 toUpper map[rune]rune 25 toLower map[rune]rune 26 } 27 28 var _ Encoder = (*RangeMap)(nil) 29 30 // rangeMapEntry is an entry within a RangeMap, which represents a range of valid inputs along with the possible 31 // outputs, along with the multiplier for each byte position. 32 type rangeMapEntry struct { 33 inputRange rangeBounds 34 outputRange rangeBounds 35 inputMults []int 36 outputMults []int 37 } 38 39 // rangeBounds represents the minimum and maximum values for each section of this specific range. The byte at index 0 40 // represents the minimum, while the byte at index 1 represents the maximum. 41 type rangeBounds [][2]byte 42 43 // Decode implements the Encoder interface. 44 func (rm *RangeMap) Decode(str []byte) ([]byte, bool) { 45 // There's no way of knowing how large the resulting string will be, but we can at least set it to the same size to 46 // minimize allocations. 47 decodedStr := make([]byte, 0, len(str)) 48 for len(str) > 0 { 49 var decodedRune []byte 50 decodedRuneLen := 1 51 // The most common strings for most expected applications will find their result in the first loop, so the 52 // performance here shouldn't be as bad as it may seem. 53 for ; decodedRuneLen <= len(rm.inputEntries); decodedRuneLen++ { 54 if decodedRuneLen > len(str) { 55 return nil, false 56 } 57 var ok bool 58 decodedRune, ok = rm.DecodeRune(str[:decodedRuneLen]) 59 if ok { 60 break 61 } 62 } 63 if decodedRuneLen > len(rm.inputEntries) { 64 return nil, false 65 } 66 decodedStr = append(decodedStr, decodedRune...) 67 str = str[decodedRuneLen:] 68 } 69 return decodedStr, true 70 } 71 72 // Encode implements the Encoder interface. 73 func (rm *RangeMap) Encode(str []byte) ([]byte, bool) { 74 // There's no way of knowing how large the resulting string will be, but we can at least set it to the same size to 75 // minimize allocations. 76 encodedStr := make([]byte, 0, len(str)) 77 for len(str) > 0 { 78 var encodedRune []byte 79 encodedRuneLen := 1 80 // The most common strings for most expected applications will find their result in the first loop, so the 81 // performance here shouldn't be as bad as it may seem. 82 for ; encodedRuneLen <= len(rm.inputEntries); encodedRuneLen++ { 83 var ok bool 84 encodedRune, ok = rm.EncodeRune(str[:encodedRuneLen]) 85 if ok { 86 break 87 } 88 } 89 if encodedRuneLen > len(rm.inputEntries) { 90 return nil, false 91 } 92 encodedStr = append(encodedStr, encodedRune...) 93 str = str[encodedRuneLen:] 94 } 95 return encodedStr, true 96 } 97 98 // EncodeReplaceUnknown implements the Encoder interface. 99 func (rm *RangeMap) EncodeReplaceUnknown(str []byte) []byte { 100 // There's no way of knowing how large the resulting string will be, but we can at least set it to the same size to 101 // minimize allocations. 102 encodedStr := make([]byte, 0, len(str)) 103 for len(str) > 0 { 104 var encodedRune []byte 105 encodedRuneLen := 1 106 // The most common strings for most expected applications will find their result in the first loop, so the 107 // performance here shouldn't be as bad as it may seem. 108 for ; encodedRuneLen <= len(rm.inputEntries) && encodedRuneLen <= len(str); encodedRuneLen++ { 109 var ok bool 110 encodedRune, ok = rm.EncodeRune(str[:encodedRuneLen]) 111 if ok { 112 break 113 } 114 } 115 if encodedRuneLen > len(rm.inputEntries) { 116 // The rune is not valid in this character set, so we'll attempt to see if the rune is valid utf8. 117 // If it is, then we want to replace the entire rune with a question mark. If it's not, then we'll 118 // just replace the next byte. 119 _, encodedRuneLen = utf8.DecodeRune(str) 120 if encodedRuneLen == 0 { 121 encodedRuneLen = 1 122 } 123 encodedRune = []byte{'?'} 124 } 125 // Since we do not terminate on invalid sequences, we may end up in a scenario where our count is misaligned, so 126 // we need to catch such instances. 127 if encodedRuneLen >= len(str) { 128 encodedRuneLen = len(str) 129 } 130 if len(encodedRune) == 0 { 131 encodedRune = []byte{'?'} 132 } 133 encodedStr = append(encodedStr, encodedRune...) 134 str = str[encodedRuneLen:] 135 } 136 return encodedStr 137 } 138 139 // DecodeRune implements the Encoder interface. 140 func (rm *RangeMap) DecodeRune(r []byte) ([]byte, bool) { 141 if len(r) > len(rm.inputEntries) { 142 return nil, false 143 } 144 for _, entry := range rm.inputEntries[len(r)-1] { 145 if entry.inputRange.contains(r) { 146 outputData := make([]byte, len(entry.outputRange)) 147 increase := 0 148 for i := len(entry.inputRange) - 1; i >= 0; i-- { 149 increase += int(r[i]-entry.inputRange[i][0]) * entry.inputMults[i] 150 } 151 for i := 0; i < len(outputData); i++ { 152 diff := increase / entry.outputMults[i] 153 outputData[i] = entry.outputRange[i][0] + byte(diff) 154 increase -= diff * entry.outputMults[i] 155 } 156 return outputData, true 157 } 158 } 159 return nil, false 160 } 161 162 // EncodeRune implements the Encoder interface. 163 func (rm *RangeMap) EncodeRune(r []byte) ([]byte, bool) { 164 if len(r) > len(rm.outputEntries) { 165 return nil, false 166 } 167 for _, entry := range rm.outputEntries[len(r)-1] { 168 if entry.outputRange.contains(r) { 169 inputData := make([]byte, len(entry.inputRange)) 170 increase := 0 171 for i := len(entry.outputRange) - 1; i >= 0; i-- { 172 increase += int(r[i]-entry.outputRange[i][0]) * entry.outputMults[i] 173 } 174 for i := 0; i < len(inputData); i++ { 175 diff := increase / entry.inputMults[i] 176 inputData[i] = entry.inputRange[i][0] + byte(diff) 177 increase -= diff * entry.inputMults[i] 178 } 179 return inputData, true 180 } 181 } 182 return nil, false 183 } 184 185 // Uppercase implements the Encoder interface. 186 func (rm *RangeMap) Uppercase(str string) string { 187 newStr := make([]byte, 0, len(str)) 188 // Range loops over strings automatically read the string as a series of runes, similar to utf8.DecodeRuneInString(). 189 // See: https://go.dev/doc/effective_go#for & https://pkg.go.dev/unicode/utf8#DecodeRuneInString 190 for _, r := range str { 191 // Wrapping a rune in a string will convert it to a sequence of bytes, which are then appended to the byte slice 192 newStr = append(newStr, string(rm.UppercaseRune(r))...) 193 } 194 return BytesToString(newStr) 195 } 196 197 // Lowercase implements the Encoder interface. 198 func (rm *RangeMap) Lowercase(str string) string { 199 newStr := make([]byte, 0, len(str)) 200 // Range loops over strings automatically read the string as a series of runes, similar to utf8.DecodeRuneInString(). 201 // See: https://go.dev/doc/effective_go#for & https://pkg.go.dev/unicode/utf8#DecodeRuneInString 202 for _, r := range str { 203 // Wrapping a rune in a string will convert it to a sequence of bytes, which are then appended to the byte slice 204 newStr = append(newStr, string(rm.LowercaseRune(r))...) 205 } 206 return BytesToString(newStr) 207 } 208 209 // UppercaseRune implements the Encoder interface. 210 func (rm *RangeMap) UppercaseRune(r rune) rune { 211 if uRune, ok := rm.toUpper[r]; ok { 212 return uRune 213 } 214 return r 215 } 216 217 // LowercaseRune implements the Encoder interface. 218 func (rm *RangeMap) LowercaseRune(r rune) rune { 219 if lRune, ok := rm.toLower[r]; ok { 220 return lRune 221 } 222 return r 223 } 224 225 // NextRune implements the Encoder interface. 226 func (rm *RangeMap) NextRune(str string) (rune, int) { 227 return utf8.DecodeRuneInString(str) 228 } 229 230 // IsReturnSafe implements the Encoder interface. All returns from RangeMap are safe to edit as they create a new byte 231 // slice. 232 func (rm *RangeMap) IsReturnSafe() bool { 233 return true 234 } 235 236 // contains returns whether the data falls within the range bounds. Assumes that the length of the data matches the 237 // length of the range bounds. 238 func (r rangeBounds) contains(data []byte) bool { 239 for i := 0; i < len(r); i++ { 240 if r[i][0] > data[i] || r[i][1] < data[i] { 241 return false 242 } 243 } 244 return true 245 }