github.com/dolthub/go-mysql-server@v0.18.0/sql/encodings/rangemap.go (about)

     1  // Copyright 2022 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package encodings
    16  
    17  import "unicode/utf8"
    18  
// RangeMap is an implementation of Encoder. Almost all files that make use of RangeMap have been generated by the
// collation-extractor project: https://github.com/dolthub/collation-extractor
//
// inputEntries is consulted by DecodeRune and outputEntries by EncodeRune. Both are indexed by the byte length of the
// sequence being converted minus one, so index 0 holds the entries for single-byte sequences. toUpper and toLower are
// the lookup tables used by UppercaseRune and LowercaseRune; runes that are absent from a table are returned unchanged.
type RangeMap struct {
	inputEntries  [][]rangeMapEntry
	outputEntries [][]rangeMapEntry
	toUpper       map[rune]rune
	toLower       map[rune]rune
}

// Compile-time assertion that *RangeMap satisfies the Encoder interface.
var _ Encoder = (*RangeMap)(nil)
    29  
// rangeMapEntry is an entry within a RangeMap, which represents a range of valid inputs along with the possible
// outputs, along with the multiplier for each byte position.
//
// A sequence that falls within inputRange is converted by DecodeRune as follows: the distance of every byte from the
// minimum of its position is weighted by the matching value in inputMults and summed into a single offset, and that
// offset is then split back into bytes by dividing by each value in outputMults in order and adding the result to the
// minimum of the matching position in outputRange. EncodeRune performs the same steps with the roles of the input and
// output fields swapped.
type rangeMapEntry struct {
	inputRange  rangeBounds
	outputRange rangeBounds
	inputMults  []int
	outputMults []int
}
    38  
// rangeBounds represents the minimum and maximum values for each section of this specific range. The byte at index 0
// represents the minimum, while the byte at index 1 represents the maximum. Both bounds are inclusive, and there is one
// pair of bounds per byte position of the sequence that the range describes.
type rangeBounds [][2]byte
    42  
    43  // Decode implements the Encoder interface.
    44  func (rm *RangeMap) Decode(str []byte) ([]byte, bool) {
    45  	// There's no way of knowing how large the resulting string will be, but we can at least set it to the same size to
    46  	// minimize allocations.
    47  	decodedStr := make([]byte, 0, len(str))
    48  	for len(str) > 0 {
    49  		var decodedRune []byte
    50  		decodedRuneLen := 1
    51  		// The most common strings for most expected applications will find their result in the first loop, so the
    52  		// performance here shouldn't be as bad as it may seem.
    53  		for ; decodedRuneLen <= len(rm.inputEntries); decodedRuneLen++ {
    54  			if decodedRuneLen > len(str) {
    55  				return nil, false
    56  			}
    57  			var ok bool
    58  			decodedRune, ok = rm.DecodeRune(str[:decodedRuneLen])
    59  			if ok {
    60  				break
    61  			}
    62  		}
    63  		if decodedRuneLen > len(rm.inputEntries) {
    64  			return nil, false
    65  		}
    66  		decodedStr = append(decodedStr, decodedRune...)
    67  		str = str[decodedRuneLen:]
    68  	}
    69  	return decodedStr, true
    70  }
    71  
    72  // Encode implements the Encoder interface.
    73  func (rm *RangeMap) Encode(str []byte) ([]byte, bool) {
    74  	// There's no way of knowing how large the resulting string will be, but we can at least set it to the same size to
    75  	// minimize allocations.
    76  	encodedStr := make([]byte, 0, len(str))
    77  	for len(str) > 0 {
    78  		var encodedRune []byte
    79  		encodedRuneLen := 1
    80  		// The most common strings for most expected applications will find their result in the first loop, so the
    81  		// performance here shouldn't be as bad as it may seem.
    82  		for ; encodedRuneLen <= len(rm.inputEntries); encodedRuneLen++ {
    83  			var ok bool
    84  			encodedRune, ok = rm.EncodeRune(str[:encodedRuneLen])
    85  			if ok {
    86  				break
    87  			}
    88  		}
    89  		if encodedRuneLen > len(rm.inputEntries) {
    90  			return nil, false
    91  		}
    92  		encodedStr = append(encodedStr, encodedRune...)
    93  		str = str[encodedRuneLen:]
    94  	}
    95  	return encodedStr, true
    96  }
    97  
    98  // EncodeReplaceUnknown implements the Encoder interface.
    99  func (rm *RangeMap) EncodeReplaceUnknown(str []byte) []byte {
   100  	// There's no way of knowing how large the resulting string will be, but we can at least set it to the same size to
   101  	// minimize allocations.
   102  	encodedStr := make([]byte, 0, len(str))
   103  	for len(str) > 0 {
   104  		var encodedRune []byte
   105  		encodedRuneLen := 1
   106  		// The most common strings for most expected applications will find their result in the first loop, so the
   107  		// performance here shouldn't be as bad as it may seem.
   108  		for ; encodedRuneLen <= len(rm.inputEntries) && encodedRuneLen <= len(str); encodedRuneLen++ {
   109  			var ok bool
   110  			encodedRune, ok = rm.EncodeRune(str[:encodedRuneLen])
   111  			if ok {
   112  				break
   113  			}
   114  		}
   115  		if encodedRuneLen > len(rm.inputEntries) {
   116  			// The rune is not valid in this character set, so we'll attempt to see if the rune is valid utf8.
   117  			// If it is, then we want to replace the entire rune with a question mark. If it's not, then we'll
   118  			// just replace the next byte.
   119  			_, encodedRuneLen = utf8.DecodeRune(str)
   120  			if encodedRuneLen == 0 {
   121  				encodedRuneLen = 1
   122  			}
   123  			encodedRune = []byte{'?'}
   124  		}
   125  		// Since we do not terminate on invalid sequences, we may end up in a scenario where our count is misaligned, so
   126  		// we need to catch such instances.
   127  		if encodedRuneLen >= len(str) {
   128  			encodedRuneLen = len(str)
   129  		}
   130  		if len(encodedRune) == 0 {
   131  			encodedRune = []byte{'?'}
   132  		}
   133  		encodedStr = append(encodedStr, encodedRune...)
   134  		str = str[encodedRuneLen:]
   135  	}
   136  	return encodedStr
   137  }
   138  
   139  // DecodeRune implements the Encoder interface.
   140  func (rm *RangeMap) DecodeRune(r []byte) ([]byte, bool) {
   141  	if len(r) > len(rm.inputEntries) {
   142  		return nil, false
   143  	}
   144  	for _, entry := range rm.inputEntries[len(r)-1] {
   145  		if entry.inputRange.contains(r) {
   146  			outputData := make([]byte, len(entry.outputRange))
   147  			increase := 0
   148  			for i := len(entry.inputRange) - 1; i >= 0; i-- {
   149  				increase += int(r[i]-entry.inputRange[i][0]) * entry.inputMults[i]
   150  			}
   151  			for i := 0; i < len(outputData); i++ {
   152  				diff := increase / entry.outputMults[i]
   153  				outputData[i] = entry.outputRange[i][0] + byte(diff)
   154  				increase -= diff * entry.outputMults[i]
   155  			}
   156  			return outputData, true
   157  		}
   158  	}
   159  	return nil, false
   160  }
   161  
   162  // EncodeRune implements the Encoder interface.
   163  func (rm *RangeMap) EncodeRune(r []byte) ([]byte, bool) {
   164  	if len(r) > len(rm.outputEntries) {
   165  		return nil, false
   166  	}
   167  	for _, entry := range rm.outputEntries[len(r)-1] {
   168  		if entry.outputRange.contains(r) {
   169  			inputData := make([]byte, len(entry.inputRange))
   170  			increase := 0
   171  			for i := len(entry.outputRange) - 1; i >= 0; i-- {
   172  				increase += int(r[i]-entry.outputRange[i][0]) * entry.outputMults[i]
   173  			}
   174  			for i := 0; i < len(inputData); i++ {
   175  				diff := increase / entry.inputMults[i]
   176  				inputData[i] = entry.inputRange[i][0] + byte(diff)
   177  				increase -= diff * entry.inputMults[i]
   178  			}
   179  			return inputData, true
   180  		}
   181  	}
   182  	return nil, false
   183  }
   184  
   185  // Uppercase implements the Encoder interface.
   186  func (rm *RangeMap) Uppercase(str string) string {
   187  	newStr := make([]byte, 0, len(str))
   188  	// Range loops over strings automatically read the string as a series of runes, similar to utf8.DecodeRuneInString().
   189  	// See: https://go.dev/doc/effective_go#for & https://pkg.go.dev/unicode/utf8#DecodeRuneInString
   190  	for _, r := range str {
   191  		// Wrapping a rune in a string will convert it to a sequence of bytes, which are then appended to the byte slice
   192  		newStr = append(newStr, string(rm.UppercaseRune(r))...)
   193  	}
   194  	return BytesToString(newStr)
   195  }
   196  
   197  // Lowercase implements the Encoder interface.
   198  func (rm *RangeMap) Lowercase(str string) string {
   199  	newStr := make([]byte, 0, len(str))
   200  	// Range loops over strings automatically read the string as a series of runes, similar to utf8.DecodeRuneInString().
   201  	// See: https://go.dev/doc/effective_go#for & https://pkg.go.dev/unicode/utf8#DecodeRuneInString
   202  	for _, r := range str {
   203  		// Wrapping a rune in a string will convert it to a sequence of bytes, which are then appended to the byte slice
   204  		newStr = append(newStr, string(rm.LowercaseRune(r))...)
   205  	}
   206  	return BytesToString(newStr)
   207  }
   208  
   209  // UppercaseRune implements the Encoder interface.
   210  func (rm *RangeMap) UppercaseRune(r rune) rune {
   211  	if uRune, ok := rm.toUpper[r]; ok {
   212  		return uRune
   213  	}
   214  	return r
   215  }
   216  
   217  // LowercaseRune implements the Encoder interface.
   218  func (rm *RangeMap) LowercaseRune(r rune) rune {
   219  	if lRune, ok := rm.toLower[r]; ok {
   220  		return lRune
   221  	}
   222  	return r
   223  }
   224  
   225  // NextRune implements the Encoder interface.
   226  func (rm *RangeMap) NextRune(str string) (rune, int) {
   227  	return utf8.DecodeRuneInString(str)
   228  }
   229  
   230  // IsReturnSafe implements the Encoder interface. All returns from RangeMap are safe to edit as they create a new byte
   231  // slice.
   232  func (rm *RangeMap) IsReturnSafe() bool {
   233  	return true
   234  }
   235  
   236  // contains returns whether the data falls within the range bounds. Assumes that the length of the data matches the
   237  // length of the range bounds.
   238  func (r rangeBounds) contains(data []byte) bool {
   239  	for i := 0; i < len(r); i++ {
   240  		if r[i][0] > data[i] || r[i][1] < data[i] {
   241  			return false
   242  		}
   243  	}
   244  	return true
   245  }