github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/soliton/collate/unicode_ci.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package defCauslate
    15  
    16  import (
    17  	"github.com/whtcorpsinc/milevadb/soliton/stringutil"
    18  )
    19  
    20  const (
    21  	// magic number indicate weight has 2 uint64, should get from `longRuneMap`
    22  	longRune uint64 = 0xFFFD
    23  	// first byte of a 2-byte encoding starts 110 and carries 5 bits of data
    24  	b2Mask = 0x1F // 0001 1111
    25  
    26  	// first byte of a 3-byte encoding starts 1110 and carries 4 bits of data
    27  	b3Mask = 0x0F // 0000 1111
    28  
    29  	// first byte of a 4-byte encoding starts 11110 and carries 3 bits of data
    30  	b4Mask = 0x07 // 0000 0111
    31  
    32  	// non-first bytes start 10 and carry 6 bits of data
    33  	mbMask = 0x3F // 0011 1111
    34  )
    35  
    36  // decode rune by hand
    37  func decodeRune(s string, si int) (r rune, newIndex int) {
    38  	switch b := s[si]; {
    39  	case b < 0x80:
    40  		r = rune(b)
    41  		newIndex = si + 1
    42  	case b < 0xE0:
    43  		r = rune(b&b2Mask)<<6 |
    44  			rune(s[1+si]&mbMask)
    45  		newIndex = si + 2
    46  	case b < 0xF0:
    47  		r = rune(b&b3Mask)<<12 |
    48  			rune(s[si+1]&mbMask)<<6 |
    49  			rune(s[si+2]&mbMask)
    50  		newIndex = si + 3
    51  	default:
    52  		r = rune(b&b4Mask)<<18 |
    53  			rune(s[si+1]&mbMask)<<12 |
    54  			rune(s[si+2]&mbMask)<<6 |
    55  			rune(s[si+3]&mbMask)
    56  		newIndex = si + 4
    57  	}
    58  	return
    59  }
    60  
    61  // unicodeCIDefCauslator implements UCA. see http://unicode.org/reports/tr10/
    62  type unicodeCIDefCauslator struct {
    63  }
    64  
    65  // Compare implements DefCauslator interface.
    66  func (uc *unicodeCIDefCauslator) Compare(a, b string) int {
    67  	a = truncateTailingSpace(a)
    68  	b = truncateTailingSpace(b)
    69  	// weight of a, b. weight in unicode_ci may has 8 uint16s. xn indicate first 4 u16s, xs indicate last 4 u16s
    70  	an, bn := uint64(0), uint64(0)
    71  	as, bs := uint64(0), uint64(0)
    72  	// rune of a, b
    73  	ar, br := rune(0), rune(0)
    74  	// decode index of a, b
    75  	ai, bi := 0, 0
    76  	for {
    77  		if an == 0 {
    78  			if as == 0 {
    79  				for an == 0 && ai < len(a) {
    80  					ar, ai = decodeRune(a, ai)
    81  					an, as = convertUnicode(ar)
    82  				}
    83  			} else {
    84  				an = as
    85  				as = 0
    86  			}
    87  		}
    88  
    89  		if bn == 0 {
    90  			if bs == 0 {
    91  				for bn == 0 && bi < len(b) {
    92  					br, bi = decodeRune(b, bi)
    93  					bn, bs = convertUnicode(br)
    94  				}
    95  			} else {
    96  				bn = bs
    97  				bs = 0
    98  			}
    99  		}
   100  
   101  		if an == 0 || bn == 0 {
   102  			return sign(int(an) - int(bn))
   103  		}
   104  
   105  		if an == bn {
   106  			an, bn = 0, 0
   107  			continue
   108  		}
   109  
   110  		for an != 0 && bn != 0 {
   111  			if (an^bn)&0xFFFF == 0 {
   112  				an >>= 16
   113  				bn >>= 16
   114  			} else {
   115  				return sign(int(an&0xFFFF) - int(bn&0xFFFF))
   116  			}
   117  		}
   118  	}
   119  }
   120  
   121  // Key implements DefCauslator interface.
   122  func (uc *unicodeCIDefCauslator) Key(str string) []byte {
   123  	str = truncateTailingSpace(str)
   124  	buf := make([]byte, 0, len(str)*2)
   125  	r := rune(0)
   126  	si := 0                        // decode index of s
   127  	sn, ss := uint64(0), uint64(0) // weight of str. weight in unicode_ci may has 8 uint16s. sn indicate first 4 u16s, ss indicate last 4 u16s
   128  
   129  	for si < len(str) {
   130  		r, si = decodeRune(str, si)
   131  		sn, ss = convertUnicode(r)
   132  		for sn != 0 {
   133  			buf = append(buf, byte((sn&0xFF00)>>8), byte(sn))
   134  			sn >>= 16
   135  		}
   136  		for ss != 0 {
   137  			buf = append(buf, byte((ss&0xFF00)>>8), byte(ss))
   138  			ss >>= 16
   139  		}
   140  	}
   141  	return buf
   142  }
   143  
   144  // convert rune to weights.
   145  // `first` represent first 4 uint16 weights of rune
   146  // `second` represent last 4 uint16 weights of rune if exist, 0 if not
   147  func convertUnicode(r rune) (first, second uint64) {
   148  	if r > 0xFFFF {
   149  		return 0xFFFD, 0
   150  	}
   151  	if mapBlock[r] == longRune {
   152  		return longRuneMap[r][0], longRuneMap[r][1]
   153  	}
   154  	return mapBlock[r], 0
   155  }
   156  
   157  // Pattern implements DefCauslator interface.
   158  func (uc *unicodeCIDefCauslator) Pattern() WildcardPattern {
   159  	return &unicodePattern{}
   160  }
   161  
   162  type unicodePattern struct {
   163  	patChars []rune
   164  	patTypes []byte
   165  }
   166  
   167  // Compile implements WildcardPattern interface.
   168  func (p *unicodePattern) Compile(patternStr string, escape byte) {
   169  	p.patChars, p.patTypes = compilePatternUnicodeCI(patternStr, escape)
   170  }
   171  
   172  // DoMatch implements WildcardPattern interface.
   173  func (p *unicodePattern) DoMatch(str string) bool {
   174  	return doMatchUnicodeCI(str, p.patChars, p.patTypes)
   175  }
   176  
   177  // compilePatternUnicodeCI handles escapes and wild cards, generate pattern weights and types.
   178  // This function is modified from stringutil.CompilePattern.
   179  func compilePatternUnicodeCI(pattern string, escape byte) (patWeights []rune, patTypes []byte) {
   180  	runes := []rune(pattern)
   181  	escapeRune := rune(escape)
   182  	lenRunes := len(runes)
   183  	patWeights = make([]rune, lenRunes)
   184  	patTypes = make([]byte, lenRunes)
   185  	patLen := 0
   186  	for i := 0; i < lenRunes; i++ {
   187  		var tp byte
   188  		var r = runes[i]
   189  		switch r {
   190  		case escapeRune:
   191  			tp = stringutil.PatMatch
   192  			if i < lenRunes-1 {
   193  				i++
   194  				r = runes[i]
   195  				if r == escapeRune || r == '_' || r == '%' {
   196  					// Valid escape.
   197  				} else {
   198  					// Invalid escape, fall back to escape byte.
   199  					// allegrosql will treat escape character as the origin value even
   200  					// the escape sequence is invalid in Go or C.
   201  					// e.g., \m is invalid in Go, but in MyALLEGROSQL we will get "m" for select '\m'.
   202  					// Following case is correct just for escape \, not for others like +.
   203  					// TODO: Add more checks for other escapes.
   204  					i--
   205  					r = escapeRune
   206  				}
   207  			}
   208  		case '_':
   209  			// %_ => _%
   210  			if patLen > 0 && patTypes[patLen-1] == stringutil.PatAny {
   211  				tp = stringutil.PatAny
   212  				r = '%'
   213  				patWeights[patLen-1], patTypes[patLen-1] = '_', stringutil.PatOne
   214  			} else {
   215  				tp = stringutil.PatOne
   216  			}
   217  		case '%':
   218  			// %% => %
   219  			if patLen > 0 && patTypes[patLen-1] == stringutil.PatAny {
   220  				continue
   221  			}
   222  			tp = stringutil.PatAny
   223  		default:
   224  			tp = stringutil.PatMatch
   225  		}
   226  		patWeights[patLen] = r
   227  		patTypes[patLen] = tp
   228  		patLen++
   229  	}
   230  	patWeights = patWeights[:patLen]
   231  	patTypes = patTypes[:patLen]
   232  	return
   233  }
   234  
   235  // doMatchUnicodeCI matches the string with patWeights and patTypes.
   236  // The algorithm has linear time complexity.
   237  // https://research.swtch.com/glob
   238  // This function is modified from stringutil.DoMatch.
   239  func doMatchUnicodeCI(str string, patWeights []rune, patTypes []byte) bool {
   240  	runes := []rune(str)
   241  	lenRunes := len(runes)
   242  	var rIdx, pIdx, nextRIdx, nextPIdx int
   243  	for pIdx < len(patWeights) || rIdx < lenRunes {
   244  		if pIdx < len(patWeights) {
   245  			switch patTypes[pIdx] {
   246  			case stringutil.PatMatch:
   247  				if rIdx < lenRunes && runeEqual(runes[rIdx], patWeights[pIdx]) {
   248  					pIdx++
   249  					rIdx++
   250  					continue
   251  				}
   252  			case stringutil.PatOne:
   253  				if rIdx < lenRunes {
   254  					pIdx++
   255  					rIdx++
   256  					continue
   257  				}
   258  			case stringutil.PatAny:
   259  				// Try to match at sIdx.
   260  				// If that doesn't work out,
   261  				// restart at sIdx+1 next.
   262  				nextPIdx = pIdx
   263  				nextRIdx = rIdx + 1
   264  				pIdx++
   265  				continue
   266  			}
   267  		}
   268  		// Mismatch. Maybe restart.
   269  		if 0 < nextRIdx && nextRIdx <= lenRunes {
   270  			pIdx = nextPIdx
   271  			rIdx = nextRIdx
   272  			continue
   273  		}
   274  		return false
   275  	}
   276  	// Matched all of pattern to all of name. Success.
   277  	return true
   278  }
   279  
   280  // runeEqual compare rune is equal with unicode_ci defCauslation
   281  func runeEqual(a, b rune) bool {
   282  	if a > 0xFFFF || b > 0xFFFF {
   283  		return a == b
   284  	}
   285  
   286  	ar, br := mapBlock[a], mapBlock[b]
   287  	if ar != br {
   288  		return false
   289  	}
   290  
   291  	if ar == longRune {
   292  		return a == b
   293  	}
   294  
   295  	return true
   296  }