github.com/lbryio/lbcd@v0.22.119/claimtrie/normalization/case_folder.go (about)

     1  package normalization
     2  
     3  import (
     4  	"bytes"
     5  	_ "embed"
     6  	"regexp"
     7  	"strconv"
     8  	"strings"
     9  	"unicode/utf8"
    10  )
    11  
// v11 holds the full text of CaseFolding_v11.txt, embedded at build time.
// NOTE(review): presumably the Unicode 11 CaseFolding.txt table — confirm
// against the embedded file.
//
//go:embed CaseFolding_v11.txt
var v11 string

// foldMap maps a rune to the sequence of runes that replaces it. It is
// populated once in init from the "C" and "F" entries of v11 and is read by
// caseFold.
var foldMap map[rune][]rune
    16  
    17  func init() {
    18  	foldMap = map[rune][]rune{}
    19  	r, _ := regexp.Compile(`([[:xdigit:]]+?); (.); ([[:xdigit:] ]+?);`)
    20  	matches := r.FindAllStringSubmatch(v11, 1000000000)
    21  	for i := range matches {
    22  		if matches[i][2] == "C" || matches[i][2] == "F" {
    23  			key, err := strconv.ParseUint(matches[i][1], 16, len(matches[i][1])*4)
    24  			if err != nil {
    25  				panic(err)
    26  			}
    27  			splits := strings.Split(matches[i][3], " ")
    28  			var values []rune
    29  			for j := range splits {
    30  				value, err := strconv.ParseUint(splits[j], 16, len(splits[j])*4)
    31  				if err != nil {
    32  					panic(err)
    33  				}
    34  				values = append(values, rune(value))
    35  			}
    36  			foldMap[rune(key)] = values
    37  		}
    38  	}
    39  }
    40  
    41  func caseFold(name []byte) []byte {
    42  	var b bytes.Buffer
    43  	b.Grow(len(name))
    44  	for i := 0; i < len(name); {
    45  		r, w := utf8.DecodeRune(name[i:])
    46  		if r == utf8.RuneError && w < 2 {
    47  			// HACK: their RuneError is actually a valid character if coming from a width of 2 or more
    48  			return name
    49  		}
    50  		replacements := foldMap[r]
    51  		if len(replacements) > 0 {
    52  			for j := range replacements {
    53  				b.WriteRune(replacements[j])
    54  			}
    55  		} else {
    56  			b.WriteRune(r)
    57  		}
    58  		i += w
    59  	}
    60  	return b.Bytes()
    61  }