github.com/lbryio/lbcd@v0.22.119/claimtrie/normalization/char_decomposer.go (about)

     1  package normalization
     2  
     3  import (
     4  	"bufio"
     5  	_ "embed"
     6  	"strconv"
     7  	"strings"
     8  	"unicode/utf8"
     9  )
    10  
// decompositions holds the raw text of the embedded NFC_v11.txt data file.
// It is read line by line in init to populate nfdMap and nfdOrder.
//
//go:embed NFC_v11.txt
var decompositions string // the data file that came from ICU 63.2

// nfdMap maps a rune to the sequence of runes that replaces it during
// decomposition. It is filled by init from the "=" and ">" lines of
// decompositions; init panics unless every stored sequence is fully expanded
// (no rune in a value is itself a key).
var nfdMap map[rune][]rune

// nfdOrder maps a rune to the ordering value read from the "start..end:value"
// lines of decompositions. repairOrdering uses it to reorder adjacent runes;
// runes without an entry read as 0.
// NOTE(review): presumably this is the canonical combining class — confirm
// against the data file.
var nfdOrder map[rune]int32
    16  
    17  func init() {
    18  	nfdMap = map[rune][]rune{}
    19  	nfdOrder = map[rune]int32{}
    20  	scanner := bufio.NewScanner(strings.NewReader(decompositions))
    21  	for scanner.Scan() {
    22  		line := scanner.Text()
    23  		if len(line) <= 0 || line[0] == '#' || line[0] == '*' {
    24  			continue
    25  		}
    26  		if strings.ContainsAny(line, ":") {
    27  			// it's a ordering def:
    28  			addOrdering(line)
    29  			continue
    30  		}
    31  		splits := strings.Split(line, "=")
    32  		if len(splits) <= 1 {
    33  			splits = strings.Split(line, ">")
    34  			if len(splits) <= 1 {
    35  				continue
    36  			}
    37  		}
    38  		key, err := strconv.ParseUint(splits[0], 16, len(splits[0])*4)
    39  		if err != nil {
    40  			panic(err)
    41  		}
    42  		splits = strings.Split(splits[1], " ")
    43  		values := make([]rune, 0, len(splits))
    44  		for j := range splits {
    45  			value, err := strconv.ParseUint(splits[j], 16, len(splits[j])*4)
    46  			if err != nil {
    47  				panic(err)
    48  			}
    49  			existing := nfdMap[rune(value)]
    50  			if len(existing) > 0 {
    51  				values = append(values, existing...)
    52  			} else {
    53  				values = append(values, rune(value))
    54  			}
    55  		}
    56  		nfdMap[rune(key)] = values
    57  	}
    58  
    59  	// run one more expansion pass to catch stragglers
    60  	for key, values := range nfdMap {
    61  		for i, value := range values {
    62  			other := nfdMap[value]
    63  			if len(other) > 0 {
    64  				newValues := make([]rune, len(values)+len(other)-1)
    65  				copy(newValues, values[:i])
    66  				copy(newValues[i:i+len(other)], other)
    67  				copy(newValues[i+len(other):], values[i+1:])
    68  				nfdMap[key] = newValues
    69  			}
    70  		}
    71  	}
    72  
    73  	// assert no more expansions are necessary:
    74  	for _, values := range nfdMap {
    75  		for _, value := range values {
    76  			other := nfdMap[value]
    77  			if len(other) > 0 {
    78  				panic("Failed in NFD expansion")
    79  			}
    80  		}
    81  	}
    82  }
    83  
    84  func addOrdering(line string) {
    85  	splits := strings.Split(line, ":")
    86  	ranges := strings.Split(splits[0], "..")
    87  
    88  	value, err := strconv.ParseUint(splits[1], 16, len(splits[1])*4)
    89  	if err != nil {
    90  		panic(err)
    91  	}
    92  
    93  	start, err := strconv.ParseUint(ranges[0], 16, len(ranges[0])*4)
    94  	if err != nil {
    95  		panic(err)
    96  	}
    97  	end := start
    98  	if len(ranges) > 1 {
    99  		end, err = strconv.ParseUint(ranges[1], 16, len(ranges[0])*4)
   100  		if err != nil {
   101  			panic(err)
   102  		}
   103  	}
   104  	for i := start; i <= end; i++ {
   105  		nfdOrder[rune(i)] = int32(value)
   106  	}
   107  }
   108  
   109  func decompose(name []byte) []byte {
   110  	// see https://unicode.org/reports/tr15/ section 1.3
   111  	runes := make([]rune, 0, len(name)) // we typically use ascii don't increase the length
   112  	for i := 0; i < len(name); {
   113  		r, w := utf8.DecodeRune(name[i:])
   114  		if r == utf8.RuneError && w < 2 {
   115  			// HACK: their RuneError is actually a valid character if coming from a width of 2 or more
   116  			return name
   117  		}
   118  		replacements := nfdMap[r]
   119  		if len(replacements) > 0 {
   120  			runes = append(runes, replacements...)
   121  		} else {
   122  			hanguls := decomposeHangul(r)
   123  			if len(hanguls) > 0 {
   124  				runes = append(runes, hanguls...)
   125  			} else {
   126  				runes = append(runes, r)
   127  			}
   128  		}
   129  		i += w
   130  	}
   131  	repairOrdering(runes)
   132  	return []byte(string(runes))
   133  }
   134  
   135  func decomposeHangul(s rune) []rune {
   136  	// see https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
   137  
   138  	const SBase int32 = 0xAC00
   139  	const LBase int32 = 0x1100
   140  	const VBase int32 = 0x1161
   141  	const TBase int32 = 0x11A7
   142  	const LCount int32 = 19
   143  	const VCount int32 = 21
   144  	const TCount int32 = 28
   145  	const NCount = VCount * TCount // 588
   146  	const SCount = LCount * NCount // 11172
   147  
   148  	SIndex := s - SBase
   149  	if SIndex < 0 || SIndex >= SCount {
   150  		return nil
   151  	}
   152  	L := LBase + SIndex/NCount
   153  	V := VBase + (SIndex%NCount)/TCount
   154  	T := TBase + SIndex%TCount
   155  	result := []rune{L, V}
   156  	if T != TBase {
   157  		result = append(result, T)
   158  	}
   159  	return result
   160  }
   161  
   162  func repairOrdering(runes []rune) {
   163  	for i := 1; i < len(runes); i++ {
   164  		a := runes[i-1]
   165  		b := runes[i]
   166  		oa := nfdOrder[a]
   167  		ob := nfdOrder[b]
   168  		if oa > ob && ob > 0 {
   169  			runes[i-1], runes[i] = b, a
   170  			if i >= 2 {
   171  				i -= 2
   172  			} else {
   173  				i = 0
   174  			}
   175  		}
   176  	}
   177  }