github.com/seeker-insurance/kit@v0.0.13/str/normalize_unicode.go (about) 1 package str 2 3 import ( 4 "strings" 5 "unicode" 6 7 "golang.org/x/text/runes" 8 "golang.org/x/text/transform" 9 "golang.org/x/text/unicode/norm" 10 "golang.org/x/text/unicode/rangetable" 11 ) 12 13 func isNonSpacingMark(r rune) bool { 14 return unicode.Is(unicode.Mn, r) // Mn: nonspacing marks 15 } 16 17 var ( 18 printable = rangetable.Merge(unicode.PrintRanges...) 19 UnicodeNonSpacingMarksSet = runes.In(unicode.Mn) 20 UnicodePuncuationSet = runes.In(unicode.Punct) 21 UnicodeControlSet = runes.In(unicode.C) 22 UnicodePrintable = runes.In(printable) 23 24 nonSpacingOrPunctuationOrControl = runes.In(rangetable.Merge(unicode.Mn, unicode.Punct, unicode.C)) 25 UnicodeNonPrintable = runes.NotIn(printable) 26 27 removeNonSpacingMarksPunctuationAndControl = runes.Remove(nonSpacingOrPunctuationOrControl) 28 removeUnicodeNonSpacingMarks = runes.Remove(UnicodeNonSpacingMarksSet) 29 removeUnicodePunctuation = runes.Remove(UnicodePuncuationSet) 30 removeUnicodeControl = runes.Remove(UnicodeControlSet) 31 removeUnicodeNonPrintable = runes.Remove(UnicodeNonPrintable) 32 ) 33 34 //RemoveDiacriticsNFC creates a copy of s with the diacritics removed. It also transforms it to NFC. 35 //It is NOT thread Safe 36 func RemoveDiacriticsNFC(s string) string { 37 var diacriticRemover = transform.Chain(norm.NFD, removeUnicodeNonSpacingMarks, norm.NFC) 38 out, _, _ := transform.String(diacriticRemover, s) 39 return out 40 } 41 42 //ExtremeNormalization heavily normalizes a string for purposes of comparison and safety. 43 //It lowercases the string, removes ALL nonspacing marks, nonprinting marks, whitespace, control characters, and punctuation, and transforms the string to NFKC encoding. This can and will lose a lot of information! 44 func ExtremeNormalization(s string) string { 45 46 extremeNormalizer := transform.Chain( //this is created here because transform.Chain is not thread-safe 47 norm.NFKD, 48 removeNonSpacingMarksPunctuationAndControl, 49 removeUnicodeNonPrintable, 50 norm.NFKC, 51 ) 52 s = strings.ToLower(s) 53 s = RemoveASCIIWhiteSpace(s) 54 s = RemoveASCIIPunctuation(s) 55 s, _, _ = transform.String(extremeNormalizer, s) 56 return s 57 }