github.com/seeker-insurance/kit@v0.0.13/str/normalize_unicode.go (about)

     1  package str
     2  
     3  import (
     4  	"strings"
     5  	"unicode"
     6  
     7  	"golang.org/x/text/runes"
     8  	"golang.org/x/text/transform"
     9  	"golang.org/x/text/unicode/norm"
    10  	"golang.org/x/text/unicode/rangetable"
    11  )
    12  
    13  func isNonSpacingMark(r rune) bool {
    14  	return unicode.Is(unicode.Mn, r) // Mn: nonspacing marks
    15  }
    16  
    17  var (
    18  	printable                 = rangetable.Merge(unicode.PrintRanges...)
    19  	UnicodeNonSpacingMarksSet = runes.In(unicode.Mn)
    20  	UnicodePuncuationSet      = runes.In(unicode.Punct)
    21  	UnicodeControlSet         = runes.In(unicode.C)
    22  	UnicodePrintable          = runes.In(printable)
    23  
    24  	nonSpacingOrPunctuationOrControl = runes.In(rangetable.Merge(unicode.Mn, unicode.Punct, unicode.C))
    25  	UnicodeNonPrintable              = runes.NotIn(printable)
    26  
    27  	removeNonSpacingMarksPunctuationAndControl = runes.Remove(nonSpacingOrPunctuationOrControl)
    28  	removeUnicodeNonSpacingMarks               = runes.Remove(UnicodeNonSpacingMarksSet)
    29  	removeUnicodePunctuation                   = runes.Remove(UnicodePuncuationSet)
    30  	removeUnicodeControl                       = runes.Remove(UnicodeControlSet)
    31  	removeUnicodeNonPrintable                  = runes.Remove(UnicodeNonPrintable)
    32  )
    33  
    34  //RemoveDiacriticsNFC creates a copy of s with the diacritics removed. It also transforms it to NFC.
    35  //It is NOT thread Safe
    36  func RemoveDiacriticsNFC(s string) string {
    37  	var diacriticRemover = transform.Chain(norm.NFD, removeUnicodeNonSpacingMarks, norm.NFC)
    38  	out, _, _ := transform.String(diacriticRemover, s)
    39  	return out
    40  }
    41  
    42  //ExtremeNormalization heavily normalizes a string for purposes of comparison and safety.
    43  //It lowercases the string, removes ALL nonspacing marks, nonprinting marks, whitespace, control characters, and punctuation, and transforms the string to NFKC encoding. This can and will lose a lot of information!
    44  func ExtremeNormalization(s string) string {
    45  
    46  	extremeNormalizer := transform.Chain( //this is created here because transform.Chain is not thread-safe
    47  		norm.NFKD,
    48  		removeNonSpacingMarksPunctuationAndControl,
    49  		removeUnicodeNonPrintable,
    50  		norm.NFKC,
    51  	)
    52  	s = strings.ToLower(s)
    53  	s = RemoveASCIIWhiteSpace(s)
    54  	s = RemoveASCIIPunctuation(s)
    55  	s, _, _ = transform.String(extremeNormalizer, s)
    56  	return s
    57  }