github.com/seeker-insurance/kit@v0.0.13/str/normalize_ascii.go (about) 1 package str 2 3 import ( 4 "bytes" 5 6 "github.com/seeker-insurance/kit/runeset" 7 ) 8 9 const ( 10 //ASCIIPunct is contains all ASCII punctuation, identical to string.punctuation in python 3.6 11 ASCIIPunct = `$+<=>^|~!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~` + "`" 12 13 //ASCIIWhitespace is a list of all ASCII whitespace, identical to string.Whitespace in python 3.6 14 ASCIIWhitespace = " \t\n\r\x0b\x0c" 15 16 //ASCIIPrintable is a list of all ASCII printable characters, identical to string.printable in python 3.6 17 ASCIIPrintable = `0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~ \t\n\r\x0b\x0c` + "`" 18 19 //ASCIILowercase is all lowercase letters in the latin alphabet. (code points in [97, 122]) 20 ASCIILowercase = `abcdefghijklmnopqrstuvwxyz` 21 22 //ASCIIUpperCase is all uppercase letters in the latin alphabet (code points in [65, 90]) 23 ASCIIUpperCase = `ABCDEFGHIJKLMNOPQRSTUVWXYZ` 24 25 ASCIILetters = ASCIILowercase + ASCIIUpperCase 26 27 //ASCIINumerics are the numerals 0-9 (code points in [30, 39]) 28 ASCIINumerics = "0123456789" 29 30 ASCIIAlphaNumeric = ASCIILowercase + ASCIIUpperCase + ASCIINumerics 31 32 //ASCII is all ASCII characters, comprising the unicode code points 0-127. 33 ASCII = "`" + `\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{|}~\x7f` 34 ) 35 36 var ( 37 //ASCIIPunctSet contains all ASCII punctuation. Equivalent ot set(string.punctuation) in python 3.6 38 ASCIIPunctSet = runeset.FromString(ASCIIPunct) 39 40 //ASCIIWhitespaceSet contains all ASCII whitespace, identical to set(string.whitespace) in python 3.6 41 ASCIIWhitespaceSet = runeset.FromString(ASCIIWhitespace) 42 43 ASCIISet = runeset.FromString(ASCII) 44 ) 45 46 //RemovePunctuation removes punctuation (as defined by unicode) from a string. 47 //Note that this converts to runes and back to UTF-8, so RemoveWhiteSpace(s) == s 48 //for a string that contains non-punctuation characters does not necessarially hold, since the code points may differ. 49 func RemoveASCIIPunctuation(s string) string { 50 return removeRunesInSet(s, ASCIIPunctSet) 51 } 52 53 //RemoveASCIIWhiteSpace returns a copy of the string with the ASCII whitespace (" \t\n\r\x0b\x0c") removed. 54 func RemoveASCIIWhiteSpace(s string) string { 55 buf := bytes.Buffer{} 56 for _, r := range s { 57 if !ASCIIWhitespaceSet.Contains(r) { 58 buf.WriteRune(r) 59 } 60 } 61 return buf.String() 62 } 63 64 //RemoveNonASCII returns a copy of the string with all non-ASCII runes removed. 65 func RemoveNonASCII(s string) string { 66 ascii := make([]byte, 0, len(s)) 67 for _, r := range s { 68 if r < 128 { 69 ascii = append(ascii, byte(r)) 70 } 71 } 72 return string(ascii) 73 }