github.com/seeker-insurance/kit@v0.0.13/str/normalize_ascii.go

github.com/seeker-insurance/kit@v0.0.13/str/normalize_ascii.go (about)

     1  package str
     2  
     3  import (
     4  	"bytes"
     5  
     6  	"github.com/seeker-insurance/kit/runeset"
     7  )
     8  
     9  const (
    10  	//ASCIIPunct is contains all ASCII punctuation, identical to string.punctuation in python 3.6
    11  	ASCIIPunct = `$+<=>^|~!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~` + "`"
    12  
    13  	//ASCIIWhitespace is a list of all ASCII whitespace, identical to string.Whitespace in python 3.6
    14  	ASCIIWhitespace = " \t\n\r\x0b\x0c"
    15  
    16  	//ASCIIPrintable is a list of all ASCII printable characters, identical to string.printable in python 3.6
    17  	ASCIIPrintable = `0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~ \t\n\r\x0b\x0c` + "`"
    18  
    19  	//ASCIILowercase is all lowercase letters in the latin alphabet. (code points in [97, 122])
    20  	ASCIILowercase = `abcdefghijklmnopqrstuvwxyz`
    21  
    22  	//ASCIIUpperCase is all uppercase letters in the latin alphabet (code points in [65, 90])
    23  	ASCIIUpperCase = `ABCDEFGHIJKLMNOPQRSTUVWXYZ`
    24  
    25  	ASCIILetters = ASCIILowercase + ASCIIUpperCase
    26  
    27  	//ASCIINumerics are the numerals 0-9 (code points in [30, 39])
    28  	ASCIINumerics = "0123456789"
    29  
    30  	ASCIIAlphaNumeric = ASCIILowercase + ASCIIUpperCase + ASCIINumerics
    31  
    32  	//ASCII is all ASCII characters, comprising the unicode code points 0-127.
    33  	ASCII = "`" + `\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{|}~\x7f`
    34  )
    35  
var (
	// ASCIIPunctSet is the rune set built from ASCIIPunct, i.e. the ASCII punctuation characters.
	// Equivalent to set(string.punctuation) in python 3.6.
	ASCIIPunctSet = runeset.FromString(ASCIIPunct)

	// ASCIIWhitespaceSet is the rune set built from ASCIIWhitespace, i.e. the ASCII whitespace
	// characters. Equivalent to set(string.whitespace) in python 3.6.
	ASCIIWhitespaceSet = runeset.FromString(ASCIIWhitespace)

	// ASCIISet is the rune set built from the ASCII constant; its membership is exactly
	// the runes that appear in that string.
	ASCIISet = runeset.FromString(ASCII)
)
    45  
// RemoveASCIIPunctuation hands s and ASCIIPunctSet to removeRunesInSet and returns its result.
// The set holds ASCII punctuation only (it is built from ASCIIPunct), not the wider
// Unicode punctuation categories.
// NOTE(review): removeRunesInSet is defined elsewhere; presumably it returns a copy of s
// with every rune in the set dropped. The earlier comment warned that the string is decoded
// to runes and re-encoded as UTF-8, so RemoveASCIIPunctuation(s) == s does not necessarily
// hold even when s contains no punctuation (e.g. malformed UTF-8) — confirm against the helper.
func RemoveASCIIPunctuation(s string) string {
	return removeRunesInSet(s, ASCIIPunctSet)
}
    52  
    53  //RemoveASCIIWhiteSpace returns a copy of the string with the ASCII whitespace (" \t\n\r\x0b\x0c") removed.
    54  func RemoveASCIIWhiteSpace(s string) string {
    55  	buf := bytes.Buffer{}
    56  	for _, r := range s {
    57  		if !ASCIIWhitespaceSet.Contains(r) {
    58  			buf.WriteRune(r)
    59  		}
    60  	}
    61  	return buf.String()
    62  }
    63  
    64  //RemoveNonASCII returns a copy of the string with all non-ASCII runes removed.
    65  func RemoveNonASCII(s string) string {
    66  	ascii := make([]byte, 0, len(s))
    67  	for _, r := range s {
    68  		if r < 128 {
    69  			ascii = append(ascii, byte(r))
    70  		}
    71  	}
    72  	return string(ascii)
    73  }