github.com/rudderlabs/rudder-go-kit@v0.30.0/sanitize/sanitize.go (about)

     1  package sanitize
     2  
     3  import (
     4  	"strings"
     5  	"unicode"
     6  
     7  	"golang.org/x/text/unicode/rangetable"
     8  )
     9  
    10  // invisibleRunes unicode.IsPrint does not include all invisible characters,
    11  // so I got this list from https://invisible-characters.com/
    12  var invisibleRunes = []rune{
    13  	'\u0000', // NULL
    14  	'\u0009', // CHARACTER TABULATION
    15  	'\u00A0', // NO-BREAK SPACE
    16  	'\u00AD', // SOFT HYPHEN
    17  	'\u034F', // COMBINING GRAPHEME JOINER
    18  	'\u061C', // ARABIC LETTER MARK
    19  	'\u115F', // HANGUL CHOSEONG FILLER
    20  	'\u1160', // HANGUL JUNGSEONG FILLER
    21  	'\u17B4', // KHMER VOWEL INHERENT AQ
    22  	'\u17B5', // KHMER VOWEL INHERENT AA
    23  	'\u180E', // MONGOLIAN VOWEL SEPARATOR
    24  	'\u2000', // EN QUAD
    25  	'\u2001', // EM QUAD
    26  	'\u2002', // EN SPACE
    27  	'\u2003', // EM SPACE
    28  	'\u2004', // THREE-PER-EM SPACE
    29  	'\u2005', // FOUR-PER-EM SPACE
    30  	'\u2006', // SIX-PER-EM SPACE
    31  	'\u2007', // FIGURE SPACE
    32  	'\u2008', // PUNCTUATION SPACE
    33  	'\u2009', // THIN SPACE
    34  	'\u200A', // HAIR SPACE
    35  	'\u200B', // ZERO WIDTH SPACE
    36  	'\u200C', // ZERO WIDTH NON-JOINER
    37  	'\u200D', // ZERO WIDTH JOINER
    38  	'\u200E', // LEFT-TO-RIGHT MARK
    39  	'\u200F', // RIGHT-TO-LEFT MARK
    40  	'\u202F', // NARROW NO-BREAK SPACE
    41  	'\u205F', // MEDIUM MATHEMATICAL SPACE
    42  	'\u2060', // WORD JOINER
    43  	'\u2061', // FUNCTION APPLICATION
    44  	'\u2062', // INVISIBLE TIMES
    45  	'\u2063', // INVISIBLE SEPARATOR
    46  	'\u2064', // INVISIBLE PLUS
    47  	'\u206A', // INHIBIT SYMMETRIC SWAPPING
    48  	'\u206B', // ACTIVATE SYMMETRIC SWAPPING
    49  	'\u206C', // INHIBIT ARABIC FORM SHAPING
    50  	'\u206D', // ACTIVATE ARABIC FORM SHAPING
    51  	'\u206E', // NATIONAL DIGIT SHAPES
    52  	'\u206F', // NOMINAL DIGIT SHAPES
    53  	'\u3000', // IDEOGRAPHIC SPACE
    54  	'\u2800', // BRAILLE PATTERN BLANK
    55  	'\u3164', // HANGUL FILLER
    56  	'\uFEFF', // ZERO WIDTH NO-BREAK SPACE
    57  	'\uFFA0', // HALF WIDTH HANGUL FILLER
    58  }
    59  
    60  var invisibleRangeTable *unicode.RangeTable
    61  
    62  func init() {
    63  	invisibleRangeTable = rangetable.New(invisibleRunes...)
    64  }
    65  
    66  // Unicode removes irregularly invisible characters from a string.
    67  //
    68  // Irregularly invisible characters are defined as:
    69  //   - Non-printable characters according to Go's unicode package (unicode.IsPrint).
    70  //   - Characters in the invisibleRunes list (https://invisible-characters.com/).
    71  //
    72  // Note: Regular ASCII space (0x20) is not removed.
    73  func Unicode(str string) string {
    74  	return strings.Map(func(r rune) rune {
    75  		if unicode.Is(invisibleRangeTable, r) || !unicode.IsPrint(r) {
    76  			return -1
    77  		}
    78  		return r
    79  	}, str)
    80  }