github.com/rudderlabs/rudder-go-kit@v0.30.0/sanitize/sanitize.go (about) 1 package sanitize 2 3 import ( 4 "strings" 5 "unicode" 6 7 "golang.org/x/text/unicode/rangetable" 8 ) 9 10 // invisibleRunes unicode.IsPrint does not include all invisible characters, 11 // so I got this list from https://invisible-characters.com/ 12 var invisibleRunes = []rune{ 13 '\u0000', // NULL 14 '\u0009', // CHARACTER TABULATION 15 '\u00A0', // NO-BREAK SPACE 16 '\u00AD', // SOFT HYPHEN 17 '\u034F', // COMBINING GRAPHEME JOINER 18 '\u061C', // ARABIC LETTER MARK 19 '\u115F', // HANGUL CHOSEONG FILLER 20 '\u1160', // HANGUL JUNGSEONG FILLER 21 '\u17B4', // KHMER VOWEL INHERENT AQ 22 '\u17B5', // KHMER VOWEL INHERENT AA 23 '\u180E', // MONGOLIAN VOWEL SEPARATOR 24 '\u2000', // EN QUAD 25 '\u2001', // EM QUAD 26 '\u2002', // EN SPACE 27 '\u2003', // EM SPACE 28 '\u2004', // THREE-PER-EM SPACE 29 '\u2005', // FOUR-PER-EM SPACE 30 '\u2006', // SIX-PER-EM SPACE 31 '\u2007', // FIGURE SPACE 32 '\u2008', // PUNCTUATION SPACE 33 '\u2009', // THIN SPACE 34 '\u200A', // HAIR SPACE 35 '\u200B', // ZERO WIDTH SPACE 36 '\u200C', // ZERO WIDTH NON-JOINER 37 '\u200D', // ZERO WIDTH JOINER 38 '\u200E', // LEFT-TO-RIGHT MARK 39 '\u200F', // RIGHT-TO-LEFT MARK 40 '\u202F', // NARROW NO-BREAK SPACE 41 '\u205F', // MEDIUM MATHEMATICAL SPACE 42 '\u2060', // WORD JOINER 43 '\u2061', // FUNCTION APPLICATION 44 '\u2062', // INVISIBLE TIMES 45 '\u2063', // INVISIBLE SEPARATOR 46 '\u2064', // INVISIBLE PLUS 47 '\u206A', // INHIBIT SYMMETRIC SWAPPING 48 '\u206B', // ACTIVATE SYMMETRIC SWAPPING 49 '\u206C', // INHIBIT ARABIC FORM SHAPING 50 '\u206D', // ACTIVATE ARABIC FORM SHAPING 51 '\u206E', // NATIONAL DIGIT SHAPES 52 '\u206F', // NOMINAL DIGIT SHAPES 53 '\u3000', // IDEOGRAPHIC SPACE 54 '\u2800', // BRAILLE PATTERN BLANK 55 '\u3164', // HANGUL FILLER 56 '\uFEFF', // ZERO WIDTH NO-BREAK SPACE 57 '\uFFA0', // HALF WIDTH HANGUL FILLER 58 } 59 60 var invisibleRangeTable *unicode.RangeTable 61 62 func init() { 63 invisibleRangeTable = rangetable.New(invisibleRunes...) 64 } 65 66 // Unicode removes irregularly invisible characters from a string. 67 // 68 // Irregularly invisible characters are defined as: 69 // - Non-printable characters according to Go's unicode package (unicode.IsPrint). 70 // - Characters in the invisibleRunes list (https://invisible-characters.com/). 71 // 72 // Note: Regular ASCII space (0x20) is not removed. 73 func Unicode(str string) string { 74 return strings.Map(func(r rune) rune { 75 if unicode.Is(invisibleRangeTable, r) || !unicode.IsPrint(r) { 76 return -1 77 } 78 return r 79 }, str) 80 }