gitlab.com/thomasboni/go-enry/v2@v2.8.3-0.20220418031202-30b0d7a3de98/internal/tokenizer/tokenize_test.go

package tokenizer

import (
	"fmt"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"gitlab.com/thomasboni/go-enry/v2/regex"
)

const (
	testContent = `#!/usr/bin/ruby

#!/usr/bin/env node

aaa

#!/usr/bin/env A=B foo=bar awk -f

#!python

func Tokenize(content []byte) []string {
	splitted := bytes.Fields(content)
	tokens := /* make([]string, 0, len(splitted))
	no comment -- comment
	for _, tokenByte := range splitted {
	token64 := base64.StdEncoding.EncodeToString(tokenByte)
	tokens = append(tokens, token64)
	notcatchasanumber3.5
	}*/
	othercode
	/* testing multiple

	multiline comments*/

	<!-- com
	ment -->
	<!-- comment 2-->
	ppp no comment # comment

	"literal1"

	abb (tokenByte, 0xAF02) | ,3.2L

	'literal2' notcatchasanumber3.5

	5 += number * anotherNumber
	if isTrue && isToo {
		0b00001000 >> 1
	}

	return tokens

	oneBool = 3 <= 2
	varBool = 3<=2>

	#ifndef
	#i'm not a comment if the single line comment symbol is not followed by a white

	PyErr_SetString(PyExc_RuntimeError, "Relative import is not supported for Python <=2.4.");

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
    <head>
        <title id="hola" class="">This is a XHTML sample file</title>
        <style type="text/css"><![CDATA[
            #example {
                background-color: yellow;
            }
        ]]></style>
    </head>
    <body>
        <div id="example">
            Just a simple <strong>XHTML</strong> test page.
        </div>
    </body>
</html>`
)

var (
	tokensFromTestContent = []string{"SHEBANG#!ruby", "SHEBANG#!node", "SHEBANG#!awk", "<!DOCTYPE>", "html", "PUBLIC",
		"W3C", "DTD", "XHTML", "1", "0", "Strict", "EN", "http", "www", "w3", "org", "TR", "xhtml1", "DTD", "xhtml1",
		"strict", "dtd", "<html>", "xmlns=", "<head>", "<title>", "id=", "class=", "</title>", "<style>", "type=",
		"<![CDATA[>", "example", "background", "color", "yellow", "</style>", "</head>", "<body>", "<div>", "id=",
		"<strong>", "</strong>", "</div>", "</body>", "</html>", "(", "[", "]", ")", "[", "]", "{", "(", ")", "(", ")",
		"{", "}", "(", ")", ";", "#", "/usr/bin/ruby", "#", "/usr/bin/env", "node", "aaa", "#", "/usr/bin/env", "A",
		"B", "foo", "bar", "awk", "f", "#", "python", "func", "Tokenize", "content", "byte", "string", "splitted",
		"bytes.Fields", "content", "tokens", "othercode", "ppp", "no", "comment", "abb", "tokenByte",
		"notcatchasanumber", "number", "*", "anotherNumber", "if", "isTrue", "isToo", "b", "return", "tokens",
		"oneBool", "varBool", "#ifndef", "#i", "m", "not", "a", "comment", "if", "the", "single", "line", "comment",
		"symbol", "is", "not", "followed", "by", "a", "white", "PyErr_SetString", "PyExc_RuntimeError", "This", "is",
		"a", "XHTML", "sample", "file", "Just", "a", "simple", "XHTML", "test", "page.", "-", "|", "+", "&&", "<", "<",
		"!", "!", "!", "=", "=", "!", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">", "'", ","}

	tests = []struct {
		name     string
		content  []byte
		expected []string
	}{
		{name: "content", content: []byte(testContent), expected: tokensFromTestContent},
	}
)

func TestTokenize(t *testing.T) {
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			before := string(test.content)
			tokens := Tokenize(test.content)
			after := string(test.content)
			require.Equal(t, before, after, "the input slice was modified")
			require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("tokens' slice length = %v, want %v", len(tokens), len(test.expected)))

			for i, expectedToken := range test.expected {
				assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
			}
		})
	}
}

// TestTokenizerLatin1AsUtf8 checks that Latin-1 encoded input, which is not
// valid UTF-8, is still split into the expected number of tokens.
func TestTokenizerLatin1AsUtf8(t *testing.T) {
	content := []byte("th\xe5 filling") // `thå filling` in Latin-1
	t.Logf("%v - %q", content, string(content))
	tokens := Tokenize(content)
	for i, token := range tokens {
		t.Logf("token %d, %s", i+1, token)
	}
	require.Equal(t, 3, len(tokens))
}

// TestRegexpOnInvalidUtf8 checks that the regex used by the tokenizer extracts
// the expected ASCII tokens from inputs that are invalid UTF-8 or contain
// non-ASCII characters.
func TestRegexpOnInvalidUtf8(t *testing.T) {
	origContent := []struct {
		text   string
		tokens []string
	}{
		{"th\xe0 filling", []string{"th", "filling"}},   // `thà filling` in Latin-1
		{"th\u0100 filling", []string{"th", "filling"}}, // `thĀ filling`
		{"привет, как дела?", []string{}},               // empty, no ASCII tokens
	}
	re := regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`) // a reRegularToken from tokenizer.go

	for _, content := range origContent {
		t.Run("", func(t *testing.T) {
			t.Logf("%v - %q", content, content.text)
			input := []byte(content.text)
			tokens := re.FindAll(input, -1)
			require.Equal(t, len(content.tokens), len(tokens))

			newContent := re.ReplaceAll(input, []byte(` `))
			t.Logf("content:%q, tokens:[", newContent)
			for i, token := range tokens {
				t.Logf("\t%q,", string(token))
				require.Equal(t, content.tokens[i], string(token))
			}
			t.Logf(" ]\n")
		})
	}
}

// BenchmarkTokenizer_BaselineCopy measures the cost of truncating the input to
// ByteLimit and copying it, as a baseline for BenchmarkTokenizer below.
func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		for _, test := range tests {
			if len(test.content) > ByteLimit {
				test.content = test.content[:ByteLimit]
			}
			_ = append([]byte(nil), test.content...)
		}
	}
}

func BenchmarkTokenizer(b *testing.B) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		for _, test := range tests {
			Tokenize(test.content)
		}
	}
}