gitlab.com/thomasboni/go-enry/v2@v2.8.3-0.20220418031202-30b0d7a3de98/internal/tokenizer/tokenize_test.go

package tokenizer

import (
	"fmt"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"gitlab.com/thomasboni/go-enry/v2/regex"
)

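// testContent mixes shebang lines, Go-like code with line and block comments,
// string literals, numbers, operators and an XHTML snippet, so a single
// Tokenize call exercises the different kinds of tokens the tokenizer extracts.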
const (
	testContent = `#!/usr/bin/ruby

#!/usr/bin/env node

aaa

#!/usr/bin/env A=B foo=bar awk -f

#!python

func Tokenize(content []byte) []string {
	splitted := bytes.Fields(content)
	tokens := /* make([]string, 0, len(splitted))
	no comment -- comment
	for _, tokenByte := range splitted {
		token64 := base64.StdEncoding.EncodeToString(tokenByte)
		tokens = append(tokens, token64)
		notcatchasanumber3.5
	}*/
othercode
	/* testing multiple 
	
		multiline comments*/

<!-- com
	ment -->
<!-- comment 2-->
ppp no comment # comment

"literal1"

abb (tokenByte, 0xAF02) | ,3.2L

'literal2' notcatchasanumber3.5

	5 += number * anotherNumber
	if isTrue && isToo {
		0b00001000 >> 1
	}

	return tokens

oneBool = 3 <= 2
varBool = 3<=2>
 
#ifndef
#i'm not a comment if the single line comment symbol is not followed by a white

  PyErr_SetString(PyExc_RuntimeError, "Relative import is not supported for Python <=2.4.");

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
    <head>
        <title id="hola" class="">This is a XHTML sample file</title>
        <style type="text/css"><![CDATA[
            #example {
                background-color: yellow;
            }
        ]]></style>
    </head>
    <body>
        <div id="example">
            Just a simple <strong>XHTML</strong> test page.
        </div>
    </body>
</html>`
)

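// tokensFromTestContent is the token slice Tokenize is expected to return for
// testContent: shebang markers first, then SGML tags and attributes,
// punctuation, regular tokens, and finally the remaining operator characters.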
var (
	tokensFromTestContent = []string{"SHEBANG#!ruby", "SHEBANG#!node", "SHEBANG#!awk", "<!DOCTYPE>", "html", "PUBLIC",
		"W3C", "DTD", "XHTML", "1", "0", "Strict", "EN", "http", "www", "w3", "org", "TR", "xhtml1", "DTD", "xhtml1",
		"strict", "dtd", "<html>", "xmlns=", "<head>", "<title>", "id=", "class=", "</title>", "<style>", "type=",
		"<![CDATA[>", "example", "background", "color", "yellow", "</style>", "</head>", "<body>", "<div>", "id=",
		"<strong>", "</strong>", "</div>", "</body>", "</html>", "(", "[", "]", ")", "[", "]", "{", "(", ")", "(", ")",
		"{", "}", "(", ")", ";", "#", "/usr/bin/ruby", "#", "/usr/bin/env", "node", "aaa", "#", "/usr/bin/env", "A",
		"B", "foo", "bar", "awk", "f", "#", "python", "func", "Tokenize", "content", "byte", "string", "splitted",
		"bytes.Fields", "content", "tokens", "othercode", "ppp", "no", "comment", "abb", "tokenByte",
		"notcatchasanumber", "number", "*", "anotherNumber", "if", "isTrue", "isToo", "b", "return", "tokens",
		"oneBool", "varBool", "#ifndef", "#i", "m", "not", "a", "comment", "if", "the", "single", "line", "comment",
		"symbol", "is", "not", "followed", "by", "a", "white", "PyErr_SetString", "PyExc_RuntimeError", "This", "is",
		"a", "XHTML", "sample", "file", "Just", "a", "simple", "XHTML", "test", "page.", "-", "|", "+", "&&", "<", "<",
		"!", "!", "!", "=", "=", "!", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">", "'", ","}

	tests = []struct {
		name     string
		content  []byte
		expected []string
	}{
		{name: "content", content: []byte(testContent), expected: tokensFromTestContent},
	}
)

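// TestTokenize verifies that Tokenize returns the expected tokens in the
// expected order and that it does not modify the input slice it is given.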
func TestTokenize(t *testing.T) {
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			before := string(test.content)
			tokens := Tokenize(test.content)
			after := string(test.content)
			require.Equal(t, before, after, "the input slice was modified")
			require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("tokens slice length = %v, want %v", len(tokens), len(test.expected)))

			for i, expectedToken := range test.expected {
				assert.Equal(t, expectedToken, tokens[i], fmt.Sprintf("token = %v, want %v", tokens[i], expectedToken))
			}
		})
	}
}

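// TestTokenizerLatin1AsUtf8 feeds Tokenize a byte sequence that is Latin-1
// (0xE5, "å") rather than valid UTF-8 and checks that it still produces the
// expected number of tokens instead of failing on the invalid encoding.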
func TestTokenizerLatin1AsUtf8(t *testing.T) {
	content := []byte("th\xe5 filling") // `th� filling`
	t.Logf("%v - %q", content, string(content))
	tokens := Tokenize(content)
	for i, token := range tokens {
		t.Logf("token %d, %s", i+1, token)
	}
	require.Equal(t, 3, len(tokens))
}

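// TestRegexpOnInvalidUtf8 applies the regular-token pattern directly to inputs
// that are invalid UTF-8 or contain no ASCII word characters, documenting how
// FindAll and ReplaceAll behave on such content.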
func TestRegexpOnInvalidUtf8(t *testing.T) {
	origContent := []struct {
		text   string
		tokens []string
	}{
		{"th\xe0 filling", []string{"th", "filling"}},   // `th� filling`
		{"th\u0100 filling", []string{"th", "filling"}}, // `thĀ filling`
		{"привет, как дела?", []string{}},               // empty, no ASCII tokens
	}
	re := regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`) // the reRegularToken pattern from tokenizer.go

	for _, content := range origContent {
		t.Run("", func(t *testing.T) {
			t.Logf("%v - %q", content, content.text)
			input := []byte(content.text)
			tokens := re.FindAll(input, -1)
			require.Equal(t, len(content.tokens), len(tokens))

			newContent := re.ReplaceAll(input, []byte(` `))
			t.Logf("content:%q, tokens:[", newContent)
			for i, token := range tokens {
				t.Logf("\t%q,", string(token))
				require.Equal(t, content.tokens[i], string(token))
			}
			t.Logf(" ]\n")
		})
	}
}

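// BenchmarkTokenizer_BaselineCopy measures only the ByteLimit truncation and a
// copy of the input, giving a baseline to compare BenchmarkTokenizer against.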
func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		for _, test := range tests {
			if len(test.content) > ByteLimit {
				test.content = test.content[:ByteLimit]
			}
			_ = append([]byte(nil), test.content...)
		}
	}
}

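// BenchmarkTokenizer measures Tokenize itself over the same test contents.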
func BenchmarkTokenizer(b *testing.B) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		for _, test := range tests {
			Tokenize(test.content)
		}
	}
}