github.com/vugu/vugu@v0.3.6-0.20240430171613-3f6f402e014b/internal/htmlx/token_test.go

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package htmlx

import (
	"bytes"
	"io"
	"os"
	"reflect"
	"runtime"
	"strings"
	"testing"
)

type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
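	// For example, the "tags" case below tokenizes "<a>b<c/>d</e>" into
	// five tokens, written here as "<a>$b$<c/>$d$</e>".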
	golden string
}

var tokenTests = []tokenTest{
	{
		"empty",
		"",
		"",
	},
	// A single text node. The tokenizer should not break text nodes on whitespace,
	// nor should it normalize whitespace within a text node.
	{
		"text",
		"foo  bar",
		"foo  bar",
	},
	// An entity.
	{
		"entity",
		"one &lt; two",
		"one &lt; two",
	},
	// A start, self-closing and end tag. The tokenizer does not care if the start
	// and end tokens don't match; that is the job of the parser.
	{
		"tags",
		"<a>b<c/>d</e>",
		"<a>$b$<c/>$d$</e>",
	},
	// Angle brackets that aren't a tag.
	{
		"not a tag #0",
		"<",
		"&lt;",
	},
	{
		"not a tag #1",
		"</",
		"&lt;/",
	},
	{
		"not a tag #2",
		"</>",
		"<!---->",
	},
	{
		"not a tag #3",
		"a</>b",
		"a$<!---->$b",
	},
	{
		"not a tag #4",
		"</ >",
		"<!-- -->",
	},
	{
		"not a tag #5",
		"</.",
		"<!--.-->",
	},
	{
		"not a tag #6",
		"</.>",
		"<!--.-->",
	},
	{
		"not a tag #7",
		"a < b",
		"a &lt; b",
	},
	{
		"not a tag #8",
		"<.>",
		"&lt;.&gt;",
	},
	{
		"not a tag #9",
		"a<<<b>>>c",
		"a&lt;&lt;$<b>$&gt;&gt;c",
	},
	{
		"not a tag #10",
		"if x<0 and y < 0 then x*y>0",
		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
	},
	{
		"not a tag #11",
		"<<p>",
		"&lt;$<p>",
	},
	// EOF in a tag name.
	{
		"tag name eof #0",
		"<a",
		"",
	},
	{
		"tag name eof #1",
		"<a ",
		"",
	},
	{
		"tag name eof #2",
		"a<b",
		"a",
	},
	{
		"tag name eof #3",
		"<a><b",
		"<a>",
	},
	{
		"tag name eof #4",
		`<a x`,
		``,
	},
	// Some malformed tags that are missing a '>'.
	{
		"malformed tag #0",
		`<p</p>`,
		`<p< p="">`,
	},
	{
		"malformed tag #1",
		`<p </p>`,
		`<p <="" p="">`,
	},
	{
		"malformed tag #2",
		`<p id`,
		``,
	},
	{
		"malformed tag #3",
		`<p id=`,
		``,
	},
	{
		"malformed tag #4",
		`<p id=>`,
		`<p id="">`,
	},
	{
		"malformed tag #5",
		`<p id=0`,
		``,
	},
	{
		"malformed tag #6",
		`<p id=0</p>`,
		`<p id="0&lt;/p">`,
	},
	{
		"malformed tag #7",
		`<p id="0</p>`,
		``,
	},
	{
		"malformed tag #8",
		`<p id="0"</p>`,
		`<p id="0" <="" p="">`,
	},
	{
		"malformed tag #9",
		`<p></p id`,
		`<p>`,
	},
	// Raw text and RCDATA.
	{
		"basic raw text",
		"<script><a></b></script>",
		"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
	},
	{
		"unfinished script end tag",
		"<SCRIPT>a</SCR",
		"<script>$a&lt;/SCR",
	},
	{
		"broken script end tag",
		"<SCRIPT>a</SCR ipt>",
		"<script>$a&lt;/SCR ipt&gt;",
	},
	{
		"EOF in script end tag",
		"<SCRIPT>a</SCRipt",
		"<script>$a&lt;/SCRipt",
	},
	{
		"scriptx end tag",
		"<SCRIPT>a</SCRiptx",
		"<script>$a&lt;/SCRiptx",
	},
	{
		"' ' completes script end tag",
		"<SCRIPT>a</SCRipt ",
		"<script>$a",
	},
	{
		"'>' completes script end tag",
		"<SCRIPT>a</SCRipt>",
		"<script>$a$</script>",
	},
	{
		"self-closing script end tag",
		"<SCRIPT>a</SCRipt/>",
		"<script>$a$</script>",
	},
	{
		"nested script tag",
		"<SCRIPT>a</SCRipt<script>",
		"<script>$a&lt;/SCRipt&lt;script&gt;",
	},
	{
		"script end tag after unfinished",
		"<SCRIPT>a</SCRipt</script>",
		"<script>$a&lt;/SCRipt$</script>",
	},
	{
		"script/style mismatched tags",
		"<script>a</style>",
		"<script>$a&lt;/style&gt;",
	},
	{
		"style element with entity",
		"<style>&apos;",
		"<style>$&amp;apos;",
	},
	{
		"textarea with tag",
		"<textarea><div></textarea>",
		"<textarea>$&lt;div&gt;$</textarea>",
	},
	{
		"title with tag and entity",
		"<title><b>K&amp;R C</b></title>",
		"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
	},
	// DOCTYPE tests.
	{
		"Proper DOCTYPE",
		"<!DOCTYPE html>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with no space",
		"<!doctypehtml>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with two spaces",
		"<!doctype  html>",
		"<!DOCTYPE html>",
	},
	{
		"looks like DOCTYPE but isn't",
		"<!DOCUMENT html>",
		"<!--DOCUMENT html-->",
	},
	{
		"DOCTYPE at EOF",
		"<!DOCtype",
		"<!DOCTYPE >",
	},
	// XML processing instructions.
	{
		"XML processing instruction",
		"<?xml?>",
		"<!--?xml?-->",
	},
	// Comments.
	{
		"comment0",
		"abc<b><!-- skipme --></b>def",
		"abc$<b>$<!-- skipme -->$</b>$def",
	},
	{
		"comment1",
		"a<!-->z",
		"a$<!---->$z",
	},
	{
		"comment2",
		"a<!--->z",
		"a$<!---->$z",
	},
	{
		"comment3",
		"a<!--x>-->z",
		"a$<!--x>-->$z",
	},
	{
		"comment4",
		"a<!--x->-->z",
		"a$<!--x->-->$z",
	},
	{
		"comment5",
		"a<!>z",
		"a$<!---->$z",
	},
	{
		"comment6",
		"a<!->z",
		"a$<!----->$z",
	},
	{
		"comment7",
		"a<!---<>z",
		"a$<!---<>z-->",
	},
	{
		"comment8",
		"a<!--z",
		"a$<!--z-->",
	},
	{
		"comment9",
		"a<!--z-",
		"a$<!--z-->",
	},
	{
		"comment10",
		"a<!--z--",
		"a$<!--z-->",
	},
	{
		"comment11",
		"a<!--z---",
		"a$<!--z--->",
	},
	{
		"comment12",
		"a<!--z----",
		"a$<!--z---->",
	},
	{
		"comment13",
		"a<!--x--!>z",
		"a$<!--x-->$z",
	},
	// An attribute with a backslash.
	{
		"backslash",
		`<p id="a\"b">`,
		`<p id="a\" b"="">`,
	},
	// Entities, tag name and attribute key lower-casing, and whitespace
	// normalization within a tag.
	{
		"tricky",
		"<p \t\n iD=\"a&quot;B\"  foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
		`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
	},
	// A nonexistent entity. Tokenizing and converting back to a string should
	// escape the "&" to become "&amp;".
	{
		"noSuchEntity",
		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
	},
	{
		"entity without semicolon",
		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
		`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
	},
	{
		"entity with digits",
		"&frac12;",
		"½",
	},
	// Attribute tests:
	// http://dev.w3.org/html5/pf-summary/Overview.html#attributes
	{
		"Empty attribute",
		`<input disabled FOO>`,
		`<input disabled="" foo="">`,
	},
	{
		"Empty attribute, whitespace",
		`<input disabled FOO >`,
		`<input disabled="" foo="">`,
	},
	{
		"Unquoted attribute value",
		`<input value=yes FOO=BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, spaces",
		`<input value = yes FOO = BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, trailing space",
		`<input value=yes FOO=BAR >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value",
		`<input value='yes' FOO='BAR'>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value, trailing space",
		`<input value='yes' FOO='BAR' >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Double-quoted attribute value",
		`<input value="I'm an attribute" FOO="BAR">`,
		`<input value="I&#39;m an attribute" foo="BAR">`,
	},
	{
		"Attribute name characters",
		`<meta http-equiv="content-type">`,
		`<meta http-equiv="content-type">`,
	},
	{
		"Mixed attributes",
		`a<P V="0 1" w='2' X=3 y>z`,
		`a$<p v="0 1" w="2" x="3" y="">$z`,
	},
	{
		"Attributes with a solitary single quote",
		`<p id=can't><p id=won't>`,
		`<p id="can&#39;t">$<p id="won&#39;t">`,
	},
}
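
// dumpTokens is a small illustrative sketch, not used by the tests below:
// it renders an input in the '$'-joined golden form used above, with only
// the Tokenizer calls these tests already exercise (Next, Token).
func dumpTokens(html string) string {
	z := NewTokenizer(strings.NewReader(html))
	var parts []string
	for z.Next() != ErrorToken {
		parts = append(parts, z.Token().String())
	}
	return strings.Join(parts, "$")
}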

func TestTokenizer(t *testing.T) {
loop:
	for _, tt := range tokenTests {
		z := NewTokenizer(strings.NewReader(tt.html))
		if tt.golden != "" {
			for i, s := range strings.Split(tt.golden, "$") {
				if z.Next() == ErrorToken {
					t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
					continue loop
				}
				actual := z.Token().String()
				if s != actual {
					t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
					continue loop
				}
			}
		}
		z.Next()
		if z.Err() != io.EOF {
			t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
		}
	}
}

func TestMaxBuffer(t *testing.T) {
	// Exceeding the maximum buffer size generates ErrBufferExceeded.
	z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
	z.SetMaxBuf(5)
	tt := z.Next()
	if got, want := tt, ErrorToken; got != want {
		t.Fatalf("token type: got: %v want: %v", got, want)
	}
	if got, want := z.Err(), ErrBufferExceeded; got != want {
		t.Errorf("error type: got: %v want: %v", got, want)
	}
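	// Exactly maxBuf (5) bytes should have been buffered: the '<' plus
	// four of the ten 't's.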
	if got, want := string(z.Raw()), "<tttt"; got != want {
		t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
	}
}

func TestMaxBufferReconstruction(t *testing.T) {
	// Exceeding the maximum buffer size at any point while tokenizing permits
	// reconstructing the original input.
tests:
	for _, test := range tokenTests {
		for maxBuf := 1; ; maxBuf++ {
			r := strings.NewReader(test.html)
			z := NewTokenizer(r)
			z.SetMaxBuf(maxBuf)
			var tokenized bytes.Buffer
			for {
				tt := z.Next()
				tokenized.Write(z.Raw())
				if tt == ErrorToken {
					if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
						t.Errorf("%s: unexpected error: %v", test.desc, err)
					}
					break
				}
			}
			// Reassemble the input: whatever was tokenized, whatever is still
			// buffered, and whatever is left unread in the reader.
			assembled, err := io.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
			if err != nil {
				t.Errorf("%s: ReadAll: %v", test.desc, err)
				continue tests
			}
			if got, want := string(assembled), test.html; got != want {
				t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
				continue tests
			}
			// io.EOF indicates that tokenization completed without exceeding
			// maxBuf, so move on to the next test case.
			if z.Err() == io.EOF {
				break
			}
		} // buffer sizes
	} // tests
}

func TestPassthrough(t *testing.T) {
	// Accumulating the raw output for each parse event should reconstruct the
	// original input.
	for _, test := range tokenTests {
		z := NewTokenizer(strings.NewReader(test.html))
		var parsed bytes.Buffer
		for {
			tt := z.Next()
			parsed.Write(z.Raw())
			if tt == ErrorToken {
				break
			}
		}
		if got, want := parsed.String(), test.html; got != want {
			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
		}
	}
}

func TestBufAPI(t *testing.T) {
	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
	z := NewTokenizer(bytes.NewBufferString(s))
	var result bytes.Buffer
	depth := 0
loop:
	for {
		tt := z.Next()
		switch tt {
		case ErrorToken:
			if z.Err() != io.EOF {
				t.Error(z.Err())
			}
			break loop
		case TextToken:
			if depth > 0 {
				result.Write(z.Text())
			}
		case StartTagToken, EndTagToken:
			tn, _ := z.TagName()
			if len(tn) == 1 && tn[0] == 'a' {
				if tt == StartTagToken {
					depth++
				} else {
					depth--
				}
			}
		}
	}
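	// Only text inside at least one open <a> element is accumulated: "1",
	// then "4", "5", "6" and "7" from the nested <a> elements. Note that
	// "<a/>" is a SelfClosingTagToken, not a StartTagToken, so it does not
	// bump depth and "9" is excluded.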
	u := "14567"
	v := result.String()
	if u != v {
		t.Errorf("TestBufAPI: want %q got %q", u, v)
	}
}

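// TestConvertNewlines checks that convertNewlines maps every lone '\r' and
// every "\r\n" pair to a single '\n', leaving all other bytes untouched.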
func TestConvertNewlines(t *testing.T) {
	testCases := map[string]string{
		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
		"":                      "",
		"\n":                    "\n",
		"\n\r":                  "\n\n",
		"\r":                    "\n",
		"\r\n":                  "\n",
		"\r\n\n":                "\n\n",
		"\r\n\r":                "\n\n",
		"\r\n\r\n":              "\n\n",
		"\r\r":                  "\n\n",
		"\r\r\n":                "\n\n",
		"\r\r\n\n":              "\n\n\n",
		"\r\r\r\n":              "\n\n\n",
		"\r \n":                 "\n \n",
		"xyz":                   "xyz",
	}
	for in, want := range testCases {
		if got := string(convertNewlines([]byte(in))); got != want {
			t.Errorf("input %q: got %q, want %q", in, got, want)
		}
	}
}

func TestReaderEdgeCases(t *testing.T) {
	const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
	testCases := []io.Reader{
		&zeroOneByteReader{s: s},
		&eofStringsReader{s: s},
		&stuckReader{},
	}
	for i, tc := range testCases {
		got := []TokenType{}
		z := NewTokenizer(tc)
		for {
			tt := z.Next()
			if tt == ErrorToken {
				break
			}
			got = append(got, tt)
		}
		if err := z.Err(); err != nil && err != io.EOF {
			if err != io.ErrNoProgress {
				t.Errorf("i=%d: %v", i, err)
			}
			continue
		}
		want := []TokenType{
			StartTagToken,
			TextToken,
			EndTagToken,
		}
		if !reflect.DeepEqual(got, want) {
			t.Errorf("i=%d: got %v, want %v", i, got, want)
			continue
		}
	}
}

// zeroOneByteReader is like a strings.Reader that alternates between
// returning 0 bytes and 1 byte at a time.
type zeroOneByteReader struct {
	s string
	n int
}

func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if len(r.s) == 0 {
		return 0, io.EOF
	}
	r.n++
	if r.n%2 != 0 {
		return 0, nil
	}
	p[0], r.s = r.s[0], r.s[1:]
	return 1, nil
}

// eofStringsReader is like a strings.Reader but can return an (n, err) where
// n > 0 && err != nil.
type eofStringsReader struct {
	s string
}

func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if r.s != "" {
		return n, nil
	}
	return n, io.EOF
}

// stuckReader is an io.Reader that always returns no data and no error.
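// The tokenizer is expected to give up on such a reader with
// io.ErrNoProgress rather than loop forever (see the error check in
// TestReaderEdgeCases above).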
type stuckReader struct{}

func (*stuckReader) Read(p []byte) (int, error) {
	return 0, nil
}

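// The three benchmark levels trade convenience for cost: rawLevel only
// touches z.Raw, lowLevel uses the reusable []byte accessors, and highLevel
// materializes full Token values. See the comments in benchmarkTokenizer.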
const (
	rawLevel = iota
	lowLevel
	highLevel
)

func benchmarkTokenizer(b *testing.B, level int) {
	buf, err := os.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(buf)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		z := NewTokenizer(bytes.NewBuffer(buf))
		for {
			tt := z.Next()
			if tt == ErrorToken {
				if err := z.Err(); err != nil && err != io.EOF {
					b.Fatalf("tokenizer error: %v", err)
				}
				break
			}
			switch level {
			case rawLevel:
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute keys.
				z.Raw()
			case lowLevel:
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				case StartTagToken, SelfClosingTagToken:
					_, more := z.TagName()
					for more {
						_, _, more = z.TagAttr()
					}
				case EndTagToken:
					z.TagName()
				}
			case highLevel:
				// Calling z.Token converts []byte values to strings whose
				// validity extends beyond the next call to z.Next.
				z.Token()
			}
		}
	}
}

func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }