github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/unicode/norm/normalize_test.go

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package norm
     6  
     7  import (
     8  	"bytes"
     9  	"flag"
    10  	"fmt"
    11  	"io"
    12  	"log"
    13  	"strings"
    14  	"testing"
    15  	"unicode/utf8"
    16  )
    17  
    18  var (
    19  	testn = flag.Int("testn", -1, "specific test number to run or -1 for all")
    20  )
    21  
     22  // pc replaces any run of a rune r repeated n times, for n > 1, with r{n}.
    23  func pc(s string) []byte {
    24  	b := bytes.NewBuffer(make([]byte, 0, len(s)))
    25  	for i := 0; i < len(s); {
    26  		r, sz := utf8.DecodeRuneInString(s[i:])
    27  		n := 0
    28  		if sz == 1 {
     29  			// Special-case the one-byte path to handle repetition of invalid UTF-8.
    30  			for c := s[i]; i+n < len(s) && s[i+n] == c; n++ {
    31  			}
    32  		} else {
    33  			for _, r2 := range s[i:] {
    34  				if r2 != r {
    35  					break
    36  				}
    37  				n++
    38  			}
    39  		}
    40  		b.WriteString(s[i : i+sz])
    41  		if n > 1 {
    42  			fmt.Fprintf(b, "{%d}", n)
    43  		}
    44  		i += sz * n
    45  	}
    46  	return b.Bytes()
    47  }
    48  
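         // The test below is an illustrative sketch added for exposition; it is not
         // part of the original file. It spells out the compressed form that pc
         // produces for runs of identical runes.
         func TestPCSketch(t *testing.T) {
         	if got := string(pc("aaab")); got != "a{3}b" {
         		t.Errorf(`pc("aaab") = %q; want "a{3}b"`, got)
         	}
         	if got := string(pc("xxyyy")); got != "x{2}y{3}" {
         		t.Errorf(`pc("xxyyy") = %q; want "x{2}y{3}"`, got)
         	}
         }
         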
     49  // pidx finds the index at which two strings start to differ, backed up a few bytes to a rune boundary for context.
     50  // It returns that index and an ellipsis, or 0 and "" if the strings differ within the first 8 bytes.
    51  func pidx(a, b string) (i int, prefix string) {
    52  	for ; i < len(a) && i < len(b) && a[i] == b[i]; i++ {
    53  	}
    54  	if i < 8 {
    55  		return 0, ""
    56  	}
    57  	i -= 3 // ensure taking at least one full rune before the difference.
    58  	for k := i - 7; i > k && !utf8.RuneStart(a[i]); i-- {
    59  	}
    60  	return i, "..."
    61  }
    62  
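         // Likewise an illustrative sketch, not part of the original file: pidx
         // reports 0 and no ellipsis when the difference is near the start, and
         // otherwise backs the index up a few bytes so the caller can show context.
         func TestPidxSketch(t *testing.T) {
         	if i, pfx := pidx("abc", "abd"); i != 0 || pfx != "" {
         		t.Errorf(`pidx("abc", "abd") = (%d, %q); want (0, "")`, i, pfx)
         	}
         	if i, pfx := pidx("0123456789abc", "0123456789xyz"); i != 7 || pfx != "..." {
         		t.Errorf(`pidx on long inputs = (%d, %q); want (7, "...")`, i, pfx)
         	}
         }
         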
    63  type PositionTest struct {
    64  	input  string
    65  	pos    int
    66  	buffer string // expected contents of reorderBuffer, if applicable
    67  }
    68  
    69  type positionFunc func(rb *reorderBuffer, s string) (int, []byte)
    70  
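         // runPosTests runs fn on each test case and checks both the returned
         // position and the contents of the resulting buffer.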
    71  func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
    72  	rb := reorderBuffer{}
    73  	rb.init(f, nil)
    74  	for i, test := range tests {
    75  		rb.reset()
    76  		rb.src = inputString(test.input)
    77  		rb.nsrc = len(test.input)
    78  		pos, out := fn(&rb, test.input)
    79  		if pos != test.pos {
    80  			t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
    81  		}
    82  		if outs := string(out); outs != test.buffer {
    83  			k, pfx := pidx(outs, test.buffer)
    84  			t.Errorf("%s:%d: buffer \nwas  %s%+q; \nwant %s%+q", name, i, pfx, pc(outs[k:]), pfx, pc(test.buffer[k:]))
    85  		}
    86  	}
    87  }
    88  
    89  func grave(n int) string {
    90  	return rep(0x0300, n)
    91  }
    92  
    93  func rep(r rune, n int) string {
    94  	return strings.Repeat(string(r), n)
    95  }
    96  
    97  const segSize = maxByteBufferSize
    98  
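         // cgj is the combining grapheme joiner (U+034F), which the normalizer
         // inserts to break up runs of non-starters that would otherwise overflow
         // a segment.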
    99  var cgj = GraphemeJoiner
   100  
   101  var decomposeSegmentTests = []PositionTest{
   102  	// illegal runes
   103  	{"\xC2", 0, ""},
   104  	{"\xC0", 1, "\xC0"},
   105  	{"\u00E0\x80", 2, "\u0061\u0300"},
   106  	// starter
   107  	{"a", 1, "a"},
   108  	{"ab", 1, "a"},
   109  	// starter + composing
   110  	{"a\u0300", 3, "a\u0300"},
   111  	{"a\u0300b", 3, "a\u0300"},
   112  	// with decomposition
   113  	{"\u00C0", 2, "A\u0300"},
   114  	{"\u00C0b", 2, "A\u0300"},
   115  	// long
   116  	{grave(31), 60, grave(30) + cgj},
   117  	{"a" + grave(31), 61, "a" + grave(30) + cgj},
   118  
   119  	// Stability tests: see http://www.unicode.org/review/pr-29.html.
   120  	// U+0300 COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;
   121  	// U+0B47 ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
   122  	// U+0B3E ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
   123  	// U+1100 HANGUL CHOSEONG KIYEOK;Lo;0;L;;;;;N;;;;;
   124  	// U+1161 HANGUL JUNGSEONG A;Lo;0;L;;;;;N;;;;;
   125  	{"\u0B47\u0300\u0B3E", 8, "\u0B47\u0300\u0B3E"},
   126  	{"\u1100\u0300\u1161", 8, "\u1100\u0300\u1161"},
   127  	{"\u0B47\u0B3E", 6, "\u0B47\u0B3E"},
   128  	{"\u1100\u1161", 6, "\u1100\u1161"},
   129  
    130  	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
   131  	// Sequence of decomposing characters that are starters and modifiers.
   132  	{"\u0d4a" + strings.Repeat("\u0d3e", 31), 90, "\u0d46" + strings.Repeat("\u0d3e", 30) + cgj},
   133  
   134  	{grave(30), 60, grave(30)},
   135  	// U+FF9E is a starter, but decomposes to U+3099, which is not.
   136  	{grave(30) + "\uff9e", 60, grave(30) + cgj},
   137  	// ends with incomplete UTF-8 encoding
   138  	{"\xCC", 0, ""},
   139  	{"\u0300\xCC", 2, "\u0300"},
   140  }
   141  
   142  func decomposeSegmentF(rb *reorderBuffer, s string) (int, []byte) {
   143  	rb.initString(NFD, s)
   144  	rb.setFlusher(nil, appendFlush)
   145  	p := decomposeSegment(rb, 0, true)
   146  	return p, rb.out
   147  }
   148  
   149  func TestDecomposeSegment(t *testing.T) {
   150  	runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
   151  }
   152  
   153  var firstBoundaryTests = []PositionTest{
   154  	// no boundary
   155  	{"", -1, ""},
   156  	{"\u0300", -1, ""},
   157  	{"\x80\x80", -1, ""},
   158  	// illegal runes
   159  	{"\xff", 0, ""},
   160  	{"\u0300\xff", 2, ""},
   161  	{"\u0300\xc0\x80\x80", 2, ""},
   162  	// boundaries
   163  	{"a", 0, ""},
   164  	{"\u0300a", 2, ""},
   165  	// Hangul
   166  	{"\u1103\u1161", 0, ""},
   167  	{"\u110B\u1173\u11B7", 0, ""},
   168  	{"\u1161\u110B\u1173\u11B7", 3, ""},
   169  	{"\u1173\u11B7\u1103\u1161", 6, ""},
   170  	// too many combining characters.
   171  	{grave(maxNonStarters - 1), -1, ""},
   172  	{grave(maxNonStarters), 60, ""},
   173  	{grave(maxNonStarters + 1), 60, ""},
   174  }
   175  
   176  func firstBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
   177  	return rb.f.form.FirstBoundary([]byte(s)), nil
   178  }
   179  
   180  func firstBoundaryStringF(rb *reorderBuffer, s string) (int, []byte) {
   181  	return rb.f.form.FirstBoundaryInString(s), nil
   182  }
   183  
   184  func TestFirstBoundary(t *testing.T) {
   185  	runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
   186  	runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
   187  }
   188  
   189  var decomposeToLastTests = []PositionTest{
   190  	// ends with inert character
   191  	{"Hello!", 6, ""},
   192  	{"\u0632", 2, ""},
   193  	{"a\u0301\u0635", 5, ""},
   194  	// ends with non-inert starter
   195  	{"a", 0, "a"},
   196  	{"a\u0301a", 3, "a"},
   197  	{"a\u0301\u03B9", 3, "\u03B9"},
   198  	{"a\u0327", 0, "a\u0327"},
   199  	// illegal runes
   200  	{"\xFF", 1, ""},
   201  	{"aa\xFF", 3, ""},
   202  	{"\xC0\x80\x80", 3, ""},
   203  	{"\xCC\x80\x80", 3, ""},
   204  	// ends with incomplete UTF-8 encoding
   205  	{"a\xCC", 2, ""},
   206  	// ends with combining characters
   207  	{"\u0300\u0301", 0, "\u0300\u0301"},
   208  	{"a\u0300\u0301", 0, "a\u0300\u0301"},
   209  	{"a\u0301\u0308", 0, "a\u0301\u0308"},
   210  	{"a\u0308\u0301", 0, "a\u0308\u0301"},
   211  	{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
   212  	{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
   213  	{"\u00C0", 0, "A\u0300"},
   214  	{"a\u00C0", 1, "A\u0300"},
   215  	// decomposing
   216  	{"a\u0300\u00E0", 3, "a\u0300"},
   217  	// multisegment decompositions (flushes leading segments)
   218  	{"a\u0300\uFDC0", 7, "\u064A"},
   219  	{"\uFDC0" + grave(29), 4, "\u064A" + grave(29)},
   220  	{"\uFDC0" + grave(30), 4, "\u064A" + grave(30)},
   221  	{"\uFDC0" + grave(31), 5, grave(30)},
   222  	{"\uFDFA" + grave(14), 31, "\u0645" + grave(14)},
   223  	// Overflow
   224  	{"\u00E0" + grave(29), 0, "a" + grave(30)},
   225  	{"\u00E0" + grave(30), 2, grave(30)},
   226  	// Hangul
   227  	{"a\u1103", 1, "\u1103"},
   228  	{"a\u110B", 1, "\u110B"},
   229  	{"a\u110B\u1173", 1, "\u110B\u1173"},
   230  	// See comment in composition.go:compBoundaryAfter.
   231  	{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
   232  	{"a\uC73C", 1, "\u110B\u1173"},
   233  	{"다음", 3, "\u110B\u1173\u11B7"},
   234  	{"다", 0, "\u1103\u1161"},
   235  	{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
   236  	{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
   237  	{"다음음", 6, "\u110B\u1173\u11B7"},
   238  	{"음다다", 6, "\u1103\u1161"},
   239  	// maximized buffer
   240  	{"a" + grave(30), 0, "a" + grave(30)},
   241  	// Buffer overflow
   242  	{"a" + grave(31), 3, grave(30)},
   243  	// weird UTF-8
   244  	{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
   245  }
   246  
   247  func decomposeToLast(rb *reorderBuffer, s string) (int, []byte) {
   248  	rb.setFlusher([]byte(s), appendFlush)
   249  	decomposeToLastBoundary(rb)
   250  	buf := rb.flush(nil)
   251  	return len(rb.out), buf
   252  }
   253  
   254  func TestDecomposeToLastBoundary(t *testing.T) {
   255  	runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
   256  }
   257  
   258  var lastBoundaryTests = []PositionTest{
   259  	// ends with inert character
   260  	{"Hello!", 6, ""},
   261  	{"\u0632", 2, ""},
   262  	// ends with non-inert starter
   263  	{"a", 0, ""},
   264  	// illegal runes
   265  	{"\xff", 1, ""},
   266  	{"aa\xff", 3, ""},
   267  	{"a\xff\u0300", 1, ""}, // TODO: should probably be 2.
   268  	{"\xc0\x80\x80", 3, ""},
   269  	{"\xc0\x80\x80\u0300", 3, ""},
   270  	// ends with incomplete UTF-8 encoding
   271  	{"\xCC", -1, ""},
   272  	{"\xE0\x80", -1, ""},
   273  	{"\xF0\x80\x80", -1, ""},
   274  	{"a\xCC", 0, ""},
   275  	{"\x80\xCC", 1, ""},
   276  	{"\xCC\xCC", 1, ""},
   277  	// ends with combining characters
   278  	{"a\u0300\u0301", 0, ""},
   279  	{"aaaa\u0300\u0301", 3, ""},
   280  	{"\u0300a\u0300\u0301", 2, ""},
   281  	{"\u00C2", 0, ""},
   282  	{"a\u00C2", 1, ""},
   283  	// decomposition may recombine
   284  	{"\u0226", 0, ""},
   285  	// no boundary
   286  	{"", -1, ""},
   287  	{"\u0300\u0301", -1, ""},
   288  	{"\u0300", -1, ""},
   289  	{"\x80\x80", -1, ""},
   290  	{"\x80\x80\u0301", -1, ""},
   291  	// Hangul
   292  	{"다음", 3, ""},
   293  	{"다", 0, ""},
   294  	{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
   295  	{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
   296  	// too many combining characters.
   297  	{grave(maxNonStarters - 1), -1, ""},
    298  	// May still be preceded by a non-starter.
   299  	{grave(maxNonStarters), -1, ""},
   300  	// May still need to insert a cgj after the last combiner.
   301  	{grave(maxNonStarters + 1), 2, ""},
   302  	{grave(maxNonStarters + 2), 4, ""},
   303  
   304  	{"a" + grave(maxNonStarters-1), 0, ""},
   305  	{"a" + grave(maxNonStarters), 0, ""},
   306  	// May still need to insert a cgj after the last combiner.
   307  	{"a" + grave(maxNonStarters+1), 3, ""},
   308  	{"a" + grave(maxNonStarters+2), 5, ""},
   309  }
   310  
   311  func lastBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
   312  	return rb.f.form.LastBoundary([]byte(s)), nil
   313  }
   314  
   315  func TestLastBoundary(t *testing.T) {
   316  	runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
   317  }
   318  
   319  var quickSpanTests = []PositionTest{
   320  	{"", 0, ""},
   321  	// starters
   322  	{"a", 1, ""},
   323  	{"abc", 3, ""},
   324  	{"\u043Eb", 3, ""},
   325  	// incomplete last rune.
   326  	{"\xCC", 1, ""},
   327  	{"a\xCC", 2, ""},
   328  	// incorrectly ordered combining characters
   329  	{"\u0300\u0316", 0, ""},
   330  	{"\u0300\u0316cd", 0, ""},
    331  	// segments are limited to a maximum number of combining characters.
   332  	{rep(0x035D, 30) + "\u035B", 0, ""},
   333  	{"a" + rep(0x035D, 30) + "\u035B", 0, ""},
   334  	{"Ɵ" + rep(0x035D, 30) + "\u035B", 0, ""},
   335  	{"aa" + rep(0x035D, 30) + "\u035B", 1, ""},
   336  	{rep(0x035D, 30) + cgj + "\u035B", 64, ""},
   337  	{"a" + rep(0x035D, 30) + cgj + "\u035B", 65, ""},
   338  	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", 66, ""},
   339  	{"aa" + rep(0x035D, 30) + cgj + "\u035B", 66, ""},
   340  }
   341  
   342  var quickSpanNFDTests = []PositionTest{
   343  	// needs decomposing
   344  	{"\u00C0", 0, ""},
   345  	{"abc\u00C0", 3, ""},
   346  	// correctly ordered combining characters
   347  	{"\u0300", 2, ""},
   348  	{"ab\u0300", 4, ""},
   349  	{"ab\u0300cd", 6, ""},
   350  	{"\u0300cd", 4, ""},
   351  	{"\u0316\u0300", 4, ""},
   352  	{"ab\u0316\u0300", 6, ""},
   353  	{"ab\u0316\u0300cd", 8, ""},
   354  	{"ab\u0316\u0300\u00C0", 6, ""},
   355  	{"\u0316\u0300cd", 6, ""},
   356  	{"\u043E\u0308b", 5, ""},
   357  	// incorrectly ordered combining characters
   358  	{"ab\u0300\u0316", 1, ""}, // TODO: we could skip 'b' as well.
   359  	{"ab\u0300\u0316cd", 1, ""},
   360  	// Hangul
   361  	{"같은", 0, ""},
   362  }
   363  
   364  var quickSpanNFCTests = []PositionTest{
   365  	// okay composed
   366  	{"\u00C0", 2, ""},
   367  	{"abc\u00C0", 5, ""},
   368  	// correctly ordered combining characters
   369  	{"ab\u0300", 1, ""},
   370  	{"ab\u0300cd", 1, ""},
   371  	{"ab\u0316\u0300", 1, ""},
   372  	{"ab\u0316\u0300cd", 1, ""},
   373  	{"\u00C0\u035D", 4, ""},
   374  	// we do not special case leading combining characters
   375  	{"\u0300cd", 0, ""},
   376  	{"\u0300", 0, ""},
   377  	{"\u0316\u0300", 0, ""},
   378  	{"\u0316\u0300cd", 0, ""},
   379  	// incorrectly ordered combining characters
   380  	{"ab\u0300\u0316", 1, ""},
   381  	{"ab\u0300\u0316cd", 1, ""},
   382  	// Hangul
   383  	{"같은", 6, ""},
   384  	// We return the start of the violating segment in case of overflow.
   385  	{grave(30) + "\uff9e", 0, ""},
   386  	{grave(30), 0, ""},
   387  }
   388  
   389  func doQuickSpan(rb *reorderBuffer, s string) (int, []byte) {
   390  	return rb.f.form.QuickSpan([]byte(s)), nil
   391  }
   392  
   393  func doQuickSpanString(rb *reorderBuffer, s string) (int, []byte) {
   394  	return rb.f.form.QuickSpanString(s), nil
   395  }
   396  
   397  func TestQuickSpan(t *testing.T) {
   398  	runPosTests(t, "TestQuickSpanNFD1", NFD, doQuickSpan, quickSpanTests)
   399  	runPosTests(t, "TestQuickSpanNFD2", NFD, doQuickSpan, quickSpanNFDTests)
   400  	runPosTests(t, "TestQuickSpanNFC1", NFC, doQuickSpan, quickSpanTests)
   401  	runPosTests(t, "TestQuickSpanNFC2", NFC, doQuickSpan, quickSpanNFCTests)
   402  
   403  	runPosTests(t, "TestQuickSpanStringNFD1", NFD, doQuickSpanString, quickSpanTests)
   404  	runPosTests(t, "TestQuickSpanStringNFD2", NFD, doQuickSpanString, quickSpanNFDTests)
   405  	runPosTests(t, "TestQuickSpanStringNFC1", NFC, doQuickSpanString, quickSpanTests)
   406  	runPosTests(t, "TestQuickSpanStringNFC2", NFC, doQuickSpanString, quickSpanNFCTests)
   407  }
   408  
   409  var isNormalTests = []PositionTest{
   410  	{"", 1, ""},
   411  	// illegal runes
   412  	{"\xff", 1, ""},
   413  	// starters
   414  	{"a", 1, ""},
   415  	{"abc", 1, ""},
   416  	{"\u043Eb", 1, ""},
   417  	// incorrectly ordered combining characters
   418  	{"\u0300\u0316", 0, ""},
   419  	{"ab\u0300\u0316", 0, ""},
   420  	{"ab\u0300\u0316cd", 0, ""},
   421  	{"\u0300\u0316cd", 0, ""},
   422  }
   423  var isNormalNFDTests = []PositionTest{
   424  	// needs decomposing
   425  	{"\u00C0", 0, ""},
   426  	{"abc\u00C0", 0, ""},
   427  	// correctly ordered combining characters
   428  	{"\u0300", 1, ""},
   429  	{"ab\u0300", 1, ""},
   430  	{"ab\u0300cd", 1, ""},
   431  	{"\u0300cd", 1, ""},
   432  	{"\u0316\u0300", 1, ""},
   433  	{"ab\u0316\u0300", 1, ""},
   434  	{"ab\u0316\u0300cd", 1, ""},
   435  	{"\u0316\u0300cd", 1, ""},
   436  	{"\u043E\u0308b", 1, ""},
   437  	// Hangul
   438  	{"같은", 0, ""},
   439  }
   440  var isNormalNFCTests = []PositionTest{
   441  	// okay composed
   442  	{"\u00C0", 1, ""},
   443  	{"abc\u00C0", 1, ""},
   444  	// need reordering
   445  	{"a\u0300", 0, ""},
   446  	{"a\u0300cd", 0, ""},
   447  	{"a\u0316\u0300", 0, ""},
   448  	{"a\u0316\u0300cd", 0, ""},
   449  	// correctly ordered combining characters
   450  	{"ab\u0300", 1, ""},
   451  	{"ab\u0300cd", 1, ""},
   452  	{"ab\u0316\u0300", 1, ""},
   453  	{"ab\u0316\u0300cd", 1, ""},
   454  	{"\u00C0\u035D", 1, ""},
   455  	{"\u0300", 1, ""},
   456  	{"\u0316\u0300cd", 1, ""},
   457  	// Hangul
   458  	{"같은", 1, ""},
   459  }
   460  
   461  var isNormalNFKXTests = []PositionTest{
   462  	// Special case.
   463  	{"\u00BC", 0, ""},
   464  }
   465  
   466  func isNormalF(rb *reorderBuffer, s string) (int, []byte) {
   467  	if rb.f.form.IsNormal([]byte(s)) {
   468  		return 1, nil
   469  	}
   470  	return 0, nil
   471  }
   472  
   473  func isNormalStringF(rb *reorderBuffer, s string) (int, []byte) {
   474  	if rb.f.form.IsNormalString(s) {
   475  		return 1, nil
   476  	}
   477  	return 0, nil
   478  }
   479  
   480  func TestIsNormal(t *testing.T) {
   481  	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
   482  	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
   483  	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
   484  	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
   485  	runPosTests(t, "TestIsNormalNFKD1", NFKD, isNormalF, isNormalTests)
   486  	runPosTests(t, "TestIsNormalNFKD2", NFKD, isNormalF, isNormalNFDTests)
   487  	runPosTests(t, "TestIsNormalNFKD3", NFKD, isNormalF, isNormalNFKXTests)
   488  	runPosTests(t, "TestIsNormalNFKC1", NFKC, isNormalF, isNormalTests)
   489  	runPosTests(t, "TestIsNormalNFKC2", NFKC, isNormalF, isNormalNFCTests)
   490  	runPosTests(t, "TestIsNormalNFKC3", NFKC, isNormalF, isNormalNFKXTests)
   491  }
   492  
   493  func TestIsNormalString(t *testing.T) {
   494  	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalStringF, isNormalTests)
   495  	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalStringF, isNormalNFDTests)
   496  	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalStringF, isNormalTests)
   497  	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalStringF, isNormalNFCTests)
   498  }
   499  
   500  type AppendTest struct {
   501  	left  string
   502  	right string
   503  	out   string
   504  }
   505  
   506  type appendFunc func(f Form, out []byte, s string) []byte
   507  
   508  var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}
   509  
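         // runNormTests runs the given append-style function against the test
         // tables for each of the four normalization forms.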
   510  func runNormTests(t *testing.T, name string, fn appendFunc) {
   511  	for f := NFC; f <= NFKD; f++ {
   512  		runAppendTests(t, name, f, fn, normTests[f])
   513  	}
   514  }
   515  
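         // runAppendTests checks fn against the expected output for form f, and then
         // cross-checks every other form against its own normalization of the
         // concatenated input.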
   516  func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
   517  	for i, test := range tests {
   518  		if *testn >= 0 && i != *testn {
   519  			continue
   520  		}
   521  		out := []byte(test.left)
   522  		have := string(fn(f, out, test.right))
   523  		if len(have) != len(test.out) {
   524  			t.Errorf("%s.%s:%d: length is %d; want %d (%+q vs %+q)", fstr[f], name, i, len(have), len(test.out), pc(have), pc(test.out))
   525  		}
   526  		if have != test.out {
   527  			k, pf := pidx(have, test.out)
   528  			t.Errorf("%s.%s:%d: \nwas  %s%+q; \nwant %s%+q", fstr[f], name, i, pf, pc(have[k:]), pf, pc(test.out[k:]))
   529  		}
   530  
   531  		// Bootstrap by normalizing input. Ensures that the various variants
   532  		// behave the same.
   533  		for g := NFC; g <= NFKD; g++ {
   534  			if f == g {
   535  				continue
   536  			}
   537  			want := g.String(test.left + test.right)
   538  			have := string(fn(g, g.AppendString(nil, test.left), test.right))
   539  			if len(have) != len(want) {
   540  				t.Errorf("%s(%s.%s):%d: length is %d; want %d (%+q vs %+q)", fstr[g], fstr[f], name, i, len(have), len(want), pc(have), pc(want))
   541  			}
   542  			if have != want {
   543  				k, pf := pidx(have, want)
   544  				t.Errorf("%s(%s.%s):%d: \nwas  %s%+q; \nwant %s%+q", fstr[g], fstr[f], name, i, pf, pc(have[k:]), pf, pc(want[k:]))
   545  			}
   546  		}
   547  	}
   548  }
   549  
   550  var normTests = [][]AppendTest{
   551  	appendTestsNFC,
   552  	appendTestsNFD,
   553  	appendTestsNFKC,
   554  	appendTestsNFKD,
   555  }
   556  
   557  var appendTestsNFC = []AppendTest{
   558  	{"", ascii, ascii},
   559  	{"", txt_all, txt_all},
   560  	{"\uff9e", grave(30), "\uff9e" + grave(29) + cgj + grave(1)},
   561  	{grave(30), "\uff9e", grave(30) + cgj + "\uff9e"},
   562  
   563  	// Tests designed for Iter.
   564  	{ // ordering of non-composing combining characters
   565  		"",
   566  		"\u0305\u0316",
   567  		"\u0316\u0305",
   568  	},
   569  	{ // segment overflow
   570  		"",
   571  		"a" + rep(0x0305, maxNonStarters+4) + "\u0316",
   572  		"a" + rep(0x0305, maxNonStarters) + cgj + "\u0316" + rep(0x305, 4),
   573  	},
   574  
   575  	{ // Combine across non-blocking non-starters.
   576  		// U+0327 COMBINING CEDILLA;Mn;202;NSM;;;;;N;NON-SPACING CEDILLA;;;;
   577  		// U+0325 COMBINING RING BELOW;Mn;220;NSM;;;;;N;NON-SPACING RING BELOW;;;;
   578  		"", "a\u0327\u0325", "\u1e01\u0327",
   579  	},
   580  
   581  	{ // Jamo V+T does not combine.
   582  		"",
   583  		"\u1161\u11a8",
   584  		"\u1161\u11a8",
   585  	},
   586  
   587  	// Stability tests: see http://www.unicode.org/review/pr-29.html.
   588  	{"", "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"},
   589  	{"", "\u1100\u0300\u1161", "\u1100\u0300\u1161"},
   590  	{"", "\u0b47\u0b3e", "\u0b4b"},
   591  	{"", "\u1100\u1161", "\uac00"},
   592  
    593  	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
   594  	{ // 0d4a starts a new segment.
   595  		"",
   596  		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
   597  		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
   598  	},
   599  
   600  	{ // Split combining characters.
   601  		// TODO: don't insert CGJ before starters.
   602  		"",
   603  		"\u0d46" + strings.Repeat("\u0d3e", 31),
   604  		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
   605  	},
   606  
   607  	{ // Split combining characters.
   608  		"",
   609  		"\u0d4a" + strings.Repeat("\u0d3e", 30),
   610  		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
   611  	},
   612  }
   613  
   614  var appendTestsNFD = []AppendTest{
   615  // TODO: Move some of the tests here.
   616  }
   617  
   618  var appendTestsNFKC = []AppendTest{
   619  	// empty buffers
   620  	{"", "", ""},
   621  	{"a", "", "a"},
   622  	{"", "a", "a"},
   623  	{"", "\u0041\u0307\u0304", "\u01E0"},
   624  	// segment split across buffers
   625  	{"", "a\u0300b", "\u00E0b"},
   626  	{"a", "\u0300b", "\u00E0b"},
   627  	{"a", "\u0300\u0316", "\u00E0\u0316"},
   628  	{"a", "\u0316\u0300", "\u00E0\u0316"},
   629  	{"a", "\u0300a\u0300", "\u00E0\u00E0"},
   630  	{"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"},
   631  	{"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"},
   632  	{"a\u0300", "\u0327", "\u00E0\u0327"},
   633  	{"a\u0327", "\u0300", "\u00E0\u0327"},
   634  	{"a\u0316", "\u0300", "\u00E0\u0316"},
   635  	{"\u0041\u0307", "\u0304", "\u01E0"},
   636  	// Hangul
   637  	{"", "\u110B\u1173", "\uC73C"},
   638  	{"", "\u1103\u1161", "\uB2E4"},
   639  	{"", "\u110B\u1173\u11B7", "\uC74C"},
   640  	{"", "\u320E", "\x28\uAC00\x29"},
   641  	{"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"},
   642  	{"\u1103", "\u1161", "\uB2E4"},
   643  	{"\u110B", "\u1173\u11B7", "\uC74C"},
   644  	{"\u110B\u1173", "\u11B7", "\uC74C"},
   645  	{"\uC73C", "\u11B7", "\uC74C"},
   646  	// UTF-8 encoding split across buffers
   647  	{"a\xCC", "\x80", "\u00E0"},
   648  	{"a\xCC", "\x80b", "\u00E0b"},
   649  	{"a\xCC", "\x80a\u0300", "\u00E0\u00E0"},
   650  	{"a\xCC", "\x80\x80", "\u00E0\x80"},
   651  	{"a\xCC", "\x80\xCC", "\u00E0\xCC"},
   652  	{"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"},
   653  	// ending in incomplete UTF-8 encoding
   654  	{"", "\xCC", "\xCC"},
   655  	{"a", "\xCC", "a\xCC"},
   656  	{"a", "b\xCC", "ab\xCC"},
   657  	{"\u0226", "\xCC", "\u0226\xCC"},
   658  	// illegal runes
   659  	{"", "\x80", "\x80"},
   660  	{"", "\x80\x80\x80", "\x80\x80\x80"},
   661  	{"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"},
   662  	{"", "a\x80", "a\x80"},
   663  	{"", "a\x80\x80\x80", "a\x80\x80\x80"},
   664  	{"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"},
   665  	{"a", "\x80\x80\x80", "a\x80\x80\x80"},
   666  	// overflow
   667  	{"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)},
   668  	{strings.Repeat("\x80", 33), "", strings.Repeat("\x80", 33)},
   669  	{strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)},
   670  	// overflow of combining characters
   671  	{"", grave(34), grave(30) + cgj + grave(4)},
   672  	{"", grave(36), grave(30) + cgj + grave(6)},
   673  	{grave(29), grave(5), grave(30) + cgj + grave(4)},
   674  	{grave(30), grave(4), grave(30) + cgj + grave(4)},
   675  	{grave(30), grave(3), grave(30) + cgj + grave(3)},
   676  	{grave(30) + "\xCC", "\x80", grave(30) + cgj + grave(1)},
   677  	{"", "\uFDFA" + grave(14), "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645" + grave(14)},
   678  	{"", "\uFDFA" + grave(28) + "\u0316", "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645\u0316" + grave(28)},
   679  	// - First rune has a trailing non-starter.
   680  	{"\u00d5", grave(30), "\u00d5" + grave(29) + cgj + grave(1)},
   681  	// - U+FF9E decomposes into a non-starter in compatibility mode. A CGJ must be
   682  	//   inserted even when FF9E starts a new segment.
   683  	{"\uff9e", grave(30), "\u3099" + grave(29) + cgj + grave(1)},
   684  	{grave(30), "\uff9e", grave(30) + cgj + "\u3099"},
   685  	// - Many non-starter decompositions in a row causing overflow.
   686  	{"", rep(0x340, 31), rep(0x300, 30) + cgj + "\u0300"},
   687  	{"", rep(0xFF9E, 31), rep(0x3099, 30) + cgj + "\u3099"},
   688  	// weird UTF-8
   689  	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
   690  	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
   691  	{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
   692  	{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
   693  	{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
   694  	{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
   695  	{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
   696  	{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
   697  
   698  	{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
   699  	// large input.
   700  	{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
   701  	{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
   702  	{"", "\u0041\u0307\u0304", "\u01E0"},
   703  }
   704  
   705  var appendTestsNFKD = []AppendTest{
   706  	{"", "a" + grave(64), "a" + grave(30) + cgj + grave(30) + cgj + grave(4)},
   707  
   708  	{ // segment overflow on unchanged character
   709  		"",
   710  		"a" + grave(64) + "\u0316",
   711  		"a" + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(4),
   712  	},
   713  	{ // segment overflow on unchanged character + start value
   714  		"",
   715  		"a" + grave(98) + "\u0316",
   716  		"a" + grave(30) + cgj + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(8),
   717  	},
   718  	{ // segment overflow on decomposition. (U+0340 decomposes to U+0300.)
   719  		"",
   720  		"a" + grave(59) + "\u0340",
   721  		"a" + grave(30) + cgj + grave(30),
   722  	},
   723  	{ // segment overflow on non-starter decomposition
   724  		"",
   725  		"a" + grave(33) + "\u0340" + grave(30) + "\u0320",
   726  		"a" + grave(30) + cgj + grave(30) + cgj + "\u0320" + grave(4),
   727  	},
   728  	{ // start value after ASCII overflow
   729  		"",
   730  		rep('a', segSize) + grave(32) + "\u0320",
   731  		rep('a', segSize) + grave(30) + cgj + "\u0320" + grave(2),
   732  	},
   733  	{ // Jamo overflow
   734  		"",
   735  		"\u1100\u1161" + grave(30) + "\u0320" + grave(2),
   736  		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
   737  	},
   738  	{ // Hangul
   739  		"",
   740  		"\uac00",
   741  		"\u1100\u1161",
   742  	},
   743  	{ // Hangul overflow
   744  		"",
   745  		"\uac00" + grave(32) + "\u0320",
   746  		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
   747  	},
   748  	{ // Hangul overflow in Hangul mode.
   749  		"",
   750  		"\uac00\uac00" + grave(32) + "\u0320",
   751  		"\u1100\u1161\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
   752  	},
   753  	{ // Hangul overflow in Hangul mode.
   754  		"",
   755  		strings.Repeat("\uac00", 3) + grave(32) + "\u0320",
   756  		strings.Repeat("\u1100\u1161", 3) + grave(29) + cgj + "\u0320" + grave(3),
   757  	},
   758  	{ // start value after cc=0
   759  		"",
   760  		"您您" + grave(34) + "\u0320",
   761  		"您您" + grave(30) + cgj + "\u0320" + grave(4),
   762  	},
   763  	{ // start value after normalization
   764  		"",
   765  		"\u0300\u0320a" + grave(34) + "\u0320",
   766  		"\u0320\u0300a" + grave(30) + cgj + "\u0320" + grave(4),
   767  	},
   768  }
   769  
   770  func TestAppend(t *testing.T) {
   771  	runNormTests(t, "Append", func(f Form, out []byte, s string) []byte {
   772  		return f.Append(out, []byte(s)...)
   773  	})
   774  }
   775  
   776  func TestAppendString(t *testing.T) {
   777  	runNormTests(t, "AppendString", func(f Form, out []byte, s string) []byte {
   778  		return f.AppendString(out, s)
   779  	})
   780  }
   781  
   782  func TestBytes(t *testing.T) {
   783  	runNormTests(t, "Bytes", func(f Form, out []byte, s string) []byte {
   784  		buf := []byte{}
   785  		buf = append(buf, out...)
   786  		buf = append(buf, s...)
   787  		return f.Bytes(buf)
   788  	})
   789  }
   790  
   791  func TestString(t *testing.T) {
   792  	runNormTests(t, "String", func(f Form, out []byte, s string) []byte {
   793  		outs := string(out) + s
   794  		return []byte(f.String(outs))
   795  	})
   796  }
   797  
   798  func appendBench(f Form, in []byte) func() {
   799  	buf := make([]byte, 0, 4*len(in))
   800  	return func() {
   801  		f.Append(buf, in...)
   802  	}
   803  }
   804  
   805  func bytesBench(f Form, in []byte) func() {
   806  	return func() {
   807  		f.Bytes(in)
   808  	}
   809  }
   810  
   811  func iterBench(f Form, in []byte) func() {
   812  	iter := Iter{}
   813  	return func() {
   814  		iter.Init(f, in)
   815  		for !iter.Done() {
   816  			iter.Next()
   817  		}
   818  	}
   819  }
   820  
   821  func transformBench(f Form, in []byte) func() {
   822  	buf := make([]byte, 4*len(in))
   823  	return func() {
   824  		if _, n, err := f.Transform(buf, in, true); err != nil || len(in) != n {
   825  			log.Panic(n, len(in), err)
   826  		}
   827  	}
   828  }
   829  
   830  func readerBench(f Form, in []byte) func() {
   831  	buf := make([]byte, 4*len(in))
   832  	return func() {
   833  		r := f.Reader(bytes.NewReader(in))
   834  		var err error
   835  		for err == nil {
   836  			_, err = r.Read(buf)
   837  		}
   838  		if err != io.EOF {
   839  			panic("")
   840  		}
   841  	}
   842  }
   843  
   844  func writerBench(f Form, in []byte) func() {
   845  	buf := make([]byte, 0, 4*len(in))
   846  	return func() {
   847  		r := f.Writer(bytes.NewBuffer(buf))
   848  		if _, err := r.Write(in); err != nil {
   849  			panic("")
   850  		}
   851  	}
   852  }
   853  
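         // appendBenchmarks collects one benchmark closure per API under test
         // (Append, Iter, Transform, Reader, and Writer) for form f and input in.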
   854  func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
   855  	bm = append(bm, appendBench(f, in))
   856  	bm = append(bm, iterBench(f, in))
   857  	bm = append(bm, transformBench(f, in))
   858  	bm = append(bm, readerBench(f, in))
   859  	bm = append(bm, writerBench(f, in))
   860  	return bm
   861  }
   862  
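         // doFormBenchmark normalizes s to form inf and then measures converting
         // the result to form f with each of the closures from appendBenchmarks.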
   863  func doFormBenchmark(b *testing.B, inf, f Form, s string) {
   864  	b.StopTimer()
   865  	in := inf.Bytes([]byte(s))
   866  	bm := appendBenchmarks(nil, f, in)
   867  	b.SetBytes(int64(len(in) * len(bm)))
   868  	b.StartTimer()
   869  	for i := 0; i < b.N; i++ {
   870  		for _, fn := range bm {
   871  			fn()
   872  		}
   873  	}
   874  }
   875  
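         // doSingle benchmarks a single closure produced by f, always using NFC,
         // on input s.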
   876  func doSingle(b *testing.B, f func(Form, []byte) func(), s []byte) {
   877  	b.StopTimer()
   878  	fn := f(NFC, s)
   879  	b.SetBytes(int64(len(s)))
   880  	b.StartTimer()
   881  	for i := 0; i < b.N; i++ {
   882  		fn()
   883  	}
   884  }
   885  
   886  var (
   887  	smallNoChange = []byte("nörmalization")
   888  	smallChange   = []byte("No\u0308rmalization")
   889  	ascii         = strings.Repeat("There is nothing to change here! ", 500)
   890  )
   891  
   892  func lowerBench(f Form, in []byte) func() {
   893  	// Use package strings instead of bytes as it doesn't allocate memory
   894  	// if there aren't any changes.
   895  	s := string(in)
   896  	return func() {
   897  		strings.ToLower(s)
   898  	}
   899  }
   900  
   901  func BenchmarkLowerCaseNoChange(b *testing.B) {
   902  	doSingle(b, lowerBench, smallNoChange)
   903  }
   904  func BenchmarkLowerCaseChange(b *testing.B) {
   905  	doSingle(b, lowerBench, smallChange)
   906  }
   907  
   908  func quickSpanBench(f Form, in []byte) func() {
   909  	return func() {
   910  		f.QuickSpan(in)
   911  	}
   912  }
   913  
   914  func BenchmarkQuickSpanChangeNFC(b *testing.B) {
   915  	doSingle(b, quickSpanBench, smallNoChange)
   916  }
   917  
   918  func BenchmarkBytesNoChangeNFC(b *testing.B) {
   919  	doSingle(b, bytesBench, smallNoChange)
   920  }
   921  func BenchmarkBytesChangeNFC(b *testing.B) {
   922  	doSingle(b, bytesBench, smallChange)
   923  }
   924  
   925  func BenchmarkAppendNoChangeNFC(b *testing.B) {
   926  	doSingle(b, appendBench, smallNoChange)
   927  }
   928  func BenchmarkAppendChangeNFC(b *testing.B) {
   929  	doSingle(b, appendBench, smallChange)
   930  }
   931  func BenchmarkAppendLargeNFC(b *testing.B) {
   932  	doSingle(b, appendBench, txt_all_bytes)
   933  }
   934  
   935  func BenchmarkIterNoChangeNFC(b *testing.B) {
   936  	doSingle(b, iterBench, smallNoChange)
   937  }
   938  func BenchmarkIterChangeNFC(b *testing.B) {
   939  	doSingle(b, iterBench, smallChange)
   940  }
   941  func BenchmarkIterLargeNFC(b *testing.B) {
   942  	doSingle(b, iterBench, txt_all_bytes)
   943  }
   944  
   945  func BenchmarkTransformNoChangeNFC(b *testing.B) {
   946  	doSingle(b, transformBench, smallNoChange)
   947  }
   948  func BenchmarkTransformChangeNFC(b *testing.B) {
   949  	doSingle(b, transformBench, smallChange)
   950  }
   951  func BenchmarkTransformLargeNFC(b *testing.B) {
   952  	doSingle(b, transformBench, txt_all_bytes)
   953  }
   954  
   955  func BenchmarkNormalizeAsciiNFC(b *testing.B) {
   956  	doFormBenchmark(b, NFC, NFC, ascii)
   957  }
   958  func BenchmarkNormalizeAsciiNFD(b *testing.B) {
   959  	doFormBenchmark(b, NFC, NFD, ascii)
   960  }
   961  func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
   962  	doFormBenchmark(b, NFC, NFKC, ascii)
   963  }
   964  func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
   965  	doFormBenchmark(b, NFC, NFKD, ascii)
   966  }
   967  
   968  func BenchmarkNormalizeNFC2NFC(b *testing.B) {
   969  	doFormBenchmark(b, NFC, NFC, txt_all)
   970  }
   971  func BenchmarkNormalizeNFC2NFD(b *testing.B) {
   972  	doFormBenchmark(b, NFC, NFD, txt_all)
   973  }
   974  func BenchmarkNormalizeNFD2NFC(b *testing.B) {
   975  	doFormBenchmark(b, NFD, NFC, txt_all)
   976  }
   977  func BenchmarkNormalizeNFD2NFD(b *testing.B) {
   978  	doFormBenchmark(b, NFD, NFD, txt_all)
   979  }
   980  
   981  // Hangul is often special-cased, so we test it separately.
   982  func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
   983  	doFormBenchmark(b, NFC, NFC, txt_kr)
   984  }
   985  func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
   986  	doFormBenchmark(b, NFC, NFD, txt_kr)
   987  }
   988  func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
   989  	doFormBenchmark(b, NFD, NFC, txt_kr)
   990  }
   991  func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
   992  	doFormBenchmark(b, NFD, NFD, txt_kr)
   993  }
   994  
   995  var forms = []Form{NFC, NFD, NFKC, NFKD}
   996  
   997  func doTextBenchmark(b *testing.B, s string) {
   998  	b.StopTimer()
   999  	in := []byte(s)
  1000  	bm := []func(){}
  1001  	for _, f := range forms {
  1002  		bm = appendBenchmarks(bm, f, in)
  1003  	}
  1004  	b.SetBytes(int64(len(s) * len(bm)))
  1005  	b.StartTimer()
  1006  	for i := 0; i < b.N; i++ {
  1007  		for _, f := range bm {
  1008  			f()
  1009  		}
  1010  	}
  1011  }
  1012  
  1013  func BenchmarkCanonicalOrdering(b *testing.B) {
  1014  	doTextBenchmark(b, txt_canon)
  1015  }
  1016  func BenchmarkExtendedLatin(b *testing.B) {
  1017  	doTextBenchmark(b, txt_vn)
  1018  }
  1019  func BenchmarkMiscTwoByteUtf8(b *testing.B) {
  1020  	doTextBenchmark(b, twoByteUtf8)
  1021  }
  1022  func BenchmarkMiscThreeByteUtf8(b *testing.B) {
  1023  	doTextBenchmark(b, threeByteUtf8)
  1024  }
  1025  func BenchmarkHangul(b *testing.B) {
  1026  	doTextBenchmark(b, txt_kr)
  1027  }
  1028  func BenchmarkJapanese(b *testing.B) {
  1029  	doTextBenchmark(b, txt_jp)
  1030  }
  1031  func BenchmarkChinese(b *testing.B) {
  1032  	doTextBenchmark(b, txt_cn)
  1033  }
  1034  func BenchmarkOverflow(b *testing.B) {
  1035  	doTextBenchmark(b, overflow)
  1036  }
  1037  
  1038  var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"
  1039  
  1040  // Tests sampled from the Canonical ordering tests (Part 2) of
  1041  // http://unicode.org/Public/UNIDATA/NormalizationTest.txt
  1042  const txt_canon = `\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062
  1043  \u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062
  1044  \u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062
  1045  \u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062 
  1046  \u0061\u059A\u0316\u302A\u0339       \u0061\u0341\u0315\u0300\u05AE\u0062
  1047  \u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062
  1048  \u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062
  1049  \u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062
  1050  \u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062
  1051  \u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062
  1052  \u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062
  1053  \u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062
  1054  \u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062
  1055  \u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062
  1056  \u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062
  1057  \u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062
  1058  \u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062
  1059  \u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062`
  1060  
  1061  // Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
  1062  const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả. 
  1063  Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ 
  1064  nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc 
  1065  một giấy phép khác có các điều khoản tương tự như giấy phép này
  1066  cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
  1067  trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
  1068  người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
  1069  bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
  1070  chúng theo quy định của pháp luật thì tình trạng của nó không 
  1071  bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`
  1072  
  1073  // Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
  1074  const txt_ru = `При обязательном соблюдении следующих условий:
  1075  Attribution — Вы должны атрибутировать произведение (указывать
  1076  автора и источник) в порядке, предусмотренном автором или
  1077  лицензиаром (но только так, чтобы никоим образом не подразумевалось,
  1078  что они поддерживают вас или использование вами данного произведения).
  1079  Υπό τις ακόλουθες προϋποθέσεις:`
  1080  
  1081  // Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
  1082  const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
  1083  τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
  1084  (χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
  1085  τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
  1086  τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
  1087  μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
  1088  παρόμοια άδεια.`
  1089  
  1090  // Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
  1091  const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
  1092  تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
  1093  الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
  1094  المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
  1095  من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
  1096  لهذا الترخيص.`
  1097  
  1098  // Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
  1099  const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
  1100  המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
  1101  שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
  1102  לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
  1103  החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`
  1104  
  1105  const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il
  1106  
  1107  // Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
  1108  const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
  1109  (Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
  1110  원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
  1111  이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다). 
  1112  동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
  1113  라이선스와 동일한 라이선스를 적용해야 합니다.`
  1114  
  1115  // Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
  1116  const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
  1117  มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
  1118  ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
  1119  คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
  1120  อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
  1121  อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`
  1122  
  1123  const threeByteUtf8 = txt_th
  1124  
  1125  // Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
  1126  const txt_jp = `あなたの従うべき条件は以下の通りです。
  1127  表示 — あなたは原著作者のクレジットを表示しなければなりません。
  1128  継承 — もしあなたがこの作品を改変、変形または加工した場合、
  1129  あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
  1130  頒布することができます。`
  1131  
  1132  // http://creativecommons.org/licenses/by-sa/2.5/cn/
  1133  const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
  1134  广播或通过信息网络传播本作品 创作演绎作品
  1135  对本作品进行商业性使用 惟须遵守下列条件:
  1136  署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
  1137  相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
  1138  您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`
  1139  
  1140  const txt_cjk = txt_cn + txt_jp + txt_kr
  1141  const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk
  1142  
  1143  var txt_all_bytes = []byte(txt_all)