github.com/lianghucheng/zrddz@v0.0.0-20200923083010-c71f680932e2/src/golang.org/x/text/unicode/norm/normalize_test.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package norm
     6  
     7  import (
     8  	"bytes"
     9  	"flag"
    10  	"fmt"
    11  	"io"
    12  	"log"
    13  	"strings"
    14  	"testing"
    15  	"unicode/utf8"
    16  
    17  	"golang.org/x/text/internal/testtext"
    18  	"golang.org/x/text/transform"
    19  )
    20  
var (
	// testn restricts runAppendTests to the table entry with this index.
	// The default of -1 runs every entry.
	testn = flag.Int("testn", -1, "specific test number to run or -1 for all")
)
    24  
    25  // pc replaces any rune r that is repeated n times, for n > 1, with r{n}.
    26  func pc(s string) []byte {
    27  	b := bytes.NewBuffer(make([]byte, 0, len(s)))
    28  	for i := 0; i < len(s); {
    29  		r, sz := utf8.DecodeRuneInString(s[i:])
    30  		n := 0
    31  		if sz == 1 {
    32  			// Special-case one-byte case to handle repetition for invalid UTF-8.
    33  			for c := s[i]; i+n < len(s) && s[i+n] == c; n++ {
    34  			}
    35  		} else {
    36  			for _, r2 := range s[i:] {
    37  				if r2 != r {
    38  					break
    39  				}
    40  				n++
    41  			}
    42  		}
    43  		b.WriteString(s[i : i+sz])
    44  		if n > 1 {
    45  			fmt.Fprintf(b, "{%d}", n)
    46  		}
    47  		i += sz * n
    48  	}
    49  	return b.Bytes()
    50  }
    51  
    52  // pidx finds the index from which two strings start to differ, plus context.
    53  // It returns the index and ellipsis if the index is greater than 0.
    54  func pidx(a, b string) (i int, prefix string) {
    55  	for ; i < len(a) && i < len(b) && a[i] == b[i]; i++ {
    56  	}
    57  	if i < 8 {
    58  		return 0, ""
    59  	}
    60  	i -= 3 // ensure taking at least one full rune before the difference.
    61  	for k := i - 7; i > k && !utf8.RuneStart(a[i]); i-- {
    62  	}
    63  	return i, "..."
    64  }
    65  
// PositionTest is one table entry for runPosTests: the positionFunc under test
// is called with input and must return pos as its position and buffer as its
// output bytes.
type PositionTest struct {
	input  string
	pos    int    // expected position returned by the positionFunc
	buffer string // expected contents of reorderBuffer, if applicable
}
    71  
// positionFunc adapts a function under test to runPosTests. It receives the
// shared reorderBuffer and the test input, and returns a position and output
// bytes that runPosTests compares with PositionTest.pos and
// PositionTest.buffer.
type positionFunc func(rb *reorderBuffer, s string) (int, []byte)
    73  
    74  func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
    75  	rb := reorderBuffer{}
    76  	rb.init(f, nil)
    77  	for i, test := range tests {
    78  		rb.reset()
    79  		rb.src = inputString(test.input)
    80  		rb.nsrc = len(test.input)
    81  		pos, out := fn(&rb, test.input)
    82  		if pos != test.pos {
    83  			t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
    84  		}
    85  		if outs := string(out); outs != test.buffer {
    86  			k, pfx := pidx(outs, test.buffer)
    87  			t.Errorf("%s:%d: buffer \nwas  %s%+q; \nwant %s%+q", name, i, pfx, pc(outs[k:]), pfx, pc(test.buffer[k:]))
    88  		}
    89  	}
    90  }
    91  
    92  func grave(n int) string {
    93  	return rep(0x0300, n)
    94  }
    95  
    96  func rep(r rune, n int) string {
    97  	return strings.Repeat(string(r), n)
    98  }
    99  
// segSize is a short name for maxByteBufferSize. The append tables use it to
// build a run of ASCII exactly that long.
const segSize = maxByteBufferSize

// cgj is a short name for GraphemeJoiner. The tables use it in expected
// outputs where a long run of non-starters is split.
var cgj = GraphemeJoiner
   103  
   104  var decomposeSegmentTests = []PositionTest{
   105  	// illegal runes
   106  	{"\xC2", 0, ""},
   107  	{"\xC0", 1, "\xC0"},
   108  	{"\u00E0\x80", 2, "\u0061\u0300"},
   109  	// starter
   110  	{"a", 1, "a"},
   111  	{"ab", 1, "a"},
   112  	// starter + composing
   113  	{"a\u0300", 3, "a\u0300"},
   114  	{"a\u0300b", 3, "a\u0300"},
   115  	// with decomposition
   116  	{"\u00C0", 2, "A\u0300"},
   117  	{"\u00C0b", 2, "A\u0300"},
   118  	// long
   119  	{grave(31), 60, grave(30) + cgj},
   120  	{"a" + grave(31), 61, "a" + grave(30) + cgj},
   121  
   122  	// Stability tests: see https://www.unicode.org/review/pr-29.html.
   123  	// U+0300 COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;
   124  	// U+0B47 ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
   125  	// U+0B3E ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
   126  	// U+1100 HANGUL CHOSEONG KIYEOK;Lo;0;L;;;;;N;;;;;
   127  	// U+1161 HANGUL JUNGSEONG A;Lo;0;L;;;;;N;;;;;
   128  	{"\u0B47\u0300\u0B3E", 8, "\u0B47\u0300\u0B3E"},
   129  	{"\u1100\u0300\u1161", 8, "\u1100\u0300\u1161"},
   130  	{"\u0B47\u0B3E", 6, "\u0B47\u0B3E"},
   131  	{"\u1100\u1161", 6, "\u1100\u1161"},
   132  
   133  	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
   134  	// Sequence of decomposing characters that are starters and modifiers.
   135  	{"\u0d4a" + strings.Repeat("\u0d3e", 31), 90, "\u0d46" + strings.Repeat("\u0d3e", 30) + cgj},
   136  
   137  	{grave(30), 60, grave(30)},
   138  	// U+FF9E is a starter, but decomposes to U+3099, which is not.
   139  	{grave(30) + "\uff9e", 60, grave(30) + cgj},
   140  	// ends with incomplete UTF-8 encoding
   141  	{"\xCC", 0, ""},
   142  	{"\u0300\xCC", 2, "\u0300"},
   143  }
   144  
   145  func decomposeSegmentF(rb *reorderBuffer, s string) (int, []byte) {
   146  	rb.initString(NFD, s)
   147  	rb.setFlusher(nil, appendFlush)
   148  	p := decomposeSegment(rb, 0, true)
   149  	return p, rb.out
   150  }
   151  
   152  func TestDecomposeSegment(t *testing.T) {
   153  	runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
   154  }
   155  
   156  var firstBoundaryTests = []PositionTest{
   157  	// no boundary
   158  	{"", -1, ""},
   159  	{"\u0300", -1, ""},
   160  	{"\x80\x80", -1, ""},
   161  	// illegal runes
   162  	{"\xff", 0, ""},
   163  	{"\u0300\xff", 2, ""},
   164  	{"\u0300\xc0\x80\x80", 2, ""},
   165  	// boundaries
   166  	{"a", 0, ""},
   167  	{"\u0300a", 2, ""},
   168  	// Hangul
   169  	{"\u1103\u1161", 0, ""},
   170  	{"\u110B\u1173\u11B7", 0, ""},
   171  	{"\u1161\u110B\u1173\u11B7", 3, ""},
   172  	{"\u1173\u11B7\u1103\u1161", 6, ""},
   173  	// too many combining characters.
   174  	{grave(maxNonStarters - 1), -1, ""},
   175  	{grave(maxNonStarters), 60, ""},
   176  	{grave(maxNonStarters + 1), 60, ""},
   177  }
   178  
   179  func firstBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
   180  	return rb.f.form.FirstBoundary([]byte(s)), nil
   181  }
   182  
   183  func firstBoundaryStringF(rb *reorderBuffer, s string) (int, []byte) {
   184  	return rb.f.form.FirstBoundaryInString(s), nil
   185  }
   186  
   187  func TestFirstBoundary(t *testing.T) {
   188  	runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
   189  	runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
   190  }
   191  
   192  func TestNextBoundary(t *testing.T) {
   193  	testCases := []struct {
   194  		input string
   195  		atEOF bool
   196  		want  int
   197  	}{
   198  		// no boundary
   199  		{"", true, 0},
   200  		{"", false, -1},
   201  		{"\u0300", true, 2},
   202  		{"\u0300", false, -1},
   203  		{"\x80\x80", true, 1},
   204  		{"\x80\x80", false, 1},
   205  		// illegal runes
   206  		{"\xff", false, 1},
   207  		{"\u0300\xff", false, 2},
   208  		{"\u0300\xc0\x80\x80", false, 2},
   209  		{"\xc2\x80\x80", false, 2},
   210  		{"\xc2", false, -1},
   211  		{"\xc2", true, 1},
   212  		{"a\u0300\xc2", false, -1},
   213  		{"a\u0300\xc2", true, 3},
   214  		// boundaries
   215  		{"a", true, 1},
   216  		{"a", false, -1},
   217  		{"aa", false, 1},
   218  		{"\u0300", true, 2},
   219  		{"\u0300", false, -1},
   220  		{"\u0300a", false, 2},
   221  		// Hangul
   222  		{"\u1103\u1161", true, 6},
   223  		{"\u1103\u1161", false, -1},
   224  		{"\u110B\u1173\u11B7", false, -1},
   225  		{"\u110B\u1173\u11B7\u110B\u1173\u11B7", false, 9},
   226  		{"\u1161\u110B\u1173\u11B7", false, 3},
   227  		{"\u1173\u11B7\u1103\u1161", false, 6},
   228  		// too many combining characters.
   229  		{grave(maxNonStarters - 1), false, -1},
   230  		{grave(maxNonStarters), false, 60},
   231  		{grave(maxNonStarters + 1), false, 60},
   232  	}
   233  
   234  	for _, tc := range testCases {
   235  		if got := NFC.NextBoundary([]byte(tc.input), tc.atEOF); got != tc.want {
   236  			t.Errorf("NextBoundary(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
   237  		}
   238  		if got := NFC.NextBoundaryInString(tc.input, tc.atEOF); got != tc.want {
   239  			t.Errorf("NextBoundaryInString(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
   240  		}
   241  	}
   242  }
   243  
   244  var decomposeToLastTests = []PositionTest{
   245  	// ends with inert character
   246  	{"Hello!", 6, ""},
   247  	{"\u0632", 2, ""},
   248  	{"a\u0301\u0635", 5, ""},
   249  	// ends with non-inert starter
   250  	{"a", 0, "a"},
   251  	{"a\u0301a", 3, "a"},
   252  	{"a\u0301\u03B9", 3, "\u03B9"},
   253  	{"a\u0327", 0, "a\u0327"},
   254  	// illegal runes
   255  	{"\xFF", 1, ""},
   256  	{"aa\xFF", 3, ""},
   257  	{"\xC0\x80\x80", 3, ""},
   258  	{"\xCC\x80\x80", 3, ""},
   259  	// ends with incomplete UTF-8 encoding
   260  	{"a\xCC", 2, ""},
   261  	// ends with combining characters
   262  	{"\u0300\u0301", 0, "\u0300\u0301"},
   263  	{"a\u0300\u0301", 0, "a\u0300\u0301"},
   264  	{"a\u0301\u0308", 0, "a\u0301\u0308"},
   265  	{"a\u0308\u0301", 0, "a\u0308\u0301"},
   266  	{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
   267  	{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
   268  	{"\u00C0", 0, "A\u0300"},
   269  	{"a\u00C0", 1, "A\u0300"},
   270  	// decomposing
   271  	{"a\u0300\u00E0", 3, "a\u0300"},
   272  	// multisegment decompositions (flushes leading segments)
   273  	{"a\u0300\uFDC0", 7, "\u064A"},
   274  	{"\uFDC0" + grave(29), 4, "\u064A" + grave(29)},
   275  	{"\uFDC0" + grave(30), 4, "\u064A" + grave(30)},
   276  	{"\uFDC0" + grave(31), 5, grave(30)},
   277  	{"\uFDFA" + grave(14), 31, "\u0645" + grave(14)},
   278  	// Overflow
   279  	{"\u00E0" + grave(29), 0, "a" + grave(30)},
   280  	{"\u00E0" + grave(30), 2, grave(30)},
   281  	// Hangul
   282  	{"a\u1103", 1, "\u1103"},
   283  	{"a\u110B", 1, "\u110B"},
   284  	{"a\u110B\u1173", 1, "\u110B\u1173"},
   285  	// See comment in composition.go:compBoundaryAfter.
   286  	{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
   287  	{"a\uC73C", 1, "\u110B\u1173"},
   288  	{"다음", 3, "\u110B\u1173\u11B7"},
   289  	{"다", 0, "\u1103\u1161"},
   290  	{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
   291  	{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
   292  	{"다음음", 6, "\u110B\u1173\u11B7"},
   293  	{"음다다", 6, "\u1103\u1161"},
   294  	// maximized buffer
   295  	{"a" + grave(30), 0, "a" + grave(30)},
   296  	// Buffer overflow
   297  	{"a" + grave(31), 3, grave(30)},
   298  	// weird UTF-8
   299  	{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
   300  }
   301  
   302  func decomposeToLast(rb *reorderBuffer, s string) (int, []byte) {
   303  	rb.setFlusher([]byte(s), appendFlush)
   304  	decomposeToLastBoundary(rb)
   305  	buf := rb.flush(nil)
   306  	return len(rb.out), buf
   307  }
   308  
   309  func TestDecomposeToLastBoundary(t *testing.T) {
   310  	runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
   311  }
   312  
   313  var lastBoundaryTests = []PositionTest{
   314  	// ends with inert character
   315  	{"Hello!", 6, ""},
   316  	{"\u0632", 2, ""},
   317  	// ends with non-inert starter
   318  	{"a", 0, ""},
   319  	// illegal runes
   320  	{"\xff", 1, ""},
   321  	{"aa\xff", 3, ""},
   322  	{"a\xff\u0300", 1, ""}, // TODO: should probably be 2.
   323  	{"\xc0\x80\x80", 3, ""},
   324  	{"\xc0\x80\x80\u0300", 3, ""},
   325  	// ends with incomplete UTF-8 encoding
   326  	{"\xCC", -1, ""},
   327  	{"\xE0\x80", -1, ""},
   328  	{"\xF0\x80\x80", -1, ""},
   329  	{"a\xCC", 0, ""},
   330  	{"\x80\xCC", 1, ""},
   331  	{"\xCC\xCC", 1, ""},
   332  	// ends with combining characters
   333  	{"a\u0300\u0301", 0, ""},
   334  	{"aaaa\u0300\u0301", 3, ""},
   335  	{"\u0300a\u0300\u0301", 2, ""},
   336  	{"\u00C2", 0, ""},
   337  	{"a\u00C2", 1, ""},
   338  	// decomposition may recombine
   339  	{"\u0226", 0, ""},
   340  	// no boundary
   341  	{"", -1, ""},
   342  	{"\u0300\u0301", -1, ""},
   343  	{"\u0300", -1, ""},
   344  	{"\x80\x80", -1, ""},
   345  	{"\x80\x80\u0301", -1, ""},
   346  	// Hangul
   347  	{"다음", 3, ""},
   348  	{"다", 0, ""},
   349  	{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
   350  	{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
   351  	// too many combining characters.
   352  	{grave(maxNonStarters - 1), -1, ""},
   353  	// May still be preceded with a non-starter.
   354  	{grave(maxNonStarters), -1, ""},
   355  	// May still need to insert a cgj after the last combiner.
   356  	{grave(maxNonStarters + 1), 2, ""},
   357  	{grave(maxNonStarters + 2), 4, ""},
   358  
   359  	{"a" + grave(maxNonStarters-1), 0, ""},
   360  	{"a" + grave(maxNonStarters), 0, ""},
   361  	// May still need to insert a cgj after the last combiner.
   362  	{"a" + grave(maxNonStarters+1), 3, ""},
   363  	{"a" + grave(maxNonStarters+2), 5, ""},
   364  }
   365  
   366  func lastBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
   367  	return rb.f.form.LastBoundary([]byte(s)), nil
   368  }
   369  
   370  func TestLastBoundary(t *testing.T) {
   371  	runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
   372  }
   373  
// spanTest is one table entry for runSpanTests: calling Span (and SpanString)
// on input with the given atEOF flag must return exactly n and err.
type spanTest struct {
	input string
	atEOF bool
	n     int   // expected number of bytes returned by Span
	err   error // expected error, compared with ==
}
   380  
   381  var quickSpanTests = []spanTest{
   382  	{"", true, 0, nil},
   383  	// starters
   384  	{"a", true, 1, nil},
   385  	{"abc", true, 3, nil},
   386  	{"\u043Eb", true, 3, nil},
   387  	// incomplete last rune.
   388  	{"\xCC", true, 1, nil},
   389  	{"\xCC", false, 0, transform.ErrShortSrc},
   390  	{"a\xCC", true, 2, nil},
   391  	{"a\xCC", false, 0, transform.ErrShortSrc}, // TODO: could be 1 for NFD
   392  	// incorrectly ordered combining characters
   393  	{"\u0300\u0316", true, 0, transform.ErrEndOfSpan},
   394  	{"\u0300\u0316", false, 0, transform.ErrEndOfSpan},
   395  	{"\u0300\u0316cd", true, 0, transform.ErrEndOfSpan},
   396  	{"\u0300\u0316cd", false, 0, transform.ErrEndOfSpan},
   397  	// have a maximum number of combining characters.
   398  	{rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
   399  	{"a" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
   400  	{"Ɵ" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
   401  	{"aa" + rep(0x035D, 30) + "\u035B", true, 1, transform.ErrEndOfSpan},
   402  	{rep(0x035D, 30) + cgj + "\u035B", true, 64, nil},
   403  	{"a" + rep(0x035D, 30) + cgj + "\u035B", true, 65, nil},
   404  	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
   405  	{"aa" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
   406  
   407  	{"a" + rep(0x035D, 30) + cgj + "\u035B", false, 61, transform.ErrShortSrc},
   408  	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
   409  	{"aa" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
   410  }
   411  
   412  var quickSpanNFDTests = []spanTest{
   413  	// needs decomposing
   414  	{"\u00C0", true, 0, transform.ErrEndOfSpan},
   415  	{"abc\u00C0", true, 3, transform.ErrEndOfSpan},
   416  	// correctly ordered combining characters
   417  	{"\u0300", true, 2, nil},
   418  	{"ab\u0300", true, 4, nil},
   419  	{"ab\u0300cd", true, 6, nil},
   420  	{"\u0300cd", true, 4, nil},
   421  	{"\u0316\u0300", true, 4, nil},
   422  	{"ab\u0316\u0300", true, 6, nil},
   423  	{"ab\u0316\u0300cd", true, 8, nil},
   424  	{"ab\u0316\u0300\u00C0", true, 6, transform.ErrEndOfSpan},
   425  	{"\u0316\u0300cd", true, 6, nil},
   426  	{"\u043E\u0308b", true, 5, nil},
   427  	// incorrectly ordered combining characters
   428  	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan}, // TODO: we could skip 'b' as well.
   429  	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
   430  	// Hangul
   431  	{"같은", true, 0, transform.ErrEndOfSpan},
   432  }
   433  
   434  var quickSpanNFCTests = []spanTest{
   435  	// okay composed
   436  	{"\u00C0", true, 2, nil},
   437  	{"abc\u00C0", true, 5, nil},
   438  	// correctly ordered combining characters
   439  	// TODO: b may combine with modifiers, which is why this fails. We could
   440  	// make a more precise test that actually checks whether the last
   441  	// character combines. Probably not worth it.
   442  	{"ab\u0300", true, 1, transform.ErrEndOfSpan},
   443  	{"ab\u0300cd", true, 1, transform.ErrEndOfSpan},
   444  	{"ab\u0316\u0300", true, 1, transform.ErrEndOfSpan},
   445  	{"ab\u0316\u0300cd", true, 1, transform.ErrEndOfSpan},
   446  	{"\u00C0\u035D", true, 4, nil},
   447  	// we do not special case leading combining characters
   448  	{"\u0300cd", true, 0, transform.ErrEndOfSpan},
   449  	{"\u0300", true, 0, transform.ErrEndOfSpan},
   450  	{"\u0316\u0300", true, 0, transform.ErrEndOfSpan},
   451  	{"\u0316\u0300cd", true, 0, transform.ErrEndOfSpan},
   452  	// incorrectly ordered combining characters
   453  	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan},
   454  	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
   455  	// Hangul
   456  	{"같은", true, 6, nil},
   457  	{"같은", false, 3, transform.ErrShortSrc},
   458  	// We return the start of the violating segment in case of overflow.
   459  	{grave(30) + "\uff9e", true, 0, transform.ErrEndOfSpan},
   460  	{grave(30), true, 0, transform.ErrEndOfSpan},
   461  }
   462  
   463  func runSpanTests(t *testing.T, name string, f Form, testCases []spanTest) {
   464  	for i, tc := range testCases {
   465  		s := fmt.Sprintf("Bytes/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
   466  		ok := testtext.Run(t, s, func(t *testing.T) {
   467  			n, err := f.Span([]byte(tc.input), tc.atEOF)
   468  			if n != tc.n || err != tc.err {
   469  				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
   470  			}
   471  		})
   472  		if !ok {
   473  			continue // Don't do the String variant if the Bytes variant failed.
   474  		}
   475  		s = fmt.Sprintf("String/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
   476  		testtext.Run(t, s, func(t *testing.T) {
   477  			n, err := f.SpanString(tc.input, tc.atEOF)
   478  			if n != tc.n || err != tc.err {
   479  				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
   480  			}
   481  		})
   482  	}
   483  }
   484  
   485  func TestSpan(t *testing.T) {
   486  	runSpanTests(t, "NFD", NFD, quickSpanTests)
   487  	runSpanTests(t, "NFD", NFD, quickSpanNFDTests)
   488  	runSpanTests(t, "NFC", NFC, quickSpanTests)
   489  	runSpanTests(t, "NFC", NFC, quickSpanNFCTests)
   490  }
   491  
   492  var isNormalTests = []PositionTest{
   493  	{"", 1, ""},
   494  	// illegal runes
   495  	{"\xff", 1, ""},
   496  	// starters
   497  	{"a", 1, ""},
   498  	{"abc", 1, ""},
   499  	{"\u043Eb", 1, ""},
   500  	// incorrectly ordered combining characters
   501  	{"\u0300\u0316", 0, ""},
   502  	{"ab\u0300\u0316", 0, ""},
   503  	{"ab\u0300\u0316cd", 0, ""},
   504  	{"\u0300\u0316cd", 0, ""},
   505  }
   506  var isNormalNFDTests = []PositionTest{
   507  	// needs decomposing
   508  	{"\u00C0", 0, ""},
   509  	{"abc\u00C0", 0, ""},
   510  	// correctly ordered combining characters
   511  	{"\u0300", 1, ""},
   512  	{"ab\u0300", 1, ""},
   513  	{"ab\u0300cd", 1, ""},
   514  	{"\u0300cd", 1, ""},
   515  	{"\u0316\u0300", 1, ""},
   516  	{"ab\u0316\u0300", 1, ""},
   517  	{"ab\u0316\u0300cd", 1, ""},
   518  	{"\u0316\u0300cd", 1, ""},
   519  	{"\u043E\u0308b", 1, ""},
   520  	// Hangul
   521  	{"같은", 0, ""},
   522  }
   523  var isNormalNFCTests = []PositionTest{
   524  	// okay composed
   525  	{"\u00C0", 1, ""},
   526  	{"abc\u00C0", 1, ""},
   527  	// need reordering
   528  	{"a\u0300", 0, ""},
   529  	{"a\u0300cd", 0, ""},
   530  	{"a\u0316\u0300", 0, ""},
   531  	{"a\u0316\u0300cd", 0, ""},
   532  	// correctly ordered combining characters
   533  	{"ab\u0300", 1, ""},
   534  	{"ab\u0300cd", 1, ""},
   535  	{"ab\u0316\u0300", 1, ""},
   536  	{"ab\u0316\u0300cd", 1, ""},
   537  	{"\u00C0\u035D", 1, ""},
   538  	{"\u0300", 1, ""},
   539  	{"\u0316\u0300cd", 1, ""},
   540  	// Hangul
   541  	{"같은", 1, ""},
   542  }
   543  
   544  var isNormalNFKXTests = []PositionTest{
   545  	// Special case.
   546  	{"\u00BC", 0, ""},
   547  }
   548  
   549  func isNormalF(rb *reorderBuffer, s string) (int, []byte) {
   550  	if rb.f.form.IsNormal([]byte(s)) {
   551  		return 1, nil
   552  	}
   553  	return 0, nil
   554  }
   555  
   556  func isNormalStringF(rb *reorderBuffer, s string) (int, []byte) {
   557  	if rb.f.form.IsNormalString(s) {
   558  		return 1, nil
   559  	}
   560  	return 0, nil
   561  }
   562  
   563  func TestIsNormal(t *testing.T) {
   564  	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
   565  	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
   566  	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
   567  	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
   568  	runPosTests(t, "TestIsNormalNFKD1", NFKD, isNormalF, isNormalTests)
   569  	runPosTests(t, "TestIsNormalNFKD2", NFKD, isNormalF, isNormalNFDTests)
   570  	runPosTests(t, "TestIsNormalNFKD3", NFKD, isNormalF, isNormalNFKXTests)
   571  	runPosTests(t, "TestIsNormalNFKC1", NFKC, isNormalF, isNormalTests)
   572  	runPosTests(t, "TestIsNormalNFKC2", NFKC, isNormalF, isNormalNFCTests)
   573  	runPosTests(t, "TestIsNormalNFKC3", NFKC, isNormalF, isNormalNFKXTests)
   574  }
   575  
   576  func TestIsNormalString(t *testing.T) {
   577  	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalStringF, isNormalTests)
   578  	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalStringF, isNormalNFDTests)
   579  	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalStringF, isNormalTests)
   580  	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalStringF, isNormalNFCTests)
   581  }
   582  
// AppendTest is one table entry for runAppendTests: applying the appendFunc
// under test to a buffer holding left and the string right must produce out.
type AppendTest struct {
	left  string // initial contents of the output buffer
	right string // string passed to the appendFunc
	out   string // expected result
}
   588  
// appendFunc adapts a function under test to runAppendTests. It is called
// with a form, the bytes already written (out), and the string to add (s), and
// returns the bytes that are compared with the expected output.
type appendFunc func(f Form, out []byte, s string) []byte
   590  
// fstr holds the form names used for subtest names. It is indexed by Form
// value in runAppendTests.
var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}
   592  
   593  func runNormTests(t *testing.T, name string, fn appendFunc) {
   594  	for f := NFC; f <= NFKD; f++ {
   595  		runAppendTests(t, name, f, fn, normTests[f])
   596  	}
   597  }
   598  
   599  func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
   600  	for i, test := range tests {
   601  		t.Run(fmt.Sprintf("%s/%d", fstr[f], i), func(t *testing.T) {
   602  			id := pc(test.left + test.right)
   603  			if *testn >= 0 && i != *testn {
   604  				return
   605  			}
   606  			t.Run("fn", func(t *testing.T) {
   607  				out := []byte(test.left)
   608  				have := string(fn(f, out, test.right))
   609  				if len(have) != len(test.out) {
   610  					t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(test.out), pc(have), pc(test.out))
   611  				}
   612  				if have != test.out {
   613  					k, pf := pidx(have, test.out)
   614  					t.Errorf("%+q:\nwas  %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(test.out[k:]))
   615  				}
   616  			})
   617  
   618  			// Bootstrap by normalizing input. Ensures that the various variants
   619  			// behave the same.
   620  			for g := NFC; g <= NFKD; g++ {
   621  				if f == g {
   622  					continue
   623  				}
   624  				t.Run(fstr[g], func(t *testing.T) {
   625  					want := g.String(test.left + test.right)
   626  					have := string(fn(g, g.AppendString(nil, test.left), test.right))
   627  					if len(have) != len(want) {
   628  						t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(want), pc(have), pc(want))
   629  					}
   630  					if have != want {
   631  						k, pf := pidx(have, want)
   632  						t.Errorf("%+q:\nwas  %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(want[k:]))
   633  					}
   634  				})
   635  			}
   636  		})
   637  	}
   638  }
   639  
// normTests holds one append test table per form. runNormTests indexes it by
// Form value, so the order of the entries must match the Form constants.
var normTests = [][]AppendTest{
	appendTestsNFC,
	appendTestsNFD,
	appendTestsNFKC,
	appendTestsNFKD,
}
   646  
   647  var appendTestsNFC = []AppendTest{
   648  	{"", ascii, ascii},
   649  	{"", txt_all, txt_all},
   650  	{"\uff9e", grave(30), "\uff9e" + grave(29) + cgj + grave(1)},
   651  	{grave(30), "\uff9e", grave(30) + cgj + "\uff9e"},
   652  
   653  	// Tests designed for Iter.
   654  	{ // ordering of non-composing combining characters
   655  		"",
   656  		"\u0305\u0316",
   657  		"\u0316\u0305",
   658  	},
   659  	{ // segment overflow
   660  		"",
   661  		"a" + rep(0x0305, maxNonStarters+4) + "\u0316",
   662  		"a" + rep(0x0305, maxNonStarters) + cgj + "\u0316" + rep(0x305, 4),
   663  	},
   664  
   665  	{ // Combine across non-blocking non-starters.
   666  		// U+0327 COMBINING CEDILLA;Mn;202;NSM;;;;;N;NON-SPACING CEDILLA;;;;
   667  		// U+0325 COMBINING RING BELOW;Mn;220;NSM;;;;;N;NON-SPACING RING BELOW;;;;
   668  		"", "a\u0327\u0325", "\u1e01\u0327",
   669  	},
   670  
   671  	{ // Jamo V+T does not combine.
   672  		"",
   673  		"\u1161\u11a8",
   674  		"\u1161\u11a8",
   675  	},
   676  
   677  	// Stability tests: see https://www.unicode.org/review/pr-29.html.
   678  	{"", "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"},
   679  	{"", "\u1100\u0300\u1161", "\u1100\u0300\u1161"},
   680  	{"", "\u0b47\u0b3e", "\u0b4b"},
   681  	{"", "\u1100\u1161", "\uac00"},
   682  
   683  	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
   684  	{ // 0d4a starts a new segment.
   685  		"",
   686  		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
   687  		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
   688  	},
   689  
   690  	{ // Split combining characters.
   691  		// TODO: don't insert CGJ before starters.
   692  		"",
   693  		"\u0d46" + strings.Repeat("\u0d3e", 31),
   694  		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
   695  	},
   696  
   697  	{ // Split combining characters.
   698  		"",
   699  		"\u0d4a" + strings.Repeat("\u0d3e", 30),
   700  		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
   701  	},
   702  
   703  	{ //  https://golang.org/issues/20079
   704  		"",
   705  		"\xeb\u0344",
   706  		"\xeb\u0308\u0301",
   707  	},
   708  
   709  	{ //  https://golang.org/issues/20079
   710  		"",
   711  		"\uac00" + strings.Repeat("\u0300", 30),
   712  		"\uac00" + strings.Repeat("\u0300", 29) + "\u034f\u0300",
   713  	},
   714  
   715  	{ //  https://golang.org/issues/20079
   716  		"",
   717  		"\xeb" + strings.Repeat("\u0300", 31),
   718  		"\xeb" + strings.Repeat("\u0300", 30) + "\u034f\u0300",
   719  	},
   720  }
   721  
// appendTestsNFD is the NFD entry of normTests. It is currently empty.
var appendTestsNFD = []AppendTest{
	// TODO: Move some of the tests here.
}
   725  
   726  var appendTestsNFKC = []AppendTest{
   727  	// empty buffers
   728  	{"", "", ""},
   729  	{"a", "", "a"},
   730  	{"", "a", "a"},
   731  	{"", "\u0041\u0307\u0304", "\u01E0"},
   732  	// segment split across buffers
   733  	{"", "a\u0300b", "\u00E0b"},
   734  	{"a", "\u0300b", "\u00E0b"},
   735  	{"a", "\u0300\u0316", "\u00E0\u0316"},
   736  	{"a", "\u0316\u0300", "\u00E0\u0316"},
   737  	{"a", "\u0300a\u0300", "\u00E0\u00E0"},
   738  	{"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"},
   739  	{"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"},
   740  	{"a\u0300", "\u0327", "\u00E0\u0327"},
   741  	{"a\u0327", "\u0300", "\u00E0\u0327"},
   742  	{"a\u0316", "\u0300", "\u00E0\u0316"},
   743  	{"\u0041\u0307", "\u0304", "\u01E0"},
   744  	// Hangul
   745  	{"", "\u110B\u1173", "\uC73C"},
   746  	{"", "\u1103\u1161", "\uB2E4"},
   747  	{"", "\u110B\u1173\u11B7", "\uC74C"},
   748  	{"", "\u320E", "\x28\uAC00\x29"},
   749  	{"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"},
   750  	{"\u1103", "\u1161", "\uB2E4"},
   751  	{"\u110B", "\u1173\u11B7", "\uC74C"},
   752  	{"\u110B\u1173", "\u11B7", "\uC74C"},
   753  	{"\uC73C", "\u11B7", "\uC74C"},
   754  	// UTF-8 encoding split across buffers
   755  	{"a\xCC", "\x80", "\u00E0"},
   756  	{"a\xCC", "\x80b", "\u00E0b"},
   757  	{"a\xCC", "\x80a\u0300", "\u00E0\u00E0"},
   758  	{"a\xCC", "\x80\x80", "\u00E0\x80"},
   759  	{"a\xCC", "\x80\xCC", "\u00E0\xCC"},
   760  	{"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"},
   761  	// ending in incomplete UTF-8 encoding
   762  	{"", "\xCC", "\xCC"},
   763  	{"a", "\xCC", "a\xCC"},
   764  	{"a", "b\xCC", "ab\xCC"},
   765  	{"\u0226", "\xCC", "\u0226\xCC"},
   766  	// illegal runes
   767  	{"", "\x80", "\x80"},
   768  	{"", "\x80\x80\x80", "\x80\x80\x80"},
   769  	{"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"},
   770  	{"", "a\x80", "a\x80"},
   771  	{"", "a\x80\x80\x80", "a\x80\x80\x80"},
   772  	{"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"},
   773  	{"a", "\x80\x80\x80", "a\x80\x80\x80"},
   774  	// overflow
   775  	{"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)},
   776  	{strings.Repeat("\x80", 33), "", strings.Repeat("\x80", 33)},
   777  	{strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)},
   778  	// overflow of combining characters
   779  	{"", grave(34), grave(30) + cgj + grave(4)},
   780  	{"", grave(36), grave(30) + cgj + grave(6)},
   781  	{grave(29), grave(5), grave(30) + cgj + grave(4)},
   782  	{grave(30), grave(4), grave(30) + cgj + grave(4)},
   783  	{grave(30), grave(3), grave(30) + cgj + grave(3)},
   784  	{grave(30) + "\xCC", "\x80", grave(30) + cgj + grave(1)},
   785  	{"", "\uFDFA" + grave(14), "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645" + grave(14)},
   786  	{"", "\uFDFA" + grave(28) + "\u0316", "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645\u0316" + grave(28)},
   787  	// - First rune has a trailing non-starter.
   788  	{"\u00d5", grave(30), "\u00d5" + grave(29) + cgj + grave(1)},
   789  	// - U+FF9E decomposes into a non-starter in compatibility mode. A CGJ must be
   790  	//   inserted even when FF9E starts a new segment.
   791  	{"\uff9e", grave(30), "\u3099" + grave(29) + cgj + grave(1)},
   792  	{grave(30), "\uff9e", grave(30) + cgj + "\u3099"},
   793  	// - Many non-starter decompositions in a row causing overflow.
   794  	{"", rep(0x340, 31), rep(0x300, 30) + cgj + "\u0300"},
   795  	{"", rep(0xFF9E, 31), rep(0x3099, 30) + cgj + "\u3099"},
   796  
   797  	{"", "\u0644\u0625" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + "\u0300\u0300"},
   798  	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
   799  	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
   800  
   801  	// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
   802  	{"", "\u0f7f" + rep(0xf71, 29) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80"},
   803  	{"", "\u0f7f" + rep(0xf71, 28) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + "\u0f80"},
   804  	{"", "\u0f7f" + rep(0xf81, 16), "\u0f7f" + rep(0xf71, 15) + rep(0xf80, 15) + cgj + "\u0f71\u0f80"},
   805  
   806  	// weird UTF-8
   807  	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
   808  	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
   809  	{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
   810  	{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
   811  	{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
   812  	{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
   813  	{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
   814  	{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
   815  
   816  	{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
   817  	// large input.
   818  	{"", strings.Repeat("a\u0300\u0316", 31), strings.Repeat("\u00E0\u0316", 31)},
   819  	{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
   820  	{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
   821  	{"", "\u0041\u0307\u0304", "\u01E0"},
   822  }
   823  
// appendTestsNFKD is the table of AppendTest cases for the NFKD form.
//
// Every entry is a three-string literal. Going by the entries, the fields are
// presumably (already-normalized prefix, text to append, expected result);
// confirm against the AppendTest declaration. Most cases build a long run of
// combining marks with grave or rep and expect cgj to appear inside the run in
// the output, after at most 30 marks.
var appendTestsNFKD = []AppendTest{
	// 64 marks after a single base: the output carries cgj after each group of 30.
	{"", "a" + grave(64), "a" + grave(30) + cgj + grave(30) + cgj + grave(4)},

	{ // segment overflow on unchanged character
		"",
		"a" + grave(64) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(4),
	},
	{ // segment overflow on unchanged character + start value
		"",
		"a" + grave(98) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(8),
	},
	{ // segment overflow on decomposition. (U+0340 decomposes to U+0300.)
		"",
		"a" + grave(59) + "\u0340",
		"a" + grave(30) + cgj + grave(30),
	},
	{ // segment overflow on non-starter decomposition
		"",
		"a" + grave(33) + "\u0340" + grave(30) + "\u0320",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after ASCII overflow
		"",
		rep('a', segSize) + grave(32) + "\u0320",
		rep('a', segSize) + grave(30) + cgj + "\u0320" + grave(2),
	},
	{ // Jamo overflow
		"",
		"\u1100\u1161" + grave(30) + "\u0320" + grave(2),
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul
		"",
		"\uac00",
		"\u1100\u1161",
	},
	{ // Hangul overflow
		"",
		"\uac00" + grave(32) + "\u0320",
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode (two syllables).
		"",
		"\uac00\uac00" + grave(32) + "\u0320",
		"\u1100\u1161\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode (three syllables).
		"",
		strings.Repeat("\uac00", 3) + grave(32) + "\u0320",
		strings.Repeat("\u1100\u1161", 3) + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // start value after cc=0
		"",
		"您您" + grave(34) + "\u0320",
		"您您" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after normalization
		"",
		"\u0300\u0320a" + grave(34) + "\u0320",
		"\u0320\u0300a" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{
		// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
		"",
		"a\u0f7f" + rep(0xf71, 29) + "\u0f81",
		"a\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80",
	},
}
   894  
   895  func TestAppend(t *testing.T) {
   896  	runNormTests(t, "Append", func(f Form, out []byte, s string) []byte {
   897  		return f.Append(out, []byte(s)...)
   898  	})
   899  }
   900  
   901  func TestAppendString(t *testing.T) {
   902  	runNormTests(t, "AppendString", func(f Form, out []byte, s string) []byte {
   903  		return f.AppendString(out, s)
   904  	})
   905  }
   906  
   907  func TestBytes(t *testing.T) {
   908  	runNormTests(t, "Bytes", func(f Form, out []byte, s string) []byte {
   909  		buf := []byte{}
   910  		buf = append(buf, out...)
   911  		buf = append(buf, s...)
   912  		return f.Bytes(buf)
   913  	})
   914  }
   915  
   916  func TestString(t *testing.T) {
   917  	runNormTests(t, "String", func(f Form, out []byte, s string) []byte {
   918  		outs := string(out) + s
   919  		return []byte(f.String(outs))
   920  	})
   921  }
   922  
   923  func TestLinking(t *testing.T) {
   924  	const prog = `
   925  	package main
   926  	import "fmt"
   927  	import "golang.org/x/text/unicode/norm"
   928  	func main() { fmt.Println(norm.%s) }
   929  	`
   930  	baseline, errB := testtext.CodeSize(fmt.Sprintf(prog, "MaxSegmentSize"))
   931  	withTables, errT := testtext.CodeSize(fmt.Sprintf(prog, `NFC.String("")`))
   932  	if errB != nil || errT != nil {
   933  		t.Skipf("code size failed: %v and %v", errB, errT)
   934  	}
   935  	// Tables are at least 50K
   936  	if d := withTables - baseline; d < 50*1024 {
   937  		t.Errorf("tables appear not to be dropped: %d - %d = %d",
   938  			withTables, baseline, d)
   939  	}
   940  }
   941  
   942  func appendBench(f Form, in []byte) func() {
   943  	buf := make([]byte, 0, 4*len(in))
   944  	return func() {
   945  		f.Append(buf, in...)
   946  	}
   947  }
   948  
   949  func bytesBench(f Form, in []byte) func() {
   950  	return func() {
   951  		f.Bytes(in)
   952  	}
   953  }
   954  
   955  func iterBench(f Form, in []byte) func() {
   956  	iter := Iter{}
   957  	return func() {
   958  		iter.Init(f, in)
   959  		for !iter.Done() {
   960  			iter.Next()
   961  		}
   962  	}
   963  }
   964  
   965  func transformBench(f Form, in []byte) func() {
   966  	buf := make([]byte, 4*len(in))
   967  	return func() {
   968  		if _, n, err := f.Transform(buf, in, true); err != nil || len(in) != n {
   969  			log.Panic(n, len(in), err)
   970  		}
   971  	}
   972  }
   973  
   974  func readerBench(f Form, in []byte) func() {
   975  	buf := make([]byte, 4*len(in))
   976  	return func() {
   977  		r := f.Reader(bytes.NewReader(in))
   978  		var err error
   979  		for err == nil {
   980  			_, err = r.Read(buf)
   981  		}
   982  		if err != io.EOF {
   983  			panic("")
   984  		}
   985  	}
   986  }
   987  
   988  func writerBench(f Form, in []byte) func() {
   989  	buf := make([]byte, 0, 4*len(in))
   990  	return func() {
   991  		r := f.Writer(bytes.NewBuffer(buf))
   992  		if _, err := r.Write(in); err != nil {
   993  			panic("")
   994  		}
   995  	}
   996  }
   997  
   998  func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
   999  	bm = append(bm, appendBench(f, in))
  1000  	bm = append(bm, iterBench(f, in))
  1001  	bm = append(bm, transformBench(f, in))
  1002  	bm = append(bm, readerBench(f, in))
  1003  	bm = append(bm, writerBench(f, in))
  1004  	return bm
  1005  }
  1006  
  1007  func doFormBenchmark(b *testing.B, inf, f Form, s string) {
  1008  	b.StopTimer()
  1009  	in := inf.Bytes([]byte(s))
  1010  	bm := appendBenchmarks(nil, f, in)
  1011  	b.SetBytes(int64(len(in) * len(bm)))
  1012  	b.StartTimer()
  1013  	for i := 0; i < b.N; i++ {
  1014  		for _, fn := range bm {
  1015  			fn()
  1016  		}
  1017  	}
  1018  }
  1019  
  1020  func doSingle(b *testing.B, f func(Form, []byte) func(), s []byte) {
  1021  	b.StopTimer()
  1022  	fn := f(NFC, s)
  1023  	b.SetBytes(int64(len(s)))
  1024  	b.StartTimer()
  1025  	for i := 0; i < b.N; i++ {
  1026  		fn()
  1027  	}
  1028  }
  1029  
// Small inputs shared by the single-function benchmarks.
//
// smallNoChange is a short, all-lowercase word containing one non-ASCII
// letter (presumably already in NFC, going by the name).
// smallChange spells the same word with a capital N and with the diaeresis
// written as a separate U+0308 after the 'o'.
// ascii is one ASCII-only sentence repeated 500 times.
var (
	smallNoChange = []byte("nörmalization")
	smallChange   = []byte("No\u0308rmalization")
	ascii         = strings.Repeat("There is nothing to change here! ", 500)
)
  1035  
  1036  func lowerBench(f Form, in []byte) func() {
  1037  	// Use package strings instead of bytes as it doesn't allocate memory
  1038  	// if there aren't any changes.
  1039  	s := string(in)
  1040  	return func() {
  1041  		strings.ToLower(s)
  1042  	}
  1043  }
  1044  
// BenchmarkLowerCaseNoChange times strings.ToLower (through lowerBench) on
// smallNoChange, which is already lowercase.
func BenchmarkLowerCaseNoChange(b *testing.B) {
	doSingle(b, lowerBench, smallNoChange)
}
// BenchmarkLowerCaseChange times strings.ToLower (through lowerBench) on
// smallChange, which starts with a capital letter.
func BenchmarkLowerCaseChange(b *testing.B) {
	doSingle(b, lowerBench, smallChange)
}
  1051  
  1052  func quickSpanBench(f Form, in []byte) func() {
  1053  	return func() {
  1054  		f.QuickSpan(in)
  1055  	}
  1056  }
  1057  
// BenchmarkQuickSpanChangeNFC times NFC.QuickSpan on smallNoChange.
//
// NOTE(review): the name says "Change" but the input is smallNoChange, unlike
// the other *ChangeNFC benchmarks, which use smallChange. Confirm which of the
// two is intended.
func BenchmarkQuickSpanChangeNFC(b *testing.B) {
	doSingle(b, quickSpanBench, smallNoChange)
}
  1061  
// BenchmarkBytesNoChangeNFC times NFC.Bytes on smallNoChange.
func BenchmarkBytesNoChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallNoChange)
}
// BenchmarkBytesChangeNFC times NFC.Bytes on smallChange.
func BenchmarkBytesChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallChange)
}
  1068  
// BenchmarkAppendNoChangeNFC times NFC.Append on smallNoChange.
func BenchmarkAppendNoChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallNoChange)
}
// BenchmarkAppendChangeNFC times NFC.Append on smallChange.
func BenchmarkAppendChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallChange)
}
// BenchmarkAppendLargeNFC times NFC.Append on txt_all_bytes, the combined
// multi-language sample text.
func BenchmarkAppendLargeNFC(b *testing.B) {
	doSingle(b, appendBench, txt_all_bytes)
}
  1078  
// BenchmarkIterNoChangeNFC times an NFC Iter pass over smallNoChange.
func BenchmarkIterNoChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallNoChange)
}
// BenchmarkIterChangeNFC times an NFC Iter pass over smallChange.
func BenchmarkIterChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallChange)
}
// BenchmarkIterLargeNFC times an NFC Iter pass over txt_all_bytes, the
// combined multi-language sample text.
func BenchmarkIterLargeNFC(b *testing.B) {
	doSingle(b, iterBench, txt_all_bytes)
}
  1088  
// BenchmarkTransformNoChangeNFC times NFC.Transform on smallNoChange.
func BenchmarkTransformNoChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallNoChange)
}
// BenchmarkTransformChangeNFC times NFC.Transform on smallChange.
func BenchmarkTransformChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallChange)
}
// BenchmarkTransformLargeNFC times NFC.Transform on txt_all_bytes, the
// combined multi-language sample text.
func BenchmarkTransformLargeNFC(b *testing.B) {
	doSingle(b, transformBench, txt_all_bytes)
}
  1098  
// BenchmarkNormalizeAsciiNFC runs the appendBenchmarks suite for NFC over the
// ascii text, which doFormBenchmark first passes through NFC.Bytes.
func BenchmarkNormalizeAsciiNFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, ascii)
}
// BenchmarkNormalizeAsciiNFD runs the appendBenchmarks suite for NFD over the
// ascii text, which doFormBenchmark first passes through NFC.Bytes.
func BenchmarkNormalizeAsciiNFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, ascii)
}
// BenchmarkNormalizeAsciiNFKC runs the appendBenchmarks suite for NFKC over
// the ascii text, which doFormBenchmark first passes through NFC.Bytes.
func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
	doFormBenchmark(b, NFC, NFKC, ascii)
}
// BenchmarkNormalizeAsciiNFKD runs the appendBenchmarks suite for NFKD over
// the ascii text, which doFormBenchmark first passes through NFC.Bytes.
func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
	doFormBenchmark(b, NFC, NFKD, ascii)
}
  1111  
// BenchmarkNormalizeNFC2NFC runs the appendBenchmarks suite for NFC over
// txt_all after it has been converted with NFC.Bytes.
func BenchmarkNormalizeNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_all)
}
// BenchmarkNormalizeNFC2NFD runs the appendBenchmarks suite for NFD over
// txt_all after it has been converted with NFC.Bytes.
func BenchmarkNormalizeNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_all)
}
// BenchmarkNormalizeNFD2NFC runs the appendBenchmarks suite for NFC over
// txt_all after it has been converted with NFD.Bytes.
func BenchmarkNormalizeNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_all)
}
// BenchmarkNormalizeNFD2NFD runs the appendBenchmarks suite for NFD over
// txt_all after it has been converted with NFD.Bytes.
func BenchmarkNormalizeNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_all)
}
  1124  
// Hangul is often special-cased, so we test it separately.

// BenchmarkNormalizeHangulNFC2NFC runs the appendBenchmarks suite for NFC
// over the Korean text txt_kr after it has been converted with NFC.Bytes.
func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_kr)
}
// BenchmarkNormalizeHangulNFC2NFD runs the appendBenchmarks suite for NFD
// over the Korean text txt_kr after it has been converted with NFC.Bytes.
func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_kr)
}
// BenchmarkNormalizeHangulNFD2NFC runs the appendBenchmarks suite for NFC
// over the Korean text txt_kr after it has been converted with NFD.Bytes.
func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_kr)
}
// BenchmarkNormalizeHangulNFD2NFD runs the appendBenchmarks suite for NFD
// over the Korean text txt_kr after it has been converted with NFD.Bytes.
func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_kr)
}
  1138  
// forms lists the four normalization forms that doTextBenchmark iterates
// over, in the order NFC, NFD, NFKC, NFKD.
var forms = []Form{NFC, NFD, NFKC, NFKD}
  1140  
  1141  func doTextBenchmark(b *testing.B, s string) {
  1142  	b.StopTimer()
  1143  	in := []byte(s)
  1144  	bm := []func(){}
  1145  	for _, f := range forms {
  1146  		bm = appendBenchmarks(bm, f, in)
  1147  	}
  1148  	b.SetBytes(int64(len(s) * len(bm)))
  1149  	b.StartTimer()
  1150  	for i := 0; i < b.N; i++ {
  1151  		for _, f := range bm {
  1152  			f()
  1153  		}
  1154  	}
  1155  }
  1156  
// BenchmarkCanonicalOrdering runs doTextBenchmark (all four forms) over
// txt_canon.
func BenchmarkCanonicalOrdering(b *testing.B) {
	doTextBenchmark(b, txt_canon)
}
// BenchmarkExtendedLatin runs doTextBenchmark (all four forms) over the
// Vietnamese text txt_vn.
func BenchmarkExtendedLatin(b *testing.B) {
	doTextBenchmark(b, txt_vn)
}
// BenchmarkMiscTwoByteUtf8 runs doTextBenchmark (all four forms) over
// twoByteUtf8.
func BenchmarkMiscTwoByteUtf8(b *testing.B) {
	doTextBenchmark(b, twoByteUtf8)
}
// BenchmarkMiscThreeByteUtf8 runs doTextBenchmark (all four forms) over
// threeByteUtf8.
func BenchmarkMiscThreeByteUtf8(b *testing.B) {
	doTextBenchmark(b, threeByteUtf8)
}
// BenchmarkHangul runs doTextBenchmark (all four forms) over the Korean text
// txt_kr.
func BenchmarkHangul(b *testing.B) {
	doTextBenchmark(b, txt_kr)
}
// BenchmarkJapanese runs doTextBenchmark (all four forms) over the Japanese
// text txt_jp.
func BenchmarkJapanese(b *testing.B) {
	doTextBenchmark(b, txt_jp)
}
// BenchmarkChinese runs doTextBenchmark (all four forms) over the Chinese
// text txt_cn.
func BenchmarkChinese(b *testing.B) {
	doTextBenchmark(b, txt_cn)
}
// BenchmarkOverflow runs doTextBenchmark (all four forms) over overflow, a
// run of several thousand identical combining code points.
func BenchmarkOverflow(b *testing.B) {
	doTextBenchmark(b, overflow)
}
  1181  
  1182  var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"
  1183  
  1184  // Tests sampled from the Canonical ordering tests (Part 2) of
  1185  // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
  1186  const txt_canon = `\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062
  1187  \u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062
  1188  \u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062
  1189  \u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062 
  1190  \u0061\u059A\u0316\u302A\u0339       \u0061\u0341\u0315\u0300\u05AE\u0062
  1191  \u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062
  1192  \u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062
  1193  \u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062
  1194  \u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062
  1195  \u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062
  1196  \u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062
  1197  \u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062
  1198  \u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062
  1199  \u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062
  1200  \u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062
  1201  \u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062
  1202  \u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062
  1203  \u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062`
  1204  
// txt_vn is a Vietnamese sample text; it is the input of
// BenchmarkExtendedLatin and part of txt_all.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả. 
Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ 
nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc 
một giấy phép khác có các điều khoản tương tự như giấy phép này
cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
chúng theo quy định của pháp luật thì tình trạng của nó không 
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`
  1216  
// txt_ru is a Russian sample text; it is part of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
//
// NOTE(review): the last line of the literal is Greek, not Russian. It looks
// like it belongs to txt_gr; confirm before moving it, since doing so changes
// the benchmark input.
const txt_ru = `При обязательном соблюдении следующих условий:
Attribution — Вы должны атрибутировать произведение (указывать
автора и источник) в порядке, предусмотренном автором или
лицензиаром (но только так, чтобы никоим образом не подразумевалось,
что они поддерживают вас или использование вами данного произведения).
Υπό τις ακόλουθες προϋποθέσεις:`
  1224  
// txt_gr is a Greek sample text; it is part of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
(χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
παρόμοια άδεια.`
  1233  
// txt_ar is an Arabic sample text; it is part of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
لهذا الترخيص.`
  1241  
// txt_il is a Hebrew sample text; it is part of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`
  1248  
// twoByteUtf8 concatenates the Russian, Greek, Arabic and Hebrew samples,
// whose letters take two bytes each in UTF-8 (the texts also contain ASCII
// and other punctuation). It is the input of BenchmarkMiscTwoByteUtf8.
const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il
  1250  
// txt_kr is a Korean (Hangul) sample text; it is the input of the Hangul
// benchmarks and part of txt_cjk.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
(Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다). 
동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
라이선스와 동일한 라이선스를 적용해야 합니다.`
  1258  
// txt_th is a Thai sample text; it is the sole component of threeByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`
  1266  
// threeByteUtf8 is the Thai sample, whose letters take three bytes each in
// UTF-8. It is the input of BenchmarkMiscThreeByteUtf8 and part of txt_all.
const threeByteUtf8 = txt_th
  1268  
// txt_jp is a Japanese sample text; it is the input of BenchmarkJapanese and
// part of txt_cjk.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
const txt_jp = `あなたの従うべき条件は以下の通りです。
表示 — あなたは原著作者のクレジットを表示しなければなりません。
継承 — もしあなたがこの作品を改変、変形または加工した場合、
あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
頒布することができます。`
  1275  
// txt_cn is a Chinese sample text; it is the input of BenchmarkChinese and
// part of txt_cjk.
// Taken from http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件:
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`
  1283  
// txt_cjk concatenates the Chinese, Japanese and Korean samples, in that
// order.
const txt_cjk = txt_cn + txt_jp + txt_kr
// txt_all concatenates every natural-language sample: Vietnamese, the
// two-byte scripts, Thai, and the CJK texts. txt_canon is not included.
const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk
  1286  
// txt_all_bytes is txt_all as a byte slice, converted once at package
// initialization; it is the input of the *LargeNFC benchmarks.
var txt_all_bytes = []byte(txt_all)