github.com/primecitizens/pcz/std@v0.2.1/text/unicode/utf8/utf8_test.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  //
     4  // Copyright 2009 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  package utf8_test
     9  
    10  import (
    11  	"bytes"
    12  	"strings"
    13  	"testing"
    14  
    15  	stdstring "github.com/primecitizens/pcz/std/builtin/string"
    16  	. "github.com/primecitizens/pcz/std/text/unicode/common"
    17  	. "github.com/primecitizens/pcz/std/text/unicode/utf8"
    18  )
    19  
    20  type Utf8Map struct {
    21  	r   rune
    22  	str string
    23  }
    24  
    25  var utf8map = []Utf8Map{
    26  	{0x0000, "\x00"},
    27  	{0x0001, "\x01"},
    28  	{0x007e, "\x7e"},
    29  	{0x007f, "\x7f"},
    30  	{0x0080, "\xc2\x80"},
    31  	{0x0081, "\xc2\x81"},
    32  	{0x00bf, "\xc2\xbf"},
    33  	{0x00c0, "\xc3\x80"},
    34  	{0x00c1, "\xc3\x81"},
    35  	{0x00c8, "\xc3\x88"},
    36  	{0x00d0, "\xc3\x90"},
    37  	{0x00e0, "\xc3\xa0"},
    38  	{0x00f0, "\xc3\xb0"},
    39  	{0x00f8, "\xc3\xb8"},
    40  	{0x00ff, "\xc3\xbf"},
    41  	{0x0100, "\xc4\x80"},
    42  	{0x07ff, "\xdf\xbf"},
    43  	{0x0400, "\xd0\x80"},
    44  	{0x0800, "\xe0\xa0\x80"},
    45  	{0x0801, "\xe0\xa0\x81"},
    46  	{0x1000, "\xe1\x80\x80"},
    47  	{0xd000, "\xed\x80\x80"},
    48  	{0xd7ff, "\xed\x9f\xbf"}, // last code point before surrogate half.
    49  	{0xe000, "\xee\x80\x80"}, // first code point after surrogate half.
    50  	{0xfffe, "\xef\xbf\xbe"},
    51  	{0xffff, "\xef\xbf\xbf"},
    52  	{0x10000, "\xf0\x90\x80\x80"},
    53  	{0x10001, "\xf0\x90\x80\x81"},
    54  	{0x40000, "\xf1\x80\x80\x80"},
    55  	{0x10fffe, "\xf4\x8f\xbf\xbe"},
    56  	{0x10ffff, "\xf4\x8f\xbf\xbf"},
    57  	{0xFFFD, "\xef\xbf\xbd"},
    58  }
    59  
    60  var surrogateMap = []Utf8Map{
    61  	{0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
    62  	{0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
    63  }
    64  
    65  var testStrings = []string{
    66  	"",
    67  	"abcd",
    68  	"☺☻☹",
    69  	"日a本b語ç日ð本Ê語þ日¥本¼語i日©",
    70  	"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
    71  	"\x80\x80\x80\x80",
    72  }
    73  
    74  func TestFullRune(t *testing.T) {
    75  	for _, m := range utf8map {
    76  		b := []byte(m.str)
    77  		s := m.str
    78  		if !FullRune(s) {
    79  			t.Errorf("FullRuneInString(%q) (%U) = false, want true", s, m.r)
    80  		}
    81  		b1 := b[0 : len(b)-1]
    82  		s1 := string(b1)
    83  		if FullRune(s1) {
    84  			t.Errorf("FullRune(%q) = true, want false", s1)
    85  		}
    86  	}
    87  	for _, s := range []string{"\xc0", "\xc1"} {
    88  		if !FullRune(s) {
    89  			t.Errorf("FullRuneInString(%q) = false, want true", s)
    90  		}
    91  	}
    92  }
    93  
    94  func TestEncodeRune(t *testing.T) {
    95  	for _, m := range utf8map {
    96  		b := []byte(m.str)
    97  		var buf [10]byte
    98  		b1, _ := EncodeRune(buf[0:0], m.r)
    99  		if !bytes.Equal(b, b1) {
   100  			t.Errorf("EncodeRune(%#04x) = %q want %q", m.r, b1, b)
   101  		}
   102  	}
   103  }
   104  
   105  func TestAppendRune(t *testing.T) {
   106  	for _, m := range utf8map {
   107  		if buf := AppendRunes(nil, m.r); string(buf) != m.str {
   108  			t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, m.str)
   109  		}
   110  		if buf := AppendRunes([]byte("init"), m.r); string(buf) != "init"+m.str {
   111  			t.Errorf("AppendRune(init, %#04x) = %s, want %s", m.r, buf, "init"+m.str)
   112  		}
   113  	}
   114  }
   115  
   116  func TestDecodeRune(t *testing.T) {
   117  	for _, m := range utf8map {
   118  		b := []byte(m.str)
   119  		s := m.str
   120  		r, size := First(s)
   121  		if r != m.r || size != len(b) {
   122  			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
   123  		}
   124  
   125  		s = m.str + "\x00"
   126  		r, size = First(s)
   127  		if r != m.r || size != len(b) {
   128  			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
   129  		}
   130  
   131  		// make sure missing bytes fail
   132  		wantsize := 1
   133  		if wantsize >= len(b) {
   134  			wantsize = 0
   135  		}
   136  		s = m.str[0 : len(m.str)-1]
   137  		r, size = First(s)
   138  		if r != RuneError || size != wantsize {
   139  			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, wantsize)
   140  		}
   141  
   142  		// make sure bad sequences fail
   143  		if len(b) == 1 {
   144  			b[0] = 0x80
   145  		} else {
   146  			b[len(b)-1] = 0x7F
   147  		}
   148  		s = string(b)
   149  		r, size = First(s)
   150  		if r != RuneError || size != 1 {
   151  			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, 1)
   152  		}
   153  
   154  	}
   155  }
   156  
   157  func TestDecodeSurrogateRune(t *testing.T) {
   158  	for _, m := range surrogateMap {
   159  		b := []byte(m.str)
   160  		s := m.str
   161  		r, size := First(s)
   162  		if r != RuneError || size != 1 {
   163  			t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
   164  		}
   165  	}
   166  }
   167  
   168  // Check that DecodeRune and DecodeLastRune correspond to
   169  // the equivalent range loop.
   170  func TestSequencing(t *testing.T) {
   171  	for _, ts := range testStrings {
   172  		for _, m := range utf8map {
   173  			for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} {
   174  				testSequence(t, s)
   175  			}
   176  		}
   177  	}
   178  }
   179  
   180  func runtimeRuneCount(s string) int {
   181  	return len([]rune(s)) // Replaced by gc with call to runtime.countrunes(s).
   182  }
   183  
   184  // Check that a range loop, len([]rune(string)) optimization and
   185  // []rune conversions visit the same runes.
   186  // Not really a test of this package, but the assumption is used here and
   187  // it's good to verify.
   188  func TestRuntimeConversion(t *testing.T) {
   189  	for _, ts := range testStrings {
   190  		count := Count(ts)
   191  		if n := runtimeRuneCount(ts); n != count {
   192  			t.Errorf("%q: len([]rune()) counted %d runes; got %d from RuneCountInString", ts, n, count)
   193  			break
   194  		}
   195  
   196  		runes := []rune(ts)
   197  		if n := len(runes); n != count {
   198  			t.Errorf("%q: []rune() has length %d; got %d from RuneCountInString", ts, n, count)
   199  			break
   200  		}
   201  		i := 0
   202  		for _, r := range ts {
   203  			if r != runes[i] {
   204  				t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, i, runes[i], runes[i], r, r)
   205  			}
   206  			i++
   207  		}
   208  	}
   209  }
   210  
   211  var invalidSequenceTests = []string{
   212  	"\xed\xa0\x80\x80", // surrogate min
   213  	"\xed\xbf\xbf\x80", // surrogate max
   214  
   215  	// xx
   216  	"\x91\x80\x80\x80",
   217  
   218  	// s1
   219  	"\xC2\x7F\x80\x80",
   220  	"\xC2\xC0\x80\x80",
   221  	"\xDF\x7F\x80\x80",
   222  	"\xDF\xC0\x80\x80",
   223  
   224  	// s2
   225  	"\xE0\x9F\xBF\x80",
   226  	"\xE0\xA0\x7F\x80",
   227  	"\xE0\xBF\xC0\x80",
   228  	"\xE0\xC0\x80\x80",
   229  
   230  	// s3
   231  	"\xE1\x7F\xBF\x80",
   232  	"\xE1\x80\x7F\x80",
   233  	"\xE1\xBF\xC0\x80",
   234  	"\xE1\xC0\x80\x80",
   235  
   236  	//s4
   237  	"\xED\x7F\xBF\x80",
   238  	"\xED\x80\x7F\x80",
   239  	"\xED\x9F\xC0\x80",
   240  	"\xED\xA0\x80\x80",
   241  
   242  	// s5
   243  	"\xF0\x8F\xBF\xBF",
   244  	"\xF0\x90\x7F\xBF",
   245  	"\xF0\x90\x80\x7F",
   246  	"\xF0\xBF\xBF\xC0",
   247  	"\xF0\xBF\xC0\x80",
   248  	"\xF0\xC0\x80\x80",
   249  
   250  	// s6
   251  	"\xF1\x7F\xBF\xBF",
   252  	"\xF1\x80\x7F\xBF",
   253  	"\xF1\x80\x80\x7F",
   254  	"\xF1\xBF\xBF\xC0",
   255  	"\xF1\xBF\xC0\x80",
   256  	"\xF1\xC0\x80\x80",
   257  
   258  	// s7
   259  	"\xF4\x7F\xBF\xBF",
   260  	"\xF4\x80\x7F\xBF",
   261  	"\xF4\x80\x80\x7F",
   262  	"\xF4\x8F\xBF\xC0",
   263  	"\xF4\x8F\xC0\x80",
   264  	"\xF4\x90\x80\x80",
   265  }
   266  
   267  func runtimeDecodeRune(s string) rune {
   268  	for _, r := range s {
   269  		return r
   270  	}
   271  	return -1
   272  }
   273  
   274  func TestDecodeInvalidSequence(t *testing.T) {
   275  	for _, s := range invalidSequenceTests {
   276  		r2, _ := First(s)
   277  		if want := RuneError; r2 != want {
   278  			t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s, r2, want)
   279  			return
   280  		}
   281  
   282  		r3 := runtimeDecodeRune(s)
   283  		if r2 != r3 {
   284  			t.Errorf("DecodeRune(%q) = %#04x mismatch with runtime.decoderune(%q) = %#04x", s, r2, s, r3)
   285  			return
   286  		}
   287  	}
   288  }
   289  
   290  func testSequence(t *testing.T, s string) {
   291  	type info struct {
   292  		index int
   293  		r     rune
   294  	}
   295  	index := make([]info, len(s))
   296  	si := 0
   297  	j := 0
   298  	for i, r := range s {
   299  		if si != i {
   300  			t.Errorf("Sequence(%q) mismatched index %d, want %d", s, si, i)
   301  			return
   302  		}
   303  		index[j] = info{i, r}
   304  		j++
   305  		r2, size2 := First(s[i:])
   306  		if r != r2 {
   307  			t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], r2, r)
   308  			return
   309  		}
   310  		si += size2
   311  	}
   312  	j--
   313  	for si = len(s); si > 0; {
   314  		r2, size2 := Last(s[0:si])
   315  		if r2 != index[j].r {
   316  			t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, r2, index[j].r)
   317  			return
   318  		}
   319  		si -= size2
   320  		if si != index[j].index {
   321  			t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, si, index[j].index)
   322  			return
   323  		}
   324  		j--
   325  	}
   326  	if si != 0 {
   327  		t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, si)
   328  	}
   329  }
   330  
   331  // Check that negative runes encode as U+FFFD.
   332  func TestNegativeRune(t *testing.T) {
   333  	errorbuf := make([]byte, 0, MaxRuneLen)
   334  	errorbuf, _ = EncodeRune(errorbuf, RuneError)
   335  	buf := make([]byte, 0, MaxRuneLen)
   336  	buf, _ = EncodeRune(buf, -1)
   337  	if !bytes.Equal(buf, errorbuf) {
   338  		t.Errorf("incorrect encoding [% x] for -1; expected [% x]", buf, errorbuf)
   339  	}
   340  }
   341  
   342  type RuneCountTest struct {
   343  	in  string
   344  	out int
   345  }
   346  
   347  var runecounttests = []RuneCountTest{
   348  	{"abcd", 4},
   349  	{"☺☻☹", 3},
   350  	{"1,2,3,4", 7},
   351  	{"\xe2\x00", 2},
   352  	{"\xe2\x80", 2},
   353  	{"a\xe2\x80", 3},
   354  }
   355  
   356  func TestRuneCount(t *testing.T) {
   357  	for _, tt := range runecounttests {
   358  		if out := Count(tt.in); out != tt.out {
   359  			t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out)
   360  		}
   361  	}
   362  }
   363  
   364  type RuneLenTest struct {
   365  	r    rune
   366  	size int
   367  }
   368  
   369  var runelentests = []RuneLenTest{
   370  	{0, 1},
   371  	{'e', 1},
   372  	{'é', 2},
   373  	{'☺', 3},
   374  	{RuneError, 3},
   375  	{MaxRune, 4},
   376  	{0xD800, -1},
   377  	{0xDFFF, -1},
   378  	{MaxRune + 1, -1},
   379  	{-1, -1},
   380  }
   381  
   382  func TestRuneLen(t *testing.T) {
   383  	for _, tt := range runelentests {
   384  		if size := RuneLen(tt.r); size != tt.size {
   385  			t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, size, tt.size)
   386  		}
   387  	}
   388  }
   389  
   390  type ValidTest struct {
   391  	in  string
   392  	out bool
   393  }
   394  
   395  var validTests = []ValidTest{
   396  	{"", true},
   397  	{"a", true},
   398  	{"abc", true},
   399  	{"Ж", true},
   400  	{"ЖЖ", true},
   401  	{"брэд-ЛГТМ", true},
   402  	{"☺☻☹", true},
   403  	{"aa\xe2", false},
   404  	{string([]byte{66, 250}), false},
   405  	{string([]byte{66, 250, 67}), false},
   406  	{"a\uFFFDb", true},
   407  	{string("\xF4\x8F\xBF\xBF"), true},      // U+10FFFF
   408  	{string("\xF4\x90\x80\x80"), false},     // U+10FFFF+1; out of range
   409  	{string("\xF7\xBF\xBF\xBF"), false},     // 0x1FFFFF; out of range
   410  	{string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range
   411  	{string("\xc0\x80"), false},             // U+0000 encoded in two bytes: incorrect
   412  	{string("\xed\xa0\x80"), false},         // U+D800 high surrogate (sic)
   413  	{string("\xed\xbf\xbf"), false},         // U+DFFF low surrogate (sic)
   414  }
   415  
   416  func TestValid(t *testing.T) {
   417  	for _, tt := range validTests {
   418  		if Valid(tt.in) != tt.out {
   419  			t.Errorf("ValidString(%q) = %v; want %v", tt.in, !tt.out, tt.out)
   420  		}
   421  	}
   422  }
   423  
   424  type ValidRuneTest struct {
   425  	r  rune
   426  	ok bool
   427  }
   428  
   429  var validrunetests = []ValidRuneTest{
   430  	{0, true},
   431  	{'e', true},
   432  	{'é', true},
   433  	{'☺', true},
   434  	{RuneError, true},
   435  	{MaxRune, true},
   436  	{0xD7FF, true},
   437  	{0xD800, false},
   438  	{0xDFFF, false},
   439  	{0xE000, true},
   440  	{MaxRune + 1, false},
   441  	{-1, false},
   442  }
   443  
   444  func TestValidRune(t *testing.T) {
   445  	for _, tt := range validrunetests {
   446  		if ok := RuneValid(tt.r); ok != tt.ok {
   447  			t.Errorf("ValidRune(%#U) = %t, want %t", tt.r, ok, tt.ok)
   448  		}
   449  	}
   450  }
   451  
   452  func BenchmarkRuneCountTenASCIIChars(b *testing.B) {
   453  	for i := 0; i < b.N; i++ {
   454  		Count("0123456789")
   455  	}
   456  }
   457  
   458  func BenchmarkRuneCountTenJapaneseChars(b *testing.B) {
   459  	for i := 0; i < b.N; i++ {
   460  		Count("日本語日本語日本語日")
   461  	}
   462  }
   463  
   464  var ascii100000 = strings.Repeat("0123456789", 10000)
   465  
   466  func BenchmarkValidStringTenASCIIChars(b *testing.B) {
   467  	for i := 0; i < b.N; i++ {
   468  		Valid("0123456789")
   469  	}
   470  }
   471  
   472  func BenchmarkValidString100KASCIIChars(b *testing.B) {
   473  	for i := 0; i < b.N; i++ {
   474  		Valid(ascii100000)
   475  	}
   476  }
   477  
   478  func BenchmarkValidStringTenJapaneseChars(b *testing.B) {
   479  	for i := 0; i < b.N; i++ {
   480  		Valid("日本語日本語日本語日")
   481  	}
   482  }
   483  
   484  func BenchmarkValidStringLongMostlyASCII(b *testing.B) {
   485  	for i := 0; i < b.N; i++ {
   486  		Valid(longStringMostlyASCII)
   487  	}
   488  }
   489  
   490  func BenchmarkValidStringLongJapanese(b *testing.B) {
   491  	for i := 0; i < b.N; i++ {
   492  		Valid(longStringJapanese)
   493  	}
   494  }
   495  
   496  var longStringMostlyASCII string // ~100KB, ~97% ASCII
   497  var longStringJapanese string    // ~100KB, non-ASCII
   498  
   499  func init() {
   500  	const japanese = "日本語日本語日本語日"
   501  	var b strings.Builder
   502  	for i := 0; b.Len() < 100_000; i++ {
   503  		if i%100 == 0 {
   504  			b.WriteString(japanese)
   505  		} else {
   506  			b.WriteString("0123456789")
   507  		}
   508  	}
   509  	longStringMostlyASCII = b.String()
   510  	longStringJapanese = strings.Repeat(japanese, 100_000/len(japanese))
   511  }
   512  
   513  func BenchmarkEncodeASCIIRune(b *testing.B) {
   514  	buf := make([]byte, 0, MaxRuneLen)
   515  	for i := 0; i < b.N; i++ {
   516  		EncodeRune(buf, 'a')
   517  	}
   518  }
   519  
   520  func BenchmarkEncodeJapaneseRune(b *testing.B) {
   521  	buf := make([]byte, 0, MaxRuneLen)
   522  	for i := 0; i < b.N; i++ {
   523  		EncodeRune(buf, '本')
   524  	}
   525  }
   526  
   527  func BenchmarkAppendASCIIRune(b *testing.B) {
   528  	buf := make([]byte, MaxRuneLen)
   529  	for i := 0; i < b.N; i++ {
   530  		AppendRunes(buf[:0], 'a')
   531  	}
   532  }
   533  
   534  func BenchmarkAppendJapaneseRune(b *testing.B) {
   535  	buf := make([]byte, MaxRuneLen)
   536  	for i := 0; i < b.N; i++ {
   537  		AppendRunes(buf[:0], '本')
   538  	}
   539  }
   540  
   541  func BenchmarkDecodeASCIIRune(b *testing.B) {
   542  	a := "a"
   543  	for i := 0; i < b.N; i++ {
   544  		First(a)
   545  	}
   546  }
   547  
   548  func BenchmarkDecodeJapaneseRune(b *testing.B) {
   549  	nihon := "本"
   550  	for i := 0; i < b.N; i++ {
   551  		First(nihon)
   552  	}
   553  }
   554  
   555  // boolSink is used to reference the return value of benchmarked
   556  // functions to avoid dead code elimination.
   557  var boolSink bool
   558  
   559  func BenchmarkFullRune(b *testing.B) {
   560  	benchmarks := []struct {
   561  		name string
   562  		data []byte
   563  	}{
   564  		{"ASCII", []byte("a")},
   565  		{"Incomplete", []byte("\xf0\x90\x80")},
   566  		{"Japanese", []byte("本")},
   567  	}
   568  	for _, bm := range benchmarks {
   569  		b.Run(bm.name, func(b *testing.B) {
   570  			for i := 0; i < b.N; i++ {
   571  				boolSink = FullRune(stdstring.FromBytes(bm.data))
   572  			}
   573  		})
   574  	}
   575  }