
     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     5  package utf8_test
     7  import (
     8  	"bytes"
     9  	"testing"
    10  	"unicode"
    11  	. "unicode/utf8"
    12  )
    14  // Validate the constants redefined from unicode.
    15  func init() {
    16  	if MaxRune != unicode.MaxRune {
    17  		panic("utf8.MaxRune is wrong")
    18  	}
    19  	if RuneError != unicode.ReplacementChar {
    20  		panic("utf8.RuneError is wrong")
    21  	}
    22  }
    24  // Validate the constants redefined from unicode.
    25  func TestConstants(t *testing.T) {
    26  	if MaxRune != unicode.MaxRune {
    27  		t.Errorf("utf8.MaxRune is wrong: %x should be %x", MaxRune, unicode.MaxRune)
    28  	}
    29  	if RuneError != unicode.ReplacementChar {
    30  		t.Errorf("utf8.RuneError is wrong: %x should be %x", RuneError, unicode.ReplacementChar)
    31  	}
    32  }
// Utf8Map pairs a rune with the byte sequence the tests in this file
// associate with it.
type Utf8Map struct {
	r   rune   // code point
	str string // raw bytes paired with r (compared against EncodeRune output for utf8map entries)
}
// utf8map lists rune/encoding pairs shared by TestFullRune, TestEncodeRune,
// TestDecodeRune and TestSequencing. The entries cover one- to four-byte
// encodings, including the values on either side of the surrogate range.
var utf8map = []Utf8Map{
	{0x0000, "\x00"},
	{0x0001, "\x01"},
	{0x007e, "\x7e"},
	{0x007f, "\x7f"},
	{0x0080, "\xc2\x80"},
	{0x0081, "\xc2\x81"},
	{0x00bf, "\xc2\xbf"},
	{0x00c0, "\xc3\x80"},
	{0x00c1, "\xc3\x81"},
	{0x00c8, "\xc3\x88"},
	{0x00d0, "\xc3\x90"},
	{0x00e0, "\xc3\xa0"},
	{0x00f0, "\xc3\xb0"},
	{0x00f8, "\xc3\xb8"},
	{0x00ff, "\xc3\xbf"},
	{0x0100, "\xc4\x80"},
	{0x07ff, "\xdf\xbf"},
	{0x0400, "\xd0\x80"},
	{0x0800, "\xe0\xa0\x80"},
	{0x0801, "\xe0\xa0\x81"},
	{0x1000, "\xe1\x80\x80"},
	{0xd000, "\xed\x80\x80"},
	{0xd7ff, "\xed\x9f\xbf"}, // last code point before surrogate half.
	{0xe000, "\xee\x80\x80"}, // first code point after surrogate half.
	{0xfffe, "\xef\xbf\xbe"},
	{0xffff, "\xef\xbf\xbf"},
	{0x10000, "\xf0\x90\x80\x80"},
	{0x10001, "\xf0\x90\x80\x81"},
	{0x40000, "\xf1\x80\x80\x80"},
	{0x10fffe, "\xf4\x8f\xbf\xbe"},
	{0x10ffff, "\xf4\x8f\xbf\xbf"},
	{0xFFFD, "\xef\xbf\xbd"},
}
// surrogateMap holds the first and last surrogate code points together with
// three-byte sequences for them. TestDecodeSurrogateRune expects decoding
// each sequence to yield (RuneError, 1).
var surrogateMap = []Utf8Map{
	{0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
	{0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
}
// testStrings are the sample inputs iterated by TestSequencing and
// TestIntConversion: the empty string, plain ASCII, multi-byte text of
// varying length, and a run of four 0x80 bytes.
var testStrings = []string{
	"",
	"abcd",
	"☺☻☹",
	"日a本b語ç日ð本Ê語þ日¥本¼語i日©",
	"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
	"\x80\x80\x80\x80",
}
    88  func TestFullRune(t *testing.T) {
    89  	for _, m := range utf8map {
    90  		b := []byte(m.str)
    91  		if !FullRune(b) {
    92  			t.Errorf("FullRune(%q) (%U) = false, want true", b, m.r)
    93  		}
    94  		s := m.str
    95  		if !FullRuneInString(s) {
    96  			t.Errorf("FullRuneInString(%q) (%U) = false, want true", s, m.r)
    97  		}
    98  		b1 := b[0 : len(b)-1]
    99  		if FullRune(b1) {
   100  			t.Errorf("FullRune(%q) = true, want false", b1)
   101  		}
   102  		s1 := string(b1)
   103  		if FullRuneInString(s1) {
   104  			t.Errorf("FullRune(%q) = true, want false", s1)
   105  		}
   106  	}
   107  	for _, s := range []string{"\xc0", "\xc1"} {
   108  		b := []byte(s)
   109  		if !FullRune(b) {
   110  			t.Errorf("FullRune(%q) = false, want true", s)
   111  		}
   112  		if !FullRuneInString(s) {
   113  			t.Errorf("FullRuneInString(%q) = false, want true", s)
   114  		}
   115  	}
   116  }
   118  func TestEncodeRune(t *testing.T) {
   119  	for _, m := range utf8map {
   120  		b := []byte(m.str)
   121  		var buf [10]byte
   122  		n := EncodeRune(buf[0:], m.r)
   123  		b1 := buf[0:n]
   124  		if !bytes.Equal(b, b1) {
   125  			t.Errorf("EncodeRune(%#04x) = %q want %q", m.r, b1, b)
   126  		}
   127  	}
   128  }
   130  func TestDecodeRune(t *testing.T) {
   131  	for _, m := range utf8map {
   132  		b := []byte(m.str)
   133  		r, size := DecodeRune(b)
   134  		if r != m.r || size != len(b) {
   135  			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
   136  		}
   137  		s := m.str
   138  		r, size = DecodeRuneInString(s)
   139  		if r != m.r || size != len(b) {
   140  			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
   141  		}
   143  		// there's an extra byte that bytes left behind - make sure trailing byte works
   144  		r, size = DecodeRune(b[0:cap(b)])
   145  		if r != m.r || size != len(b) {
   146  			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
   147  		}
   148  		s = m.str + "\x00"
   149  		r, size = DecodeRuneInString(s)
   150  		if r != m.r || size != len(b) {
   151  			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
   152  		}
   154  		// make sure missing bytes fail
   155  		wantsize := 1
   156  		if wantsize >= len(b) {
   157  			wantsize = 0
   158  		}
   159  		r, size = DecodeRune(b[0 : len(b)-1])
   160  		if r != RuneError || size != wantsize {
   161  			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b[0:len(b)-1], r, size, RuneError, wantsize)
   162  		}
   163  		s = m.str[0 : len(m.str)-1]
   164  		r, size = DecodeRuneInString(s)
   165  		if r != RuneError || size != wantsize {
   166  			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, wantsize)
   167  		}
   169  		// make sure bad sequences fail
   170  		if len(b) == 1 {
   171  			b[0] = 0x80
   172  		} else {
   173  			b[len(b)-1] = 0x7F
   174  		}
   175  		r, size = DecodeRune(b)
   176  		if r != RuneError || size != 1 {
   177  			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, RuneError, 1)
   178  		}
   179  		s = string(b)
   180  		r, size = DecodeRuneInString(s)
   181  		if r != RuneError || size != 1 {
   182  			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, 1)
   183  		}
   185  	}
   186  }
   188  func TestDecodeSurrogateRune(t *testing.T) {
   189  	for _, m := range surrogateMap {
   190  		b := []byte(m.str)
   191  		r, size := DecodeRune(b)
   192  		if r != RuneError || size != 1 {
   193  			t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
   194  		}
   195  		s := m.str
   196  		r, size = DecodeRuneInString(s)
   197  		if r != RuneError || size != 1 {
   198  			t.Errorf("DecodeRuneInString(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
   199  		}
   200  	}
   201  }
   203  // Check that DecodeRune and DecodeLastRune correspond to
   204  // the equivalent range loop.
   205  func TestSequencing(t *testing.T) {
   206  	for _, ts := range testStrings {
   207  		for _, m := range utf8map {
   208  			for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} {
   209  				testSequence(t, s)
   210  			}
   211  		}
   212  	}
   213  }
   215  // Check that a range loop and a []int conversion visit the same runes.
   216  // Not really a test of this package, but the assumption is used here and
   217  // it's good to verify
   218  func TestIntConversion(t *testing.T) {
   219  	for _, ts := range testStrings {
   220  		runes := []rune(ts)
   221  		if RuneCountInString(ts) != len(runes) {
   222  			t.Errorf("%q: expected %d runes; got %d", ts, len(runes), RuneCountInString(ts))
   223  			break
   224  		}
   225  		i := 0
   226  		for _, r := range ts {
   227  			if r != runes[i] {
   228  				t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, i, runes[i], runes[i], r, r)
   229  			}
   230  			i++
   231  		}
   232  	}
   233  }
// invalidSequenceTests lists four-byte strings whose first rune
// TestDecodeInvalidSequence expects every decoder to report as RuneError.
// NOTE(review): the group labels (xx, s1..s7) presumably mirror the
// first-byte classes used by the utf8 implementation; confirm there.
var invalidSequenceTests = []string{
	"\xed\xa0\x80\x80", // surrogate min
	"\xed\xbf\xbf\x80", // surrogate max

	// xx
	"\x91\x80\x80\x80",

	// s1
	"\xC2\x7F\x80\x80",
	"\xC2\xC0\x80\x80",
	"\xDF\x7F\x80\x80",
	"\xDF\xC0\x80\x80",

	// s2
	"\xE0\x9F\xBF\x80",
	"\xE0\xA0\x7F\x80",
	"\xE0\xBF\xC0\x80",
	"\xE0\xC0\x80\x80",

	// s3
	"\xE1\x7F\xBF\x80",
	"\xE1\x80\x7F\x80",
	"\xE1\xBF\xC0\x80",
	"\xE1\xC0\x80\x80",

	// s4
	"\xED\x7F\xBF\x80",
	"\xED\x80\x7F\x80",
	"\xED\x9F\xC0\x80",
	"\xED\xA0\x80\x80",

	// s5
	"\xF0\x8F\xBF\xBF",
	"\xF0\x90\x7F\xBF",
	"\xF0\x90\x80\x7F",
	"\xF0\xBF\xBF\xC0",
	"\xF0\xBF\xC0\x80",
	"\xF0\xC0\x80\x80",

	// s6
	"\xF1\x7F\xBF\xBF",
	"\xF1\x80\x7F\xBF",
	"\xF1\x80\x80\x7F",
	"\xF1\xBF\xBF\xC0",
	"\xF1\xBF\xC0\x80",
	"\xF1\xC0\x80\x80",

	// s7
	"\xF4\x7F\xBF\xBF",
	"\xF4\x80\x7F\xBF",
	"\xF4\x80\x80\x7F",
	"\xF4\x8F\xBF\xC0",
	"\xF4\x8F\xC0\x80",
	"\xF4\x90\x80\x80",
}
   291  func runtimeDecodeRune(s string) rune {
   292  	for _, r := range s {
   293  		return r
   294  	}
   295  	return -1
   296  }
   298  func TestDecodeInvalidSequence(t *testing.T) {
   299  	for _, s := range invalidSequenceTests {
   300  		r1, _ := DecodeRune([]byte(s))
   301  		if want := RuneError; r1 != want {
   302  			t.Errorf("DecodeRune(%#x) = %#04x, want %#04x", s, r1, want)
   303  			return
   304  		}
   305  		r2, _ := DecodeRuneInString(s)
   306  		if want := RuneError; r2 != want {
   307  			t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s, r2, want)
   308  			return
   309  		}
   310  		if r1 != r2 {
   311  			t.Errorf("DecodeRune(%#x) = %#04x mismatch with DecodeRuneInString(%q) = %#04x", s, r1, s, r2)
   312  			return
   313  		}
   314  		r3 := runtimeDecodeRune(s)
   315  		if r2 != r3 {
   316  			t.Errorf("DecodeRuneInString(%q) = %#04x mismatch with runtime.decoderune(%q) = %#04x", s, r2, s, r3)
   317  			return
   318  		}
   319  	}
   320  }
   322  func testSequence(t *testing.T, s string) {
   323  	type info struct {
   324  		index int
   325  		r     rune
   326  	}
   327  	index := make([]info, len(s))
   328  	b := []byte(s)
   329  	si := 0
   330  	j := 0
   331  	for i, r := range s {
   332  		if si != i {
   333  			t.Errorf("Sequence(%q) mismatched index %d, want %d", s, si, i)
   334  			return
   335  		}
   336  		index[j] = info{i, r}
   337  		j++
   338  		r1, size1 := DecodeRune(b[i:])
   339  		if r != r1 {
   340  			t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], r1, r)
   341  			return
   342  		}
   343  		r2, size2 := DecodeRuneInString(s[i:])
   344  		if r != r2 {
   345  			t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], r2, r)
   346  			return
   347  		}
   348  		if size1 != size2 {
   349  			t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], size1, size2)
   350  			return
   351  		}
   352  		si += size1
   353  	}
   354  	j--
   355  	for si = len(s); si > 0; {
   356  		r1, size1 := DecodeLastRune(b[0:si])
   357  		r2, size2 := DecodeLastRuneInString(s[0:si])
   358  		if size1 != size2 {
   359  			t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, si, size1, size2)
   360  			return
   361  		}
   362  		if r1 != index[j].r {
   363  			t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, r1, index[j].r)
   364  			return
   365  		}
   366  		if r2 != index[j].r {
   367  			t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, si, r2, index[j].r)
   368  			return
   369  		}
   370  		si -= size1
   371  		if si != index[j].index {
   372  			t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, si, index[j].index)
   373  			return
   374  		}
   375  		j--
   376  	}
   377  	if si != 0 {
   378  		t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, si)
   379  	}
   380  }
   382  // Check that negative runes encode as U+FFFD.
   383  func TestNegativeRune(t *testing.T) {
   384  	errorbuf := make([]byte, UTFMax)
   385  	errorbuf = errorbuf[0:EncodeRune(errorbuf, RuneError)]
   386  	buf := make([]byte, UTFMax)
   387  	buf = buf[0:EncodeRune(buf, -1)]
   388  	if !bytes.Equal(buf, errorbuf) {
   389  		t.Errorf("incorrect encoding [% x] for -1; expected [% x]", buf, errorbuf)
   390  	}
   391  }
   393  type RuneCountTest struct {
   394  	in  string
   395  	out int
   396  }
   398  var runecounttests = []RuneCountTest{
   399  	{"abcd", 4},
   400  	{"☺☻☹", 3},
   401  	{"1,2,3,4", 7},
   402  	{"\xe2\x00", 2},
   403  	{"\xe2\x80", 2},
   404  	{"a\xe2\x80", 3},
   405  }
   407  func TestRuneCount(t *testing.T) {
   408  	for _, tt := range runecounttests {
   409  		if out := RuneCountInString(; out != tt.out {
   410  			t.Errorf("RuneCountInString(%q) = %d, want %d",, out, tt.out)
   411  		}
   412  		if out := RuneCount([]byte(; out != tt.out {
   413  			t.Errorf("RuneCount(%q) = %d, want %d",, out, tt.out)
   414  		}
   415  	}
   416  }
   418  type RuneLenTest struct {
   419  	r    rune
   420  	size int
   421  }
   423  var runelentests = []RuneLenTest{
   424  	{0, 1},
   425  	{'e', 1},
   426  	{'é', 2},
   427  	{'☺', 3},
   428  	{RuneError, 3},
   429  	{MaxRune, 4},
   430  	{0xD800, -1},
   431  	{0xDFFF, -1},
   432  	{MaxRune + 1, -1},
   433  	{-1, -1},
   434  }
   436  func TestRuneLen(t *testing.T) {
   437  	for _, tt := range runelentests {
   438  		if size := RuneLen(tt.r); size != tt.size {
   439  			t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, size, tt.size)
   440  		}
   441  	}
   442  }
   444  type ValidTest struct {
   445  	in  string
   446  	out bool
   447  }
   449  var validTests = []ValidTest{
   450  	{"", true},
   451  	{"a", true},
   452  	{"abc", true},
   453  	{"Ж", true},
   454  	{"ЖЖ", true},
   455  	{"брэд-ЛГТМ", true},
   456  	{"☺☻☹", true},
   457  	{"aa\xe2", false},
   458  	{string([]byte{66, 250}), false},
   459  	{string([]byte{66, 250, 67}), false},
   460  	{"a\uFFFDb", true},
   461  	{string("\xF4\x8F\xBF\xBF"), true},      // U+10FFFF
   462  	{string("\xF4\x90\x80\x80"), false},     // U+10FFFF+1; out of range
   463  	{string("\xF7\xBF\xBF\xBF"), false},     // 0x1FFFFF; out of range
   464  	{string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range
   465  	{string("\xc0\x80"), false},             // U+0000 encoded in two bytes: incorrect
   466  	{string("\xed\xa0\x80"), false},         // U+D800 high surrogate (sic)
   467  	{string("\xed\xbf\xbf"), false},         // U+DFFF low surrogate (sic)
   468  }
   470  func TestValid(t *testing.T) {
   471  	for _, tt := range validTests {
   472  		if Valid([]byte( != tt.out {
   473  			t.Errorf("Valid(%q) = %v; want %v",, !tt.out, tt.out)
   474  		}
   475  		if ValidString( != tt.out {
   476  			t.Errorf("ValidString(%q) = %v; want %v",, !tt.out, tt.out)
   477  		}
   478  	}
   479  }
   481  type ValidRuneTest struct {
   482  	r  rune
   483  	ok bool
   484  }
   486  var validrunetests = []ValidRuneTest{
   487  	{0, true},
   488  	{'e', true},
   489  	{'é', true},
   490  	{'☺', true},
   491  	{RuneError, true},
   492  	{MaxRune, true},
   493  	{0xD7FF, true},
   494  	{0xD800, false},
   495  	{0xDFFF, false},
   496  	{0xE000, true},
   497  	{MaxRune + 1, false},
   498  	{-1, false},
   499  }
   501  func TestValidRune(t *testing.T) {
   502  	for _, tt := range validrunetests {
   503  		if ok := ValidRune(tt.r); ok != tt.ok {
   504  			t.Errorf("ValidRune(%#U) = %t, want %t", tt.r, ok, tt.ok)
   505  		}
   506  	}
   507  }
   509  func BenchmarkRuneCountTenASCIIChars(b *testing.B) {
   510  	s := []byte("0123456789")
   511  	for i := 0; i < b.N; i++ {
   512  		RuneCount(s)
   513  	}
   514  }
   516  func BenchmarkRuneCountTenJapaneseChars(b *testing.B) {
   517  	s := []byte("日本語日本語日本語日")
   518  	for i := 0; i < b.N; i++ {
   519  		RuneCount(s)
   520  	}
   521  }
   523  func BenchmarkRuneCountInStringTenASCIIChars(b *testing.B) {
   524  	for i := 0; i < b.N; i++ {
   525  		RuneCountInString("0123456789")
   526  	}
   527  }
   529  func BenchmarkRuneCountInStringTenJapaneseChars(b *testing.B) {
   530  	for i := 0; i < b.N; i++ {
   531  		RuneCountInString("日本語日本語日本語日")
   532  	}
   533  }
   535  func BenchmarkValidTenASCIIChars(b *testing.B) {
   536  	s := []byte("0123456789")
   537  	for i := 0; i < b.N; i++ {
   538  		Valid(s)
   539  	}
   540  }
   542  func BenchmarkValidTenJapaneseChars(b *testing.B) {
   543  	s := []byte("日本語日本語日本語日")
   544  	for i := 0; i < b.N; i++ {
   545  		Valid(s)
   546  	}
   547  }
   549  func BenchmarkValidStringTenASCIIChars(b *testing.B) {
   550  	for i := 0; i < b.N; i++ {
   551  		ValidString("0123456789")
   552  	}
   553  }
   555  func BenchmarkValidStringTenJapaneseChars(b *testing.B) {
   556  	for i := 0; i < b.N; i++ {
   557  		ValidString("日本語日本語日本語日")
   558  	}
   559  }
   561  func BenchmarkEncodeASCIIRune(b *testing.B) {
   562  	buf := make([]byte, UTFMax)
   563  	for i := 0; i < b.N; i++ {
   564  		EncodeRune(buf, 'a')
   565  	}
   566  }
   568  func BenchmarkEncodeJapaneseRune(b *testing.B) {
   569  	buf := make([]byte, UTFMax)
   570  	for i := 0; i < b.N; i++ {
   571  		EncodeRune(buf, '本')
   572  	}
   573  }
   575  func BenchmarkDecodeASCIIRune(b *testing.B) {
   576  	a := []byte{'a'}
   577  	for i := 0; i < b.N; i++ {
   578  		DecodeRune(a)
   579  	}
   580  }
   582  func BenchmarkDecodeJapaneseRune(b *testing.B) {
   583  	nihon := []byte("本")
   584  	for i := 0; i < b.N; i++ {
   585  		DecodeRune(nihon)
   586  	}
   587  }
   589  func BenchmarkFullASCIIRune(b *testing.B) {
   590  	a := []byte{'a'}
   591  	for i := 0; i < b.N; i++ {
   592  		FullRune(a)
   593  	}
   594  }
   596  func BenchmarkFullJapaneseRune(b *testing.B) {
   597  	nihon := []byte("本")
   598  	for i := 0; i < b.N; i++ {
   599  		FullRune(nihon)
   600  	}
   601  }