github.com/btwiuse/jiri@v0.0.0-20191125065820-53353bcfef54/textutil/utf8_test.go (about)

     1  // Copyright 2015 The Vanadium Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package textutil
     6  
     7  import (
     8  	"reflect"
     9  	"strings"
    10  	"testing"
    11  )
    12  
    13  func TestUTF8ChunkDecoder(t *testing.T) {
    14  	r2 := "Δ"
    15  	r3 := "王"
    16  	r4 := "\U0001F680"
    17  	tests := []struct {
    18  		Text  string
    19  		Write []rune
    20  		Flush []rune
    21  	}{
    22  		{"", nil, nil},
    23  		{"a", []rune{'a'}, nil},
    24  		{"abc", []rune{'a', 'b', 'c'}, nil},
    25  		{"abc def ghi", []rune{'a', 'b', 'c', ' ', 'd', 'e', 'f', ' ', 'g', 'h', 'i'}, nil},
    26  		// 2-byte runes.
    27  		{"ΔΘΠΣΦ", []rune{'Δ', 'Θ', 'Π', 'Σ', 'Φ'}, nil},
    28  		// 3-byte runes.
    29  		{"王普澤世界", []rune{'王', '普', '澤', '世', '界'}, nil},
    30  		// 4-byte runes.
    31  		{"\U0001F680\U0001F681\U0001F682\U0001F683", []rune{'\U0001F680', '\U0001F681', '\U0001F682', '\U0001F683'}, nil},
    32  		// Mixed-bytes.
    33  		{"aΔ王\U0001F680普Θb", []rune{'a', 'Δ', '王', '\U0001F680', '普', 'Θ', 'b'}, nil},
    34  		// Error runes translated to U+FFFD.
    35  		{"\uFFFD", []rune{'\uFFFD'}, nil},
    36  		{"a\uFFFDb", []rune{'a', '\uFFFD', 'b'}, nil},
    37  		{"\x80", []rune{'\uFFFD'}, nil},
    38  		{"\xFF", []rune{'\uFFFD'}, nil},
    39  		{"a\x80b", []rune{'a', '\uFFFD', 'b'}, nil},
    40  		{"a\xFFb", []rune{'a', '\uFFFD', 'b'}, nil},
    41  		// Multi-byte full runes.
    42  		{r2, []rune{[]rune(r2)[0]}, nil},
    43  		{r3, []rune{[]rune(r3)[0]}, nil},
    44  		{r4, []rune{[]rune(r4)[0]}, nil},
    45  		// Partial runes translated to one or more U+FFFD.  Since each case is a
    46  		// multi-byte encoding that's missing one or more bytes, the FFFD bytes are
    47  		// all returned in Flush rather than Write.
    48  		{r2[:1], nil, []rune{'\uFFFD'}},
    49  		{r3[:1], nil, []rune{'\uFFFD'}},
    50  		{r3[:2], nil, []rune{'\uFFFD', '\uFFFD'}},
    51  		{r4[:1], nil, []rune{'\uFFFD'}},
    52  		{r4[:2], nil, []rune{'\uFFFD', '\uFFFD'}},
    53  		{r4[:3], nil, []rune{'\uFFFD', '\uFFFD', '\uFFFD'}},
    54  		// Trailing partial runes translated to U+FFFD.  Similar to above, the FFFD
    55  		// bytes are all returned in Flush rather than Write
    56  		{"a" + r2[:1], []rune{'a'}, []rune{'\uFFFD'}},
    57  		{"a" + r3[:1], []rune{'a'}, []rune{'\uFFFD'}},
    58  		{"a" + r3[:2], []rune{'a'}, []rune{'\uFFFD', '\uFFFD'}},
    59  		{"a" + r4[:1], []rune{'a'}, []rune{'\uFFFD'}},
    60  		{"a" + r4[:2], []rune{'a'}, []rune{'\uFFFD', '\uFFFD'}},
    61  		{"a" + r4[:3], []rune{'a'}, []rune{'\uFFFD', '\uFFFD', '\uFFFD'}},
    62  		// Leading partial runes translated to U+FFFD.  The "b" suffix causes us to
    63  		// discover that the encoding is invalid during Write.
    64  		{r2[:1] + "b", []rune{'\uFFFD', 'b'}, nil},
    65  		{r3[:1] + "b", []rune{'\uFFFD', 'b'}, nil},
    66  		{r3[:2] + "b", []rune{'\uFFFD', '\uFFFD', 'b'}, nil},
    67  		{r4[:1] + "b", []rune{'\uFFFD', 'b'}, nil},
    68  		{r4[:2] + "b", []rune{'\uFFFD', '\uFFFD', 'b'}, nil},
    69  		{r4[:3] + "b", []rune{'\uFFFD', '\uFFFD', '\uFFFD', 'b'}, nil},
    70  		// Bracketed partial runes translated to U+FFFD.
    71  		{"a" + r2[:1] + "b", []rune{'a', '\uFFFD', 'b'}, nil},
    72  		{"a" + r3[:1] + "b", []rune{'a', '\uFFFD', 'b'}, nil},
    73  		{"a" + r3[:2] + "b", []rune{'a', '\uFFFD', '\uFFFD', 'b'}, nil},
    74  		{"a" + r4[:1] + "b", []rune{'a', '\uFFFD', 'b'}, nil},
    75  		{"a" + r4[:2] + "b", []rune{'a', '\uFFFD', '\uFFFD', 'b'}, nil},
    76  		{"a" + r4[:3] + "b", []rune{'a', '\uFFFD', '\uFFFD', '\uFFFD', 'b'}, nil},
    77  	}
    78  	for _, test := range tests {
    79  		// Run with a variety of chunk sizes.
    80  		for _, sizes := range [][]int{nil, {1}, {2}, {1, 2}, {2, 1}, {3}, {1, 2, 3}} {
    81  			dec := new(UTF8ChunkDecoder)
    82  			if got, want := writeRuneChunk(t, dec, test.Text, sizes), test.Write; !reflect.DeepEqual(got, want) {
    83  				t.Errorf("%q write got %v, want %v", test.Text, got, want)
    84  			}
    85  			if got, want := flushRuneChunk(t, dec, test.Text), test.Flush; !reflect.DeepEqual(got, want) {
    86  				t.Errorf("%q flush got %v, want %v", test.Text, got, want)
    87  			}
    88  		}
    89  	}
    90  }
    91  
    92  func writeRuneChunk(t testing.TB, dec RuneChunkDecoder, text string, sizes []int) []rune {
    93  	var runes []rune
    94  	addRune := func(r rune) error {
    95  		runes = append(runes, r)
    96  		return nil
    97  	}
    98  	// Write chunks of different sizes until we've exhausted the input text.
    99  	remain := []byte(text)
   100  	for ix := 0; len(remain) > 0; ix++ {
   101  		var chunk []byte
   102  		chunk, remain = nextChunk(remain, sizes, ix)
   103  		got, err := WriteRuneChunk(dec, addRune, chunk)
   104  		if want := len(chunk); got != want || err != nil {
   105  			t.Errorf("%q WriteRuneChunk(%q) got (%d,%v), want (%d,nil)", text, chunk, got, err, want)
   106  		}
   107  	}
   108  	return runes
   109  }
   110  
   111  func flushRuneChunk(t testing.TB, dec RuneChunkDecoder, text string) []rune {
   112  	var runes []rune
   113  	addRune := func(r rune) error {
   114  		runes = append(runes, r)
   115  		return nil
   116  	}
   117  	// Flush the decoder.
   118  	if err := FlushRuneChunk(dec, addRune); err != nil {
   119  		t.Errorf("%q FlushRuneChunk got %v, want nil", text, err)
   120  	}
   121  	return runes
   122  }
   123  
   124  func nextChunk(text []byte, sizes []int, index int) (chunk, remain []byte) {
   125  	if len(sizes) == 0 {
   126  		return text, nil
   127  	}
   128  	size := sizes[index%len(sizes)]
   129  	if size >= len(text) {
   130  		return text, nil
   131  	}
   132  	return text[:size], text[size:]
   133  }
   134  
// benchText is the input used by the decoder benchmarks.  It is 100
// repetitions of a line that mixes 1-byte (ASCII letters, digits and
// punctuation), 2-byte (Greek), 3-byte (CJK) and 4-byte (U+1F680..U+1F683)
// runes with an explicit U+FFFD and the invalid byte \xFF.
var benchText = strings.Repeat("a bc def ghij klmno pqrstu vwxyz A BC DEF GHIJ KLMNO PQRSTU VWXYZ 0123456789 !@#$%^&*()ΔΘΠΣΦ王普澤世界\U0001F680\U0001F681\U0001F682\U0001F683\uFFFD\xFF ", 100)
   137  
   138  func benchRuneChunkDecoder(b *testing.B, dec RuneChunkDecoder, sizes []int) {
   139  	for i := 0; i < b.N; i++ {
   140  		writeRuneChunk(b, dec, benchText, sizes)
   141  		flushRuneChunk(b, dec, benchText)
   142  	}
   143  }
   144  
   145  func BenchmarkUTF8ChunkDecoder_Sizes_0(b *testing.B) {
   146  	benchRuneChunkDecoder(b, new(UTF8ChunkDecoder), nil)
   147  }
   148  func BenchmarkUTF8ChunkDecoder_Sizes_1(b *testing.B) {
   149  	benchRuneChunkDecoder(b, new(UTF8ChunkDecoder), []int{1})
   150  }
   151  func BenchmarkUTF8ChunkDecoder_Sizes_1_2_3(b *testing.B) {
   152  	benchRuneChunkDecoder(b, new(UTF8ChunkDecoder), []int{1, 2, 3})
   153  }