github.com/btwiuse/jiri@v0.0.0-20191125065820-53353bcfef54/textutil/utf8_test.go (about) 1 // Copyright 2015 The Vanadium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package textutil 6 7 import ( 8 "reflect" 9 "strings" 10 "testing" 11 ) 12 13 func TestUTF8ChunkDecoder(t *testing.T) { 14 r2 := "Δ" 15 r3 := "王" 16 r4 := "\U0001F680" 17 tests := []struct { 18 Text string 19 Write []rune 20 Flush []rune 21 }{ 22 {"", nil, nil}, 23 {"a", []rune{'a'}, nil}, 24 {"abc", []rune{'a', 'b', 'c'}, nil}, 25 {"abc def ghi", []rune{'a', 'b', 'c', ' ', 'd', 'e', 'f', ' ', 'g', 'h', 'i'}, nil}, 26 // 2-byte runes. 27 {"ΔΘΠΣΦ", []rune{'Δ', 'Θ', 'Π', 'Σ', 'Φ'}, nil}, 28 // 3-byte runes. 29 {"王普澤世界", []rune{'王', '普', '澤', '世', '界'}, nil}, 30 // 4-byte runes. 31 {"\U0001F680\U0001F681\U0001F682\U0001F683", []rune{'\U0001F680', '\U0001F681', '\U0001F682', '\U0001F683'}, nil}, 32 // Mixed-bytes. 33 {"aΔ王\U0001F680普Θb", []rune{'a', 'Δ', '王', '\U0001F680', '普', 'Θ', 'b'}, nil}, 34 // Error runes translated to U+FFFD. 35 {"\uFFFD", []rune{'\uFFFD'}, nil}, 36 {"a\uFFFDb", []rune{'a', '\uFFFD', 'b'}, nil}, 37 {"\x80", []rune{'\uFFFD'}, nil}, 38 {"\xFF", []rune{'\uFFFD'}, nil}, 39 {"a\x80b", []rune{'a', '\uFFFD', 'b'}, nil}, 40 {"a\xFFb", []rune{'a', '\uFFFD', 'b'}, nil}, 41 // Multi-byte full runes. 42 {r2, []rune{[]rune(r2)[0]}, nil}, 43 {r3, []rune{[]rune(r3)[0]}, nil}, 44 {r4, []rune{[]rune(r4)[0]}, nil}, 45 // Partial runes translated to one or more U+FFFD. Since each case is a 46 // multi-byte encoding that's missing one or more bytes, the FFFD bytes are 47 // all returned in Flush rather than Write. 48 {r2[:1], nil, []rune{'\uFFFD'}}, 49 {r3[:1], nil, []rune{'\uFFFD'}}, 50 {r3[:2], nil, []rune{'\uFFFD', '\uFFFD'}}, 51 {r4[:1], nil, []rune{'\uFFFD'}}, 52 {r4[:2], nil, []rune{'\uFFFD', '\uFFFD'}}, 53 {r4[:3], nil, []rune{'\uFFFD', '\uFFFD', '\uFFFD'}}, 54 // Trailing partial runes translated to U+FFFD. Similar to above, the FFFD 55 // bytes are all returned in Flush rather than Write 56 {"a" + r2[:1], []rune{'a'}, []rune{'\uFFFD'}}, 57 {"a" + r3[:1], []rune{'a'}, []rune{'\uFFFD'}}, 58 {"a" + r3[:2], []rune{'a'}, []rune{'\uFFFD', '\uFFFD'}}, 59 {"a" + r4[:1], []rune{'a'}, []rune{'\uFFFD'}}, 60 {"a" + r4[:2], []rune{'a'}, []rune{'\uFFFD', '\uFFFD'}}, 61 {"a" + r4[:3], []rune{'a'}, []rune{'\uFFFD', '\uFFFD', '\uFFFD'}}, 62 // Leading partial runes translated to U+FFFD. The "b" suffix causes us to 63 // discover that the encoding is invalid during Write. 64 {r2[:1] + "b", []rune{'\uFFFD', 'b'}, nil}, 65 {r3[:1] + "b", []rune{'\uFFFD', 'b'}, nil}, 66 {r3[:2] + "b", []rune{'\uFFFD', '\uFFFD', 'b'}, nil}, 67 {r4[:1] + "b", []rune{'\uFFFD', 'b'}, nil}, 68 {r4[:2] + "b", []rune{'\uFFFD', '\uFFFD', 'b'}, nil}, 69 {r4[:3] + "b", []rune{'\uFFFD', '\uFFFD', '\uFFFD', 'b'}, nil}, 70 // Bracketed partial runes translated to U+FFFD. 71 {"a" + r2[:1] + "b", []rune{'a', '\uFFFD', 'b'}, nil}, 72 {"a" + r3[:1] + "b", []rune{'a', '\uFFFD', 'b'}, nil}, 73 {"a" + r3[:2] + "b", []rune{'a', '\uFFFD', '\uFFFD', 'b'}, nil}, 74 {"a" + r4[:1] + "b", []rune{'a', '\uFFFD', 'b'}, nil}, 75 {"a" + r4[:2] + "b", []rune{'a', '\uFFFD', '\uFFFD', 'b'}, nil}, 76 {"a" + r4[:3] + "b", []rune{'a', '\uFFFD', '\uFFFD', '\uFFFD', 'b'}, nil}, 77 } 78 for _, test := range tests { 79 // Run with a variety of chunk sizes. 80 for _, sizes := range [][]int{nil, {1}, {2}, {1, 2}, {2, 1}, {3}, {1, 2, 3}} { 81 dec := new(UTF8ChunkDecoder) 82 if got, want := writeRuneChunk(t, dec, test.Text, sizes), test.Write; !reflect.DeepEqual(got, want) { 83 t.Errorf("%q write got %v, want %v", test.Text, got, want) 84 } 85 if got, want := flushRuneChunk(t, dec, test.Text), test.Flush; !reflect.DeepEqual(got, want) { 86 t.Errorf("%q flush got %v, want %v", test.Text, got, want) 87 } 88 } 89 } 90 } 91 92 func writeRuneChunk(t testing.TB, dec RuneChunkDecoder, text string, sizes []int) []rune { 93 var runes []rune 94 addRune := func(r rune) error { 95 runes = append(runes, r) 96 return nil 97 } 98 // Write chunks of different sizes until we've exhausted the input text. 99 remain := []byte(text) 100 for ix := 0; len(remain) > 0; ix++ { 101 var chunk []byte 102 chunk, remain = nextChunk(remain, sizes, ix) 103 got, err := WriteRuneChunk(dec, addRune, chunk) 104 if want := len(chunk); got != want || err != nil { 105 t.Errorf("%q WriteRuneChunk(%q) got (%d,%v), want (%d,nil)", text, chunk, got, err, want) 106 } 107 } 108 return runes 109 } 110 111 func flushRuneChunk(t testing.TB, dec RuneChunkDecoder, text string) []rune { 112 var runes []rune 113 addRune := func(r rune) error { 114 runes = append(runes, r) 115 return nil 116 } 117 // Flush the decoder. 118 if err := FlushRuneChunk(dec, addRune); err != nil { 119 t.Errorf("%q FlushRuneChunk got %v, want nil", text, err) 120 } 121 return runes 122 } 123 124 func nextChunk(text []byte, sizes []int, index int) (chunk, remain []byte) { 125 if len(sizes) == 0 { 126 return text, nil 127 } 128 size := sizes[index%len(sizes)] 129 if size >= len(text) { 130 return text, nil 131 } 132 return text[:size], text[size:] 133 } 134 135 // benchText contains a mix of 1, 2, 3 and 4 byte runes, and invalid encodings. 136 var benchText = strings.Repeat("a bc def ghij klmno pqrstu vwxyz A BC DEF GHIJ KLMNO PQRSTU VWXYZ 0123456789 !@#$%^&*()ΔΘΠΣΦ王普澤世界\U0001F680\U0001F681\U0001F682\U0001F683\uFFFD\xFF ", 100) 137 138 func benchRuneChunkDecoder(b *testing.B, dec RuneChunkDecoder, sizes []int) { 139 for i := 0; i < b.N; i++ { 140 writeRuneChunk(b, dec, benchText, sizes) 141 flushRuneChunk(b, dec, benchText) 142 } 143 } 144 145 func BenchmarkUTF8ChunkDecoder_Sizes_0(b *testing.B) { 146 benchRuneChunkDecoder(b, new(UTF8ChunkDecoder), nil) 147 } 148 func BenchmarkUTF8ChunkDecoder_Sizes_1(b *testing.B) { 149 benchRuneChunkDecoder(b, new(UTF8ChunkDecoder), []int{1}) 150 } 151 func BenchmarkUTF8ChunkDecoder_Sizes_1_2_3(b *testing.B) { 152 benchRuneChunkDecoder(b, new(UTF8ChunkDecoder), []int{1, 2, 3}) 153 }