github.com/primecitizens/pcz/std@v0.2.1/text/unicode/wtf16/wtf16_test.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright 2023 The Prime Citizens 3 // 4 // Copyright 2023 The Go Authors. All rights reserved. 5 // Use of this source code is governed by a BSD-style 6 // license that can be found in the LICENSE file. 7 8 package wtf16 9 10 import ( 11 "fmt" 12 "slices" 13 "testing" 14 "unsafe" 15 16 stdstring "github.com/primecitizens/pcz/std/builtin/string" 17 "github.com/primecitizens/pcz/std/text/unicode/utf16" 18 "github.com/primecitizens/pcz/std/text/unicode/utf8" 19 ) 20 21 var wtf8tests = []struct { 22 str string 23 wstr []uint16 24 }{ 25 { 26 str: "\x00", 27 wstr: []uint16{0x00}, 28 }, 29 { 30 str: "\x5C", 31 wstr: []uint16{0x5C}, 32 }, 33 { 34 str: "\x7F", 35 wstr: []uint16{0x7F}, 36 }, 37 38 // 2-byte 39 { 40 str: "\xC2\x80", 41 wstr: []uint16{0x80}, 42 }, 43 { 44 str: "\xD7\x8A", 45 wstr: []uint16{0x05CA}, 46 }, 47 { 48 str: "\xDF\xBF", 49 wstr: []uint16{0x07FF}, 50 }, 51 52 // 3-byte 53 { 54 str: "\xE0\xA0\x80", 55 wstr: []uint16{0x0800}, 56 }, 57 { 58 str: "\xE2\xB0\xBC", 59 wstr: []uint16{0x2C3C}, 60 }, 61 { 62 str: "\xEF\xBF\xBF", 63 wstr: []uint16{0xFFFF}, 64 }, 65 // unmatched surrogate halves 66 // high surrogates: 0xD800 to 0xDBFF 67 { 68 str: "\xED\xA0\x80", 69 wstr: []uint16{0xD800}, 70 }, 71 { 72 // "High surrogate followed by another high surrogate" 73 str: "\xED\xA0\x80\xED\xA0\x80", 74 wstr: []uint16{0xD800, 0xD800}, 75 }, 76 { 77 // "High surrogate followed by a symbol that is not a surrogate" 78 str: string([]byte{0xED, 0xA0, 0x80, 0xA}), 79 wstr: []uint16{0xD800, 0xA}, 80 }, 81 { 82 // "Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate" 83 str: string([]byte{0xED, 0xA0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xA0, 0x80}), 84 wstr: []uint16{0xD800, 0xD834, 0xDF06, 0xD800}, 85 }, 86 { 87 str: "\xED\xA6\xAF", 88 wstr: []uint16{0xD9AF}, 89 }, 90 { 91 str: "\xED\xAF\xBF", 92 wstr: []uint16{0xDBFF}, 93 }, 94 // low surrogates: 0xDC00 to 0xDFFF 95 { 96 str: "\xED\xB0\x80", 97 wstr: []uint16{0xDC00}, 98 }, 99 { 100 // "Low surrogate followed by another low surrogate" 101 str: "\xED\xB0\x80\xED\xB0\x80", 102 wstr: []uint16{0xDC00, 0xDC00}, 103 }, 104 { 105 // "Low surrogate followed by a symbol that is not a surrogate" 106 str: string([]byte{0xED, 0xB0, 0x80, 0xA}), 107 wstr: []uint16{0xDC00, 0xA}, 108 }, 109 { 110 // "Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate" 111 str: string([]byte{0xED, 0xB0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xB0, 0x80}), 112 wstr: []uint16{0xDC00, 0xD834, 0xDF06, 0xDC00}, 113 }, 114 { 115 str: "\xED\xBB\xAE", 116 wstr: []uint16{0xDEEE}, 117 }, 118 { 119 str: "\xED\xBF\xBF", 120 wstr: []uint16{0xDFFF}, 121 }, 122 123 // 4-byte 124 { 125 str: "\xF0\x90\x80\x80", 126 wstr: []uint16{0xD800, 0xDC00}, 127 }, 128 { 129 str: "\xF0\x9D\x8C\x86", 130 wstr: []uint16{0xD834, 0xDF06}, 131 }, 132 { 133 str: "\xF4\x8F\xBF\xBF", 134 wstr: []uint16{0xDBFF, 0xDFFF}, 135 }, 136 } 137 138 func TestWTF16Rountrip(t *testing.T) { 139 for _, tt := range wtf8tests { 140 t.Run(fmt.Sprintf("%X", tt.str), func(t *testing.T) { 141 got := Encode(nil, tt.str) 142 got2 := string(WTF8DecodeAll(nil, got...)) 143 if got2 != tt.str { 144 t.Errorf("got:\n%s\nwant:\n%s", got2, tt.str) 145 } 146 }) 147 } 148 } 149 150 func TestWTF16Golden(t *testing.T) { 151 for _, tt := range wtf8tests { 152 t.Run(fmt.Sprintf("%X", tt.str), func(t *testing.T) { 153 got := Encode(nil, tt.str) 154 if !slices.Equal(got, tt.wstr) { 155 t.Errorf("got:\n%v\nwant:\n%v", got, tt.wstr) 156 } 157 }) 158 } 159 } 160 161 func FuzzEncodeWTF16(f *testing.F) { 162 for _, tt := range wtf8tests { 163 f.Add(tt.str) 164 } 165 f.Fuzz(func(t *testing.T, b string) { 166 // test that there are no panics 167 got := Encode(nil, b) 168 WTF8Decode(nil, got...) 169 if utf8.Valid(b) { 170 // if the input is a valid UTF-8 string, then 171 // test that encodeWTF16 behaves as 172 // utf16.Encode 173 want := utf16.AppendRunes(nil, []rune(b)...) 174 if !slices.Equal(got, want) { 175 t.Errorf("got:\n%v\nwant:\n%v", got, want) 176 } 177 } 178 }) 179 } 180 181 func FuzzDecodeWTF16(f *testing.F) { 182 for _, tt := range wtf8tests { 183 b := unsafe.Slice((*uint8)(unsafe.Pointer(unsafe.SliceData(tt.wstr))), len(tt.wstr)*2) 184 f.Add(b) 185 } 186 f.Fuzz(func(t *testing.T, b []byte) { 187 u16 := unsafe.Slice((*uint16)(unsafe.Pointer(unsafe.SliceData(b))), len(b)/2) 188 got := WTF8DecodeAll(nil, u16...) 189 if utf8.Valid(stdstring.FromBytes(got)) { 190 // if the input is a valid UTF-8 string, then 191 // test that decodeWTF16 behaves as 192 // utf16.Decode 193 want := utf16.RunesAppend(nil, u16...) 194 if string(got) != string(want) { 195 t.Errorf("got:\n%s\nwant:\n%s", string(got), string(want)) 196 } 197 } 198 // WTF-8 should always roundtrip 199 got2 := Encode(nil, string(got)) 200 if !slices.Equal(got2, u16) { 201 t.Errorf("got:\n%v\nwant:\n%v", got2, u16) 202 } 203 }) 204 }