github.com/liquid-dev/text@v0.3.3-liquid/encoding/unicode/unicode_test.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package unicode 6 7 import ( 8 "testing" 9 10 "github.com/liquid-dev/text/encoding" 11 "github.com/liquid-dev/text/encoding/charmap" 12 "github.com/liquid-dev/text/encoding/internal/enctest" 13 "github.com/liquid-dev/text/transform" 14 ) 15 16 func TestBasics(t *testing.T) { 17 testCases := []struct { 18 e encoding.Encoding 19 encPrefix string 20 encSuffix string 21 encoded string 22 utf8 string 23 }{{ 24 e: utf16BEIB, 25 encoded: "\x00\x57\x00\xe4\xd8\x35\xdd\x65", 26 utf8: "\x57\u00e4\U0001d565", 27 }, { 28 e: utf16BEEB, 29 encPrefix: "\xfe\xff", 30 encoded: "\x00\x57\x00\xe4\xd8\x35\xdd\x65", 31 utf8: "\x57\u00e4\U0001d565", 32 }, { 33 e: utf16LEIB, 34 encoded: "\x57\x00\xe4\x00\x35\xd8\x65\xdd", 35 utf8: "\x57\u00e4\U0001d565", 36 }, { 37 e: utf16LEEB, 38 encPrefix: "\xff\xfe", 39 encoded: "\x57\x00\xe4\x00\x35\xd8\x65\xdd", 40 utf8: "\x57\u00e4\U0001d565", 41 }} 42 43 for _, tc := range testCases { 44 enctest.TestEncoding(t, tc.e, tc.encoded, tc.utf8, tc.encPrefix, tc.encSuffix) 45 } 46 } 47 48 func TestFiles(t *testing.T) { 49 enctest.TestFile(t, UTF8) 50 enctest.TestFile(t, utf16LEIB) 51 } 52 53 func BenchmarkEncoding(b *testing.B) { 54 enctest.Benchmark(b, UTF8) 55 enctest.Benchmark(b, utf16LEIB) 56 } 57 58 var ( 59 utf16LEIB = UTF16(LittleEndian, IgnoreBOM) // UTF-16LE (atypical interpretation) 60 utf16LEUB = UTF16(LittleEndian, UseBOM) // UTF-16, LE 61 utf16LEEB = UTF16(LittleEndian, ExpectBOM) // UTF-16, LE, Expect 62 utf16BEIB = UTF16(BigEndian, IgnoreBOM) // UTF-16BE (atypical interpretation) 63 utf16BEUB = UTF16(BigEndian, UseBOM) // UTF-16 default 64 utf16BEEB = UTF16(BigEndian, ExpectBOM) // UTF-16 Expect 65 ) 66 67 func TestUTF16(t *testing.T) { 68 testCases := []struct { 69 desc string 70 src string 71 notEOF bool // the inverse of atEOF 72 sizeDst int 73 want string 74 nSrc int 75 err error 76 t transform.Transformer 77 }{{ 78 desc: "utf-16 IgnoreBOM dec: empty string", 79 t: utf16BEIB.NewDecoder(), 80 }, { 81 desc: "utf-16 UseBOM dec: empty string", 82 t: utf16BEUB.NewDecoder(), 83 }, { 84 desc: "utf-16 ExpectBOM dec: empty string", 85 err: ErrMissingBOM, 86 t: utf16BEEB.NewDecoder(), 87 }, { 88 desc: "utf-16 dec: BOM determines encoding BE (RFC 2781:3.3)", 89 src: "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61", 90 sizeDst: 100, 91 want: "\U00012345=Ra", 92 nSrc: 12, 93 t: utf16BEUB.NewDecoder(), 94 }, { 95 desc: "utf-16 dec: BOM determines encoding LE (RFC 2781:3.3)", 96 src: "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00", 97 sizeDst: 100, 98 want: "\U00012345=Ra", 99 nSrc: 12, 100 t: utf16LEUB.NewDecoder(), 101 }, { 102 desc: "utf-16 dec: BOM determines encoding LE, change default (RFC 2781:3.3)", 103 src: "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00", 104 sizeDst: 100, 105 want: "\U00012345=Ra", 106 nSrc: 12, 107 t: utf16BEUB.NewDecoder(), 108 }, { 109 desc: "utf-16 dec: Fail on missing BOM when required", 110 src: "\x08\xD8\x45\xDF\x3D\x00\xFF\xFE\xFE\xFF\x00\x52\x00\x61", 111 sizeDst: 100, 112 want: "", 113 nSrc: 0, 114 err: ErrMissingBOM, 115 t: utf16BEEB.NewDecoder(), 116 }, { 117 desc: "utf-16 dec: SHOULD interpret text as big-endian when BOM not present (RFC 2781:4.3)", 118 src: "\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61", 119 sizeDst: 100, 120 want: "\U00012345=Ra", 121 nSrc: 10, 122 t: utf16BEUB.NewDecoder(), 123 }, { 124 // This is an error according to RFC 2781. But errors in RFC 2781 are 125 // open to interpretations, so I guess this is fine. 126 desc: "utf-16le dec: incorrect BOM is an error (RFC 2781:4.1)", 127 src: "\xFE\xFF\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00", 128 sizeDst: 100, 129 want: "\uFFFE\U00012345=Ra", 130 nSrc: 12, 131 t: utf16LEIB.NewDecoder(), 132 }, { 133 desc: "utf-16 enc: SHOULD write BOM (RFC 2781:3.3)", 134 src: "\U00012345=Ra", 135 sizeDst: 100, 136 want: "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00", 137 nSrc: 7, 138 t: utf16LEUB.NewEncoder(), 139 }, { 140 desc: "utf-16 enc: SHOULD write BOM (RFC 2781:3.3)", 141 src: "\U00012345=Ra", 142 sizeDst: 100, 143 want: "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61", 144 nSrc: 7, 145 t: utf16BEUB.NewEncoder(), 146 }, { 147 desc: "utf-16le enc: MUST NOT write BOM (RFC 2781:3.3)", 148 src: "\U00012345=Ra", 149 sizeDst: 100, 150 want: "\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00", 151 nSrc: 7, 152 t: utf16LEIB.NewEncoder(), 153 }, { 154 desc: "utf-16be dec: incorrect UTF-16: odd bytes", 155 src: "\x00", 156 sizeDst: 100, 157 want: "\uFFFD", 158 nSrc: 1, 159 t: utf16BEIB.NewDecoder(), 160 }, { 161 desc: "utf-16be dec: unpaired surrogate, odd bytes", 162 src: "\xD8\x45\x00", 163 sizeDst: 100, 164 want: "\uFFFD\uFFFD", 165 nSrc: 3, 166 t: utf16BEIB.NewDecoder(), 167 }, { 168 desc: "utf-16be dec: unpaired low surrogate + valid text", 169 src: "\xD8\x45\x00a", 170 sizeDst: 100, 171 want: "\uFFFDa", 172 nSrc: 4, 173 t: utf16BEIB.NewDecoder(), 174 }, { 175 desc: "utf-16be dec: unpaired low surrogate + valid text + single byte", 176 src: "\xD8\x45\x00ab", 177 sizeDst: 100, 178 want: "\uFFFDa\uFFFD", 179 nSrc: 5, 180 t: utf16BEIB.NewDecoder(), 181 }, { 182 desc: "utf-16le dec: unpaired high surrogate", 183 src: "\x00\x00\x00\xDC\x12\xD8", 184 sizeDst: 100, 185 want: "\x00\uFFFD\uFFFD", 186 nSrc: 6, 187 t: utf16LEIB.NewDecoder(), 188 }, { 189 desc: "utf-16be dec: two unpaired low surrogates", 190 src: "\xD8\x45\xD8\x12", 191 sizeDst: 100, 192 want: "\uFFFD\uFFFD", 193 nSrc: 4, 194 t: utf16BEIB.NewDecoder(), 195 }, { 196 desc: "utf-16be dec: short dst", 197 src: "\x00a", 198 sizeDst: 0, 199 want: "", 200 nSrc: 0, 201 t: utf16BEIB.NewDecoder(), 202 err: transform.ErrShortDst, 203 }, { 204 desc: "utf-16be dec: short dst surrogate", 205 src: "\xD8\xF5\xDC\x12", 206 sizeDst: 3, 207 want: "", 208 nSrc: 0, 209 t: utf16BEIB.NewDecoder(), 210 err: transform.ErrShortDst, 211 }, { 212 desc: "utf-16be dec: short dst trailing byte", 213 src: "\x00", 214 sizeDst: 2, 215 want: "", 216 nSrc: 0, 217 t: utf16BEIB.NewDecoder(), 218 err: transform.ErrShortDst, 219 }, { 220 desc: "utf-16be dec: short src", 221 src: "\x00", 222 notEOF: true, 223 sizeDst: 3, 224 want: "", 225 nSrc: 0, 226 t: utf16BEIB.NewDecoder(), 227 err: transform.ErrShortSrc, 228 }, { 229 desc: "utf-16 enc", 230 src: "\U00012345=Ra", 231 sizeDst: 100, 232 want: "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61", 233 nSrc: 7, 234 t: utf16BEUB.NewEncoder(), 235 }, { 236 desc: "utf-16 enc: short dst normal", 237 src: "\U00012345=Ra", 238 sizeDst: 9, 239 want: "\xD8\x08\xDF\x45\x00\x3D\x00\x52", 240 nSrc: 6, 241 t: utf16BEIB.NewEncoder(), 242 err: transform.ErrShortDst, 243 }, { 244 desc: "utf-16 enc: short dst surrogate", 245 src: "\U00012345=Ra", 246 sizeDst: 3, 247 want: "", 248 nSrc: 0, 249 t: utf16BEIB.NewEncoder(), 250 err: transform.ErrShortDst, 251 }, { 252 desc: "utf-16 enc: short src", 253 src: "\U00012345=Ra\xC2", 254 notEOF: true, 255 sizeDst: 100, 256 want: "\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61", 257 nSrc: 7, 258 t: utf16BEIB.NewEncoder(), 259 err: transform.ErrShortSrc, 260 }, { 261 desc: "utf-16be dec: don't change byte order mid-stream", 262 src: "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\xFF\xFE\x00\x52\x00\x61", 263 sizeDst: 100, 264 want: "\U00012345=\ufffeRa", 265 nSrc: 14, 266 t: utf16BEUB.NewDecoder(), 267 }, { 268 desc: "utf-16le dec: don't change byte order mid-stream", 269 src: "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\xFF\xFE\xFE\xFF\x52\x00\x61\x00", 270 sizeDst: 100, 271 want: "\U00012345=\ufeff\ufffeRa", 272 nSrc: 16, 273 t: utf16LEUB.NewDecoder(), 274 }} 275 for i, tc := range testCases { 276 b := make([]byte, tc.sizeDst) 277 nDst, nSrc, err := tc.t.Transform(b, []byte(tc.src), !tc.notEOF) 278 if err != tc.err { 279 t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err) 280 } 281 if got := string(b[:nDst]); got != tc.want { 282 t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want) 283 } 284 if nSrc != tc.nSrc { 285 t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc) 286 } 287 } 288 } 289 290 func TestUTF8Decoder(t *testing.T) { 291 testCases := []struct { 292 desc string 293 src string 294 notEOF bool // the inverse of atEOF 295 sizeDst int 296 want string 297 nSrc int 298 err error 299 }{{ 300 desc: "empty string, empty dest buffer", 301 }, { 302 desc: "empty string", 303 sizeDst: 8, 304 }, { 305 desc: "empty string, streaming", 306 notEOF: true, 307 sizeDst: 8, 308 }, { 309 desc: "ascii", 310 src: "abcde", 311 sizeDst: 8, 312 want: "abcde", 313 nSrc: 5, 314 }, { 315 desc: "ascii and error", 316 src: "ab\x80de", 317 sizeDst: 7, 318 want: "ab\ufffdde", 319 nSrc: 5, 320 }, { 321 desc: "valid two-byte sequence", 322 src: "a\u0300bc", 323 sizeDst: 7, 324 want: "a\u0300bc", 325 nSrc: 5, 326 }, { 327 desc: "valid three-byte sequence", 328 src: "a\u0300中", 329 sizeDst: 7, 330 want: "a\u0300中", 331 nSrc: 6, 332 }, { 333 desc: "valid four-byte sequence", 334 src: "a中\U00016F50", 335 sizeDst: 8, 336 want: "a中\U00016F50", 337 nSrc: 8, 338 }, { 339 desc: "short source buffer", 340 src: "abc\xf0\x90", 341 notEOF: true, 342 sizeDst: 10, 343 want: "abc", 344 nSrc: 3, 345 err: transform.ErrShortSrc, 346 }, { 347 // We don't check for the maximal subpart of an ill-formed subsequence 348 // at the end of an open segment. 349 desc: "complete invalid that looks like short at end", 350 src: "abc\xf0\x80", 351 notEOF: true, 352 sizeDst: 10, 353 want: "abc", // instead of "abc\ufffd\ufffd", 354 nSrc: 3, 355 err: transform.ErrShortSrc, 356 }, { 357 desc: "incomplete sequence at end", 358 src: "a\x80bc\xf0\x90", 359 sizeDst: 9, 360 want: "a\ufffdbc\ufffd", 361 nSrc: 6, 362 }, { 363 desc: "invalid second byte", 364 src: "abc\xf0dddd", 365 sizeDst: 10, 366 want: "abc\ufffddddd", 367 nSrc: 8, 368 }, { 369 desc: "invalid second byte at end", 370 src: "abc\xf0d", 371 sizeDst: 10, 372 want: "abc\ufffdd", 373 nSrc: 5, 374 }, { 375 desc: "invalid third byte", 376 src: "a\u0300bc\xf0\x90dddd", 377 sizeDst: 12, 378 want: "a\u0300bc\ufffddddd", 379 nSrc: 11, 380 }, { 381 desc: "invalid third byte at end", 382 src: "a\u0300bc\xf0\x90d", 383 sizeDst: 12, 384 want: "a\u0300bc\ufffdd", 385 nSrc: 8, 386 }, { 387 desc: "invalid fourth byte, tight buffer", 388 src: "a\u0300bc\xf0\x90\x80d", 389 sizeDst: 9, 390 want: "a\u0300bc\ufffdd", 391 nSrc: 9, 392 }, { 393 desc: "invalid fourth byte at end", 394 src: "a\u0300bc\xf0\x90\x80", 395 sizeDst: 8, 396 want: "a\u0300bc\ufffd", 397 nSrc: 8, 398 }, { 399 desc: "invalid fourth byte and short four byte sequence", 400 src: "a\u0300bc\xf0\x90\x80\xf0\x90\x80", 401 notEOF: true, 402 sizeDst: 20, 403 want: "a\u0300bc\ufffd", 404 nSrc: 8, 405 err: transform.ErrShortSrc, 406 }, { 407 desc: "valid four-byte sequence overflowing short buffer", 408 src: "a\u0300bc\xf0\x90\x80\x80", 409 notEOF: true, 410 sizeDst: 8, 411 want: "a\u0300bc", 412 nSrc: 5, 413 err: transform.ErrShortDst, 414 }, { 415 desc: "invalid fourth byte at end short, but short dst", 416 src: "a\u0300bc\xf0\x90\x80\xf0\x90\x80", 417 notEOF: true, 418 sizeDst: 8, 419 // More bytes would fit in the buffer, but this seems to require a more 420 // complicated and slower algorithm. 421 want: "a\u0300bc", // instead of "a\u0300bc" 422 nSrc: 5, 423 err: transform.ErrShortDst, 424 }, { 425 desc: "short dst for error", 426 src: "abc\x80", 427 notEOF: true, 428 sizeDst: 5, 429 want: "abc", 430 nSrc: 3, 431 err: transform.ErrShortDst, 432 }, { 433 desc: "adjusting short dst buffer", 434 src: "abc\x80ef", 435 notEOF: true, 436 sizeDst: 6, 437 want: "abc\ufffd", 438 nSrc: 4, 439 err: transform.ErrShortDst, 440 }} 441 tr := UTF8.NewDecoder() 442 for i, tc := range testCases { 443 b := make([]byte, tc.sizeDst) 444 nDst, nSrc, err := tr.Transform(b, []byte(tc.src), !tc.notEOF) 445 if err != tc.err { 446 t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err) 447 } 448 if got := string(b[:nDst]); got != tc.want { 449 t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want) 450 } 451 if nSrc != tc.nSrc { 452 t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc) 453 } 454 } 455 } 456 457 func TestBOMOverride(t *testing.T) { 458 dec := BOMOverride(charmap.CodePage437.NewDecoder()) 459 dst := make([]byte, 100) 460 for i, tc := range []struct { 461 src string 462 atEOF bool 463 dst string 464 nSrc int 465 err error 466 }{ 467 0: {"H\x82ll\x93", true, "Héllô", 5, nil}, 468 1: {"\uFEFFHéllö", true, "Héllö", 10, nil}, 469 2: {"\xFE\xFF\x00H\x00e\x00l\x00l\x00o", true, "Hello", 12, nil}, 470 3: {"\xFF\xFEH\x00e\x00l\x00l\x00o\x00", true, "Hello", 12, nil}, 471 4: {"\uFEFF", true, "", 3, nil}, 472 5: {"\xFE\xFF", true, "", 2, nil}, 473 6: {"\xFF\xFE", true, "", 2, nil}, 474 7: {"\xEF\xBB", true, "\u2229\u2557", 2, nil}, 475 8: {"\xEF", true, "\u2229", 1, nil}, 476 9: {"", true, "", 0, nil}, 477 10: {"\xFE", true, "\u25a0", 1, nil}, 478 11: {"\xFF", true, "\u00a0", 1, nil}, 479 12: {"\xEF\xBB", false, "", 0, transform.ErrShortSrc}, 480 13: {"\xEF", false, "", 0, transform.ErrShortSrc}, 481 14: {"", false, "", 0, transform.ErrShortSrc}, 482 15: {"\xFE", false, "", 0, transform.ErrShortSrc}, 483 16: {"\xFF", false, "", 0, transform.ErrShortSrc}, 484 17: {"\xFF\xFE", false, "", 0, transform.ErrShortSrc}, 485 } { 486 dec.Reset() 487 nDst, nSrc, err := dec.Transform(dst, []byte(tc.src), tc.atEOF) 488 got := string(dst[:nDst]) 489 if nSrc != tc.nSrc { 490 t.Errorf("%d: nSrc: got %d; want %d", i, nSrc, tc.nSrc) 491 } 492 if got != tc.dst { 493 t.Errorf("%d: got %+q; want %+q", i, got, tc.dst) 494 } 495 if err != tc.err { 496 t.Errorf("%d: error: got %v; want %v", i, err, tc.err) 497 } 498 } 499 }