github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/encoding/encoding_test.go

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package encoding_test

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"strings"
	"testing"

	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/charmap"
	"golang.org/x/text/encoding/japanese"
	"golang.org/x/text/encoding/korean"
	"golang.org/x/text/encoding/simplifiedchinese"
	"golang.org/x/text/encoding/traditionalchinese"
	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

func trim(s string) string {
	if len(s) < 120 {
		return s
	}
	return s[:50] + "..." + s[len(s)-50:]
}

var basicTestCases = []struct {
	e         encoding.Encoding
	encPrefix string
	encSuffix string
	encoded   string
	utf8      string
}{
	// The encoded forms can be verified by the iconv program:
	// $ echo 月日は百代 | iconv -f UTF-8 -t SHIFT-JIS | xxd

	// Charmap tests.
	{
		e:       charmap.CodePage437,
		encoded: "H\x82ll\x93 \x9d\xa7\xf4\x9c\xbe",
		utf8:    "Héllô ¥º⌠£╛",
	},
	{
		e:       charmap.CodePage866,
		encoded: "H\xf3\xd3o \x98\xfd\x9f\xdd\xa1",
		utf8:    "Hє╙o Ш¤Я▌б",
	},
	{
		e:       charmap.ISO8859_2,
		encoded: "Hel\xe5\xf5",
		utf8:    "Helĺő",
	},
	{
		e:       charmap.ISO8859_3,
		encoded: "He\xbd\xd4",
		utf8:    "He½Ô",
	},
	{
		e:       charmap.ISO8859_4,
		encoded: "Hel\xb6\xf8",
		utf8:    "Helļø",
	},
	{
		e:       charmap.ISO8859_5,
		encoded: "H\xd7\xc6o",
		utf8:    "HзЦo",
	},
	{
		e:       charmap.ISO8859_6,
		encoded: "Hel\xc2\xc9",
		utf8:    "Helآة",
	},
	{
		e:       charmap.ISO8859_7,
		encoded: "H\xeel\xebo",
		utf8:    "Hξlλo",
	},
	{
		e:       charmap.ISO8859_8,
		encoded: "Hel\xf5\xed",
		utf8:    "Helץם",
	},
	{
		e:       charmap.ISO8859_10,
		encoded: "H\xea\xbfo",
		utf8:    "Hęŋo",
	},
	{
		e:       charmap.ISO8859_13,
		encoded: "H\xe6l\xf9o",
		utf8:    "Hęlło",
	},
	{
		e:       charmap.ISO8859_14,
		encoded: "He\xfe\xd0o",
		utf8:    "HeŷŴo",
	},
	{
		e:       charmap.ISO8859_15,
		encoded: "H\xa4ll\xd8",
		utf8:    "H€llØ",
	},
	{
		e:       charmap.ISO8859_16,
		encoded: "H\xe6ll\xbd",
		utf8:    "Hællœ",
	},
	{
		e:       charmap.KOI8R,
		encoded: "He\x93\xad\x9c",
		utf8:    "He⌠╜°",
	},
	{
		e:       charmap.KOI8U,
		encoded: "He\x93\xad\x9c",
		utf8:    "He⌠ґ°",
	},
	{
		e:       charmap.Macintosh,
		encoded: "He\xdf\xd7",
		utf8:    "Hefl◊",
	},
	{
		e:       charmap.MacintoshCyrillic,
		encoded: "He\xbe\x94",
		utf8:    "HeЊФ",
	},
	{
		e:       charmap.Windows874,
		encoded: "He\xb7\xf0",
		utf8:    "Heท๐",
	},
	{
		e:       charmap.Windows1250,
		encoded: "He\xe5\xe5o",
		utf8:    "Heĺĺo",
	},
	{
		e:       charmap.Windows1251,
		encoded: "H\xball\xfe",
		utf8:    "Hєllю",
	},
	{
		e:       charmap.Windows1252,
		encoded: "H\xe9ll\xf4 \xa5\xbA\xae\xa3\xd0",
		utf8:    "Héllô ¥º®£Ð",
	},
	{
		e:       charmap.Windows1253,
		encoded: "H\xe5ll\xd6",
		utf8:    "HεllΦ",
	},
	{
		e:       charmap.Windows1254,
		encoded: "\xd0ello",
		utf8:    "Ğello",
	},
	{
		e:       charmap.Windows1255,
		encoded: "He\xd4o",
		utf8:    "Heװo",
	},
	{
		e:       charmap.Windows1256,
		encoded: "H\xdbllo",
		utf8:    "Hغllo",
	},
	{
		e:       charmap.Windows1257,
		encoded: "He\xeflo",
		utf8:    "Heļlo",
	},
	{
		e:       charmap.Windows1258,
		encoded: "Hell\xf5",
		utf8:    "Hellơ",
	},
	{
		e:       charmap.XUserDefined,
		encoded: "\x00\x40\x7f\x80\xab\xff",
		utf8:    "\u0000\u0040\u007f\uf780\uf7ab\uf7ff",
	},

	// UTF-16 tests.
	{
		e:       unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
		encoded: "\x00\x57\x00\xe4\xd8\x35\xdd\x65",
		utf8:    "\x57\u00e4\U0001d565",
	},
	{
		e:         utf16BEEB,
		encPrefix: "\xfe\xff",
		encoded:   "\x00\x57\x00\xe4\xd8\x35\xdd\x65",
		utf8:      "\x57\u00e4\U0001d565",
	},
	{
		e:       unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
		encoded: "\x57\x00\xe4\x00\x35\xd8\x65\xdd",
		utf8:    "\x57\u00e4\U0001d565",
	},
	{
		e:         utf16LEEB,
		encPrefix: "\xff\xfe",
		encoded:   "\x57\x00\xe4\x00\x35\xd8\x65\xdd",
		utf8:      "\x57\u00e4\U0001d565",
	},

	// Chinese tests.
	//
	// "\u0081\u00de\u00df\u00e0\u00e1\u00e2\u00e3\uffff\U00010000" is a
	// nonsense string that contains GB18030 encodable codepoints of which
	// only U+00E0 and U+00E1 are GBK encodable.
	//
	// "A\u3000\u554a\u4e02\u4e90\u72dc\u7349\u02ca\u2588Z€" is a nonsense
	// string that contains ASCII and GBK encodable codepoints from Levels
	// 1-5 as well as the Euro sign.
	//
	// "A\u43f0\u4c32\U00027267\u3000\U0002910d\u79d4Z€" is a nonsense string
	// that contains ASCII and Big5 encodable codepoints from the Basic
	// Multilingual Plane and the Supplementary Ideographic Plane as well as
	// the Euro sign.
	//
	// "花间一壶酒,独酌无相亲。" (simplified) and
	// "花間一壺酒,獨酌無相親。" (traditional)
	// are from the 8th century poem "Yuè Xià Dú Zhuó".
	{
		e: simplifiedchinese.GB18030,
		encoded: "\x81\x30\x81\x31\x81\x30\x89\x37\x81\x30\x89\x38\xa8\xa4\xa8\xa2" +
			"\x81\x30\x89\x39\x81\x30\x8a\x30\x84\x31\xa4\x39\x90\x30\x81\x30",
		utf8: "\u0081\u00de\u00df\u00e0\u00e1\u00e2\u00e3\uffff\U00010000",
	},
	{
		e: simplifiedchinese.GB18030,
		encoded: "\xbb\xa8\xbc\xe4\xd2\xbb\xba\xf8\xbe\xc6\xa3\xac\xb6\xc0\xd7\xc3" +
			"\xce\xde\xcf\xe0\xc7\xd7\xa1\xa3",
		utf8: "花间一壶酒,独酌无相亲。",
	},
	{
		e:       simplifiedchinese.GBK,
		encoded: "A\xa1\xa1\xb0\xa1\x81\x40\x81\x80\xaa\x40\xaa\x80\xa8\x40\xa8\x80Z\x80",
		utf8:    "A\u3000\u554a\u4e02\u4e90\u72dc\u7349\u02ca\u2588Z€",
	},
	{
		e: simplifiedchinese.GBK,
		encoded: "\xbb\xa8\xbc\xe4\xd2\xbb\xba\xf8\xbe\xc6\xa3\xac\xb6\xc0\xd7\xc3" +
			"\xce\xde\xcf\xe0\xc7\xd7\xa1\xa3",
		utf8: "花间一壶酒,独酌无相亲。",
	},
	{
		e:       simplifiedchinese.HZGB2312,
		encoded: "A~{\x21\x21~~\x30\x21~}Z~~",
		utf8:    "A\u3000~\u554aZ~",
	},
	{
		e:         simplifiedchinese.HZGB2312,
		encPrefix: "~{",
		encoded:   ";(<dR;:x>F#,6@WCN^O`GW!#",
		utf8:      "花间一壶酒,独酌无相亲。",
	},
	{
		e:       traditionalchinese.Big5,
		encoded: "A\x87\x40\x87\x41\x87\x45\xa1\x40\xfe\xfd\xfe\xfeZ\xa3\xe1",
		utf8:    "A\u43f0\u4c32\U00027267\u3000\U0002910d\u79d4Z€",
	},
	{
		e: traditionalchinese.Big5,
		encoded: "\xaa\xe1\xb6\xa1\xa4\x40\xb3\xfd\xb0\x73\xa1\x41\xbf\x57\xb0\x75" +
			"\xb5\x4c\xac\xdb\xbf\xcb\xa1\x43",
		utf8: "花間一壺酒,獨酌無相親。",
	},

	// Japanese tests.
	//
	// "A。カ゚ 0208: etc 0212: etc" is a nonsense string that contains ASCII, half-width
	// kana, JIS X 0208 (including two near the kink in the Shift JIS second byte
	// encoding) and JIS X 0212 encodable codepoints.
	//
	// "月日は百代の過客にして、行かふ年も又旅人也。" is from the 17th century poem
	// "Oku no Hosomichi" and contains both hiragana and kanji.
	{
		e: japanese.EUCJP,
		encoded: "A\x8e\xa1\x8e\xb6\x8e\xdf " +
			"0208: \xa1\xa1\xa1\xa2\xa1\xdf\xa1\xe0\xa1\xfd\xa1\xfe\xa2\xa1\xa2\xa2\xf4\xa6 " +
			"0212: \x8f\xa2\xaf\x8f\xed\xe3",
		utf8: "A。カ゚ " +
			"0208: \u3000\u3001\u00d7\u00f7\u25ce\u25c7\u25c6\u25a1\u7199 " +
			"0212: \u02d8\u9fa5",
	},
	{
		e: japanese.EUCJP,
		encoded: "\xb7\xee\xc6\xfc\xa4\xcf\xc9\xb4\xc2\xe5\xa4\xce\xb2\xe1\xb5\xd2" +
			"\xa4\xcb\xa4\xb7\xa4\xc6\xa1\xa2\xb9\xd4\xa4\xab\xa4\xd5\xc7\xaf" +
			"\xa4\xe2\xcb\xf4\xce\xb9\xbf\xcd\xcc\xe9\xa1\xa3",
		utf8: "月日は百代の過客にして、行かふ年も又旅人也。",
	},
	{
		e:         japanese.ISO2022JP,
		encSuffix: "\x1b\x28\x42",
		encoded: "\x1b\x28\x49\x21\x36\x5f\x1b\x28\x42 " +
			"0208: \x1b\x24\x42\x21\x21\x21\x22\x21\x5f\x21\x60\x21\x7d\x21\x7e\x22\x21\x22\x22\x74\x26",
		utf8: "。カ゚ " +
			"0208: \u3000\u3001\u00d7\u00f7\u25ce\u25c7\u25c6\u25a1\u7199",
	},
	{
		e:         japanese.ISO2022JP,
		encPrefix: "\x1b\x24\x42",
		encSuffix: "\x1b\x28\x42",
		encoded: "\x37\x6e\x46\x7c\x24\x4f\x49\x34\x42\x65\x24\x4e\x32\x61\x35\x52" +
			"\x24\x4b\x24\x37\x24\x46\x21\x22\x39\x54\x24\x2b\x24\x55\x47\x2f" +
			"\x24\x62\x4b\x74\x4e\x39\x3f\x4d\x4c\x69\x21\x23",
		utf8: "月日は百代の過客にして、行かふ年も又旅人也。",
	},
	{
		e: japanese.ShiftJIS,
		encoded: "A\xa1\xb6\xdf " +
			"0208: \x81\x40\x81\x41\x81\x7e\x81\x80\x81\x9d\x81\x9e\x81\x9f\x81\xa0\xea\xa4",
		utf8: "A。カ゚ " +
			"0208: \u3000\u3001\u00d7\u00f7\u25ce\u25c7\u25c6\u25a1\u7199",
	},
	{
		e: japanese.ShiftJIS,
		encoded: "\x8c\x8e\x93\xfa\x82\xcd\x95\x53\x91\xe3\x82\xcc\x89\xdf\x8b\x71" +
			"\x82\xc9\x82\xb5\x82\xc4\x81\x41\x8d\x73\x82\xa9\x82\xd3\x94\x4e" +
			"\x82\xe0\x96\x94\x97\xb7\x90\x6c\x96\xe7\x81\x42",
		utf8: "月日は百代の過客にして、行かふ年も又旅人也。",
	},

	// Korean tests.
	//
	// "A\uac02\uac35\uac56\ud401B\ud408\ud620\ud624C\u4f3d\u8a70D" is a
	// nonsense string that contains ASCII, Hangul and CJK ideographs.
	//
	// "세계야, 안녕" translates as "Hello, world".
	{
		e:       korean.EUCKR,
		encoded: "A\x81\x41\x81\x61\x81\x81\xc6\xfeB\xc7\xa1\xc7\xfe\xc8\xa1C\xca\xa1\xfd\xfeD",
		utf8:    "A\uac02\uac35\uac56\ud401B\ud408\ud620\ud624C\u4f3d\u8a70D",
	},
	{
		e:       korean.EUCKR,
		encoded: "\xbc\xbc\xb0\xe8\xbe\xdf\x2c\x20\xbe\xc8\xb3\xe7",
		utf8:    "세계야, 안녕",
	},
}
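
// TestBasics below drives every case in basicTestCases through Transform and
// the String helper. As a minimal, self-contained sketch of the same API on a
// single case (the TestSketch name and constants here are illustrative
// additions, not upstream cases), one charmap entry can be round-tripped
// directly through the Decoder and Encoder String convenience methods:
func TestSketchWindows1252RoundTrip(t *testing.T) {
	const encoded, decoded = "H\xe9ll\xf4", "Héllô"
	u, err := charmap.Windows1252.NewDecoder().String(encoded)
	if err != nil || u != decoded {
		t.Fatalf("decode: got %q, %v; want %q", u, err, decoded)
	}
	b, err := charmap.Windows1252.NewEncoder().String(u)
	if err != nil || b != encoded {
		t.Fatalf("encode: got %q, %v; want %q", b, err, encoded)
	}
}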

func TestBasics(t *testing.T) {
	for _, tc := range basicTestCases {
		for _, direction := range []string{"Decode", "Encode"} {
			var coder Transcoder
			var want, src, wPrefix, sPrefix, wSuffix, sSuffix string
			if direction == "Decode" {
				coder, want, src = tc.e.NewDecoder(), tc.utf8, tc.encoded
				wPrefix, sPrefix, wSuffix, sSuffix = "", tc.encPrefix, "", tc.encSuffix
			} else {
				coder, want, src = tc.e.NewEncoder(), tc.encoded, tc.utf8
				wPrefix, sPrefix, wSuffix, sSuffix = tc.encPrefix, "", tc.encSuffix, ""
			}

			dst := make([]byte, len(wPrefix)+len(want)+len(wSuffix))
			nDst, nSrc, err := coder.Transform(dst, []byte(sPrefix+src+sSuffix), true)
			if err != nil {
				t.Errorf("%v: %s: %v", tc.e, direction, err)
				continue
			}
			if nDst != len(wPrefix)+len(want)+len(wSuffix) {
				t.Errorf("%v: %s: nDst got %d, want %d",
					tc.e, direction, nDst, len(wPrefix)+len(want)+len(wSuffix))
				continue
			}
			if nSrc != len(sPrefix)+len(src)+len(sSuffix) {
				t.Errorf("%v: %s: nSrc got %d, want %d",
					tc.e, direction, nSrc, len(sPrefix)+len(src)+len(sSuffix))
				continue
			}
			if got := string(dst); got != wPrefix+want+wSuffix {
				t.Errorf("%v: %s:\ngot %q\nwant %q",
					tc.e, direction, got, wPrefix+want+wSuffix)
				continue
			}

			for _, n := range []int{0, 1, 2, 10, 123, 4567} {
				input := sPrefix + strings.Repeat(src, n) + sSuffix
				g, err := coder.String(input)
				if err != nil {
					t.Errorf("%v: %s: Bytes: n=%d: %v", tc.e, direction, n, err)
					continue
				}
				if len(g) == 0 && len(input) == 0 {
					// If the input is empty then the output can be empty,
					// regardless of whatever wPrefix is.
					continue
				}
				got1, want1 := string(g), wPrefix+strings.Repeat(want, n)+wSuffix
				if got1 != want1 {
					t.Errorf("%v: %s: ReadAll: n=%d\ngot %q\nwant %q",
						tc.e, direction, n, trim(got1), trim(want1))
					continue
				}
			}
		}
	}
}

// TestBig5CircumflexAndMacron tests the special cases listed in
// http://encoding.spec.whatwg.org/#big5
// Note that these special cases aren't preserved by round-tripping through
// decoding and encoding (since
// http://encoding.spec.whatwg.org/index-big5.txt does not have an entry for
// U+0304 or U+030C), so we can't test this in TestBasics.
func TestBig5CircumflexAndMacron(t *testing.T) {
	src := "\x88\x5f\x88\x60\x88\x61\x88\x62\x88\x63\x88\x64\x88\x65\x88\x66 " +
		"\x88\xa2\x88\xa3\x88\xa4\x88\xa5\x88\xa6"
	want := "ÓǑÒ\u00ca\u0304Ế\u00ca\u030cỀÊ " +
		"ü\u00ea\u0304ế\u00ea\u030cề"
	dst, err := ioutil.ReadAll(transform.NewReader(
		strings.NewReader(src), traditionalchinese.Big5.NewDecoder()))
	if err != nil {
		t.Fatal(err)
	}
	if got := string(dst); got != want {
		t.Fatalf("\ngot %q\nwant %q", got, want)
	}
}
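
// TestEncodeInvalidUTF8 below feeds the replacing encoder chunk by chunk
// through Transform. A hedged one-shot sketch of the same wrapper via the
// String helper (this function is an illustrative addition, not an upstream
// case): an unencodable rune becomes '\x1a' while encodable runes pass
// through unchanged.
func TestSketchReplaceUnsupported(t *testing.T) {
	enc := encoding.ReplaceUnsupported(charmap.Windows1252.NewEncoder())
	got, err := enc.String("a€\uAC00b") // U+AC00 is not Windows-1252 encodable.
	if err != nil {
		t.Fatal(err)
	}
	if want := "a\x80\x1ab"; got != want {
		t.Fatalf("got %q; want %q", got, want)
	}
}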

func TestEncodeInvalidUTF8(t *testing.T) {
	inputs := []string{
		"hello.",
		"wo\ufffdld.",
		"ABC\xff\x80\x80", // Invalid UTF-8.
		"\x80\x80\x80\x80\x80",
		"\x80\x80D\x80\x80",          // Valid rune at "D".
		"E\xed\xa0\x80\xed\xbf\xbfF", // Two invalid UTF-8 runes (surrogates).
		"G",
		"H\xe2\x82",     // U+20AC in UTF-8 is "\xe2\x82\xac", which we split over two
		"\xacI\xe2\x82", // input lines. It maps to 0x80 in the Windows-1252 encoding.
	}
	// Each invalid source byte becomes '\x1a'.
	want := strings.Replace("hello.wo?ld.ABC??????????D??E??????FGH\x80I??", "?", "\x1a", -1)

	transformer := encoding.ReplaceUnsupported(charmap.Windows1252.NewEncoder())
	gotBuf := make([]byte, 0, 1024)
	src := make([]byte, 0, 1024)
	for i, input := range inputs {
		dst := make([]byte, 1024)
		src = append(src, input...)
		atEOF := i == len(inputs)-1
		nDst, nSrc, err := transformer.Transform(dst, src, atEOF)
		gotBuf = append(gotBuf, dst[:nDst]...)
		src = src[nSrc:]
		if err != nil && err != transform.ErrShortSrc {
			t.Fatalf("i=%d: %v", i, err)
		}
		if atEOF && err != nil {
			t.Fatalf("i=%d: atEOF: %v", i, err)
		}
	}
	if got := string(gotBuf); got != want {
		t.Fatalf("\ngot %+q\nwant %+q", got, want)
	}
}

func TestReplacement(t *testing.T) {
	for _, direction := range []string{"Decode", "Encode"} {
		enc, want := (transform.Transformer)(nil), ""
		if direction == "Decode" {
			enc = encoding.Replacement.NewDecoder()
			want = "\ufffd"
		} else {
			enc = encoding.Replacement.NewEncoder()
			want = "AB\x00CD\ufffdYZ"
		}
		sr := strings.NewReader("AB\x00CD\x80YZ")
		g, err := ioutil.ReadAll(transform.NewReader(sr, enc))
		if err != nil {
			t.Errorf("%s: ReadAll: %v", direction, err)
			continue
		}
		if got := string(g); got != want {
			t.Errorf("%s:\ngot %q\nwant %q", direction, got, want)
			continue
		}
	}
}

func TestUTF8Validator(t *testing.T) {
	testCases := []struct {
		desc    string
		dstSize int
		src     string
		atEOF   bool
		want    string
		wantErr error
	}{
		{
			"empty input",
			100,
			"",
			false,
			"",
			nil,
		},
		{
			"valid 1-byte 1-rune input",
			100,
			"a",
			false,
			"a",
			nil,
		},
		{
			"valid 3-byte 1-rune input",
			100,
			"\u1234",
			false,
			"\u1234",
			nil,
		},
		{
			"valid 5-byte 3-rune input",
			100,
			"a\u0100\u0101",
			false,
			"a\u0100\u0101",
			nil,
		},
		{
			"perfectly sized dst (non-ASCII)",
			5,
			"a\u0100\u0101",
			false,
			"a\u0100\u0101",
			nil,
		},
		{
			"short dst (non-ASCII)",
			4,
			"a\u0100\u0101",
			false,
			"a\u0100",
			transform.ErrShortDst,
		},
		{
			"perfectly sized dst (ASCII)",
			5,
			"abcde",
			false,
			"abcde",
			nil,
		},
		{
			"short dst (ASCII)",
			4,
			"abcde",
			false,
			"abcd",
			transform.ErrShortDst,
		},
		{
			"partial input (!EOF)",
			100,
			"a\u0100\xf1",
			false,
			"a\u0100",
			transform.ErrShortSrc,
		},
		{
			"invalid input (EOF)",
			100,
			"a\u0100\xf1",
			true,
			"a\u0100",
			encoding.ErrInvalidUTF8,
		},
		{
			"invalid input (!EOF)",
			100,
			"a\u0100\x80",
			false,
			"a\u0100",
			encoding.ErrInvalidUTF8,
		},
		{
			"invalid input (above U+10FFFF)",
			100,
			"a\u0100\xf7\xbf\xbf\xbf",
			false,
			"a\u0100",
			encoding.ErrInvalidUTF8,
		},
		{
			"invalid input (surrogate half)",
			100,
			"a\u0100\xed\xa0\x80",
			false,
			"a\u0100",
			encoding.ErrInvalidUTF8,
		},
	}
	for _, tc := range testCases {
		dst := make([]byte, tc.dstSize)
		nDst, nSrc, err := encoding.UTF8Validator.Transform(dst, []byte(tc.src), tc.atEOF)
		if nDst < 0 || len(dst) < nDst {
			t.Errorf("%s: nDst=%d out of range", tc.desc, nDst)
			continue
		}
		got := string(dst[:nDst])
		if got != tc.want || nSrc != len(tc.want) || err != tc.wantErr {
			t.Errorf("%s:\ngot %+q, %d, %v\nwant %+q, %d, %v",
				tc.desc, got, nSrc, err, tc.want, len(tc.want), tc.wantErr)
			continue
		}
	}
}
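
// encoding.UTF8Validator, exercised case by case above, is an ordinary
// transform.Transformer. A hedged stand-alone sketch of a single call (an
// illustrative addition, not an upstream case): the valid prefix is copied
// and the first bad byte stops the transform with ErrInvalidUTF8.
func TestSketchUTF8Validator(t *testing.T) {
	dst := make([]byte, 16)
	nDst, nSrc, err := encoding.UTF8Validator.Transform(dst, []byte("ok\x80"), true)
	if err != encoding.ErrInvalidUTF8 || string(dst[:nDst]) != "ok" || nSrc != 2 {
		t.Fatalf("got %q, %d, %v; want %q, %d, %v",
			dst[:nDst], nSrc, err, "ok", 2, encoding.ErrInvalidUTF8)
	}
}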

var (
	utf16LEIB = unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM) // UTF-16LE (atypical interpretation)
	utf16LEUB = unicode.UTF16(unicode.LittleEndian, unicode.UseBOM)    // UTF-16, LE
	utf16LEEB = unicode.UTF16(unicode.LittleEndian, unicode.ExpectBOM) // UTF-16, LE, Expect
	utf16BEIB = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)    // UTF-16BE (atypical interpretation)
	utf16BEUB = unicode.UTF16(unicode.BigEndian, unicode.UseBOM)       // UTF-16 default
	utf16BEEB = unicode.UTF16(unicode.BigEndian, unicode.ExpectBOM)    // UTF-16 Expect
)
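
// TestUTF16 below pins down the BOM handling mandated by RFC 2781 case by
// case. A hedged one-shot sketch of the difference between the UseBOM and
// IgnoreBOM encoders declared above (an illustrative addition, not an
// upstream case):
func TestSketchUTF16BOM(t *testing.T) {
	withBOM, err := utf16BEUB.NewEncoder().String("A")
	if err != nil || withBOM != "\xFE\xFF\x00A" {
		t.Fatalf("UseBOM: got %q, %v", withBOM, err)
	}
	noBOM, err := utf16BEIB.NewEncoder().String("A")
	if err != nil || noBOM != "\x00A" {
		t.Fatalf("IgnoreBOM: got %q, %v", noBOM, err)
	}
}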

func TestUTF16(t *testing.T) {
	testCases := []struct {
		desc    string
		src     string
		notEOF  bool // the inverse of atEOF
		sizeDst int
		want    string
		nSrc    int
		err     error
		t       transform.Transformer
	}{{
		desc:    "utf-16 dec: BOM determines encoding BE (RFC 2781:3.3)",
		src:     "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
		sizeDst: 100,
		want:    "\U00012345=Ra",
		nSrc:    12,
		t:       utf16BEUB.NewDecoder(),
	}, {
		desc:    "utf-16 dec: BOM determines encoding LE (RFC 2781:3.3)",
		src:     "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
		sizeDst: 100,
		want:    "\U00012345=Ra",
		nSrc:    12,
		t:       utf16LEUB.NewDecoder(),
	}, {
		desc:    "utf-16 dec: BOM determines encoding LE, change default (RFC 2781:3.3)",
		src:     "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
		sizeDst: 100,
		want:    "\U00012345=Ra",
		nSrc:    12,
		t:       utf16BEUB.NewDecoder(),
	}, {
		desc:    "utf-16 dec: Fail on missing BOM when required",
		src:     "\x08\xD8\x45\xDF\x3D\x00\xFF\xFE\xFE\xFF\x00\x52\x00\x61",
		sizeDst: 100,
		want:    "",
		nSrc:    0,
		err:     unicode.ErrMissingBOM,
		t:       utf16BEEB.NewDecoder(),
	}, {
		desc:    "utf-16 dec: SHOULD interpret text as big-endian when BOM not present (RFC 2781:4.3)",
		src:     "\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
		sizeDst: 100,
		want:    "\U00012345=Ra",
		nSrc:    10,
		t:       utf16BEUB.NewDecoder(),
	}, {
		// This is an error according to RFC 2781. But errors in RFC 2781 are
		// open to interpretations, so I guess this is fine.
		desc:    "utf-16le dec: incorrect BOM is an error (RFC 2781:4.1)",
		src:     "\xFE\xFF\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
		sizeDst: 100,
		want:    "\uFFFE\U00012345=Ra",
		nSrc:    12,
		t:       utf16LEIB.NewDecoder(),
	}, {
		desc:    "utf-16 enc: SHOULD write BOM (RFC 2781:3.3)",
		src:     "\U00012345=Ra",
		sizeDst: 100,
		want:    "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
		nSrc:    7,
		t:       utf16LEUB.NewEncoder(),
	}, {
		desc:    "utf-16 enc: SHOULD write BOM (RFC 2781:3.3)",
		src:     "\U00012345=Ra",
		sizeDst: 100,
		want:    "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
		nSrc:    7,
		t:       utf16BEUB.NewEncoder(),
	}, {
		desc:    "utf-16le enc: MUST NOT write BOM (RFC 2781:3.3)",
		src:     "\U00012345=Ra",
		sizeDst: 100,
		want:    "\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
		nSrc:    7,
		t:       utf16LEIB.NewEncoder(),
	}, {
		desc:    "utf-16be dec: incorrect UTF-16: odd bytes",
		src:     "\x00",
		sizeDst: 100,
		want:    "\uFFFD",
		nSrc:    1,
		t:       utf16BEIB.NewDecoder(),
	}, {
		desc:    "utf-16be dec: unpaired surrogate, odd bytes",
		src:     "\xD8\x45\x00",
		sizeDst: 100,
		want:    "\uFFFD\uFFFD",
		nSrc:    3,
		t:       utf16BEIB.NewDecoder(),
	}, {
		desc:    "utf-16be dec: unpaired low surrogate + valid text",
		src:     "\xD8\x45\x00a",
		sizeDst: 100,
		want:    "\uFFFDa",
		nSrc:    4,
		t:       utf16BEIB.NewDecoder(),
	}, {
		desc:    "utf-16be dec: unpaired low surrogate + valid text + single byte",
		src:     "\xD8\x45\x00ab",
		sizeDst: 100,
		want:    "\uFFFDa\uFFFD",
		nSrc:    5,
		t:       utf16BEIB.NewDecoder(),
	}, {
		desc:    "utf-16le dec: unpaired high surrogate",
		src:     "\x00\x00\x00\xDC\x12\xD8",
		sizeDst: 100,
		want:    "\x00\uFFFD\uFFFD",
		nSrc:    6,
		t:       utf16LEIB.NewDecoder(),
	}, {
		desc:    "utf-16be dec: two unpaired low surrogates",
		src:     "\xD8\x45\xD8\x12",
		sizeDst: 100,
		want:    "\uFFFD\uFFFD",
		nSrc:    4,
		t:       utf16BEIB.NewDecoder(),
	}, {
		desc:    "utf-16be dec: short dst",
		src:     "\x00a",
		sizeDst: 0,
		want:    "",
		nSrc:    0,
		t:       utf16BEIB.NewDecoder(),
		err:     transform.ErrShortDst,
	}, {
		desc:    "utf-16be dec: short dst surrogate",
		src:     "\xD8\xF5\xDC\x12",
		sizeDst: 3,
		want:    "",
		nSrc:    0,
		t:       utf16BEIB.NewDecoder(),
		err:     transform.ErrShortDst,
	}, {
		desc:    "utf-16be dec: short dst trailing byte",
		src:     "\x00",
		sizeDst: 2,
		want:    "",
		nSrc:    0,
		t:       utf16BEIB.NewDecoder(),
		err:     transform.ErrShortDst,
	}, {
		desc:    "utf-16be dec: short src",
		src:     "\x00",
		notEOF:  true,
		sizeDst: 3,
		want:    "",
		nSrc:    0,
		t:       utf16BEIB.NewDecoder(),
		err:     transform.ErrShortSrc,
	}, {
		desc:    "utf-16 enc",
		src:     "\U00012345=Ra",
		sizeDst: 100,
		want:    "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
		nSrc:    7,
		t:       utf16BEUB.NewEncoder(),
	}, {
		desc:    "utf-16 enc: short dst normal",
		src:     "\U00012345=Ra",
		sizeDst: 9,
		want:    "\xD8\x08\xDF\x45\x00\x3D\x00\x52",
		nSrc:    6,
		t:       utf16BEIB.NewEncoder(),
		err:     transform.ErrShortDst,
	}, {
		desc:    "utf-16 enc: short dst surrogate",
		src:     "\U00012345=Ra",
		sizeDst: 3,
		want:    "",
		nSrc:    0,
		t:       utf16BEIB.NewEncoder(),
		err:     transform.ErrShortDst,
	}, {
		desc:    "utf-16 enc: short src",
		src:     "\U00012345=Ra\xC2",
		notEOF:  true,
		sizeDst: 100,
		want:    "\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
		nSrc:    7,
		t:       utf16BEIB.NewEncoder(),
		err:     transform.ErrShortSrc,
	}, {
		desc:    "utf-16be dec: don't change byte order mid-stream",
		src:     "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\xFF\xFE\x00\x52\x00\x61",
		sizeDst: 100,
		want:    "\U00012345=\ufffeRa",
		nSrc:    14,
		t:       utf16BEUB.NewDecoder(),
	}, {
		desc:    "utf-16le dec: don't change byte order mid-stream",
		src:     "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\xFF\xFE\xFE\xFF\x52\x00\x61\x00",
		sizeDst: 100,
		want:    "\U00012345=\ufeff\ufffeRa",
		nSrc:    16,
		t:       utf16LEUB.NewDecoder(),
	}}
	for i, tc := range testCases {
		b := make([]byte, tc.sizeDst)
		nDst, nSrc, err := tc.t.Transform(b, []byte(tc.src), !tc.notEOF)
		if err != tc.err {
			t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
		}
		if got := string(b[:nDst]); got != tc.want {
			t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
		}
		if nSrc != tc.nSrc {
			t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
		}
	}
}
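
// TestErrorHandler below drives both error-handling wrappers through
// Transform with explicit buffers. A hedged one-shot sketch of the
// HTML-escaping wrapper via the String helper (an illustrative addition, not
// an upstream case): unsupported runes become decimal character references.
func TestSketchHTMLEscapeUnsupported(t *testing.T) {
	enc := encoding.HTMLEscapeUnsupported(charmap.Windows1250.NewEncoder())
	got, err := enc.String("a\uAC00b") // U+AC00 is not Windows-1250 encodable.
	if err != nil {
		t.Fatal(err)
	}
	if want := "a&#44032;b"; got != want {
		t.Fatalf("got %q; want %q", got, want)
	}
}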

func TestErrorHandler(t *testing.T) {
	testCases := []struct {
		desc      string
		handler   func(*encoding.Encoder) *encoding.Encoder
		sizeDst   int
		src, want string
		nSrc      int
		err       error
	}{
		{
			desc:    "one rune replacement",
			handler: encoding.ReplaceUnsupported,
			sizeDst: 100,
			src:     "\uAC00",
			want:    "\x1a",
			nSrc:    3,
		},
		{
			desc:    "mid-stream rune replacement",
			handler: encoding.ReplaceUnsupported,
			sizeDst: 100,
			src:     "a\uAC00bcd\u00e9",
			want:    "a\x1abcd\xe9",
			nSrc:    9,
		},
		{
			desc:    "at end rune replacement",
			handler: encoding.ReplaceUnsupported,
			sizeDst: 10,
			src:     "\u00e9\uAC00",
			want:    "\xe9\x1a",
			nSrc:    5,
		},
		{
			desc:    "short buffer replacement",
			handler: encoding.ReplaceUnsupported,
			sizeDst: 1,
			src:     "\u00e9\uAC00",
			want:    "\xe9",
			nSrc:    2,
			err:     transform.ErrShortDst,
		},
		{
			desc:    "one rune html escape",
			handler: encoding.HTMLEscapeUnsupported,
			sizeDst: 100,
			src:     "\uAC00",
			want:    "&#44032;",
			nSrc:    3,
		},
		{
			desc:    "mid-stream html escape",
			handler: encoding.HTMLEscapeUnsupported,
			sizeDst: 100,
			src:     "\u00e9\uAC00dcba",
			want:    "\xe9&#44032;dcba",
			nSrc:    9,
		},
		{
			desc:    "short buffer html escape",
			handler: encoding.HTMLEscapeUnsupported,
			sizeDst: 9,
			src:     "ab\uAC01",
			want:    "ab",
			nSrc:    2,
			err:     transform.ErrShortDst,
		},
	}
	for i, tc := range testCases {
		tr := tc.handler(charmap.Windows1250.NewEncoder())
		b := make([]byte, tc.sizeDst)
		nDst, nSrc, err := tr.Transform(b, []byte(tc.src), true)
		if err != tc.err {
			t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
		}
		if got := string(b[:nDst]); got != tc.want {
			t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
		}
		if nSrc != tc.nSrc {
			t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
		}

	}
}

func TestBOMOverride(t *testing.T) {
	dec := unicode.BOMOverride(charmap.CodePage437.NewDecoder())
	dst := make([]byte, 100)
	for i, tc := range []struct {
		src   string
		atEOF bool
		dst   string
		nSrc  int
		err   error
	}{
		0:  {"H\x82ll\x93", true, "Héllô", 5, nil},
		1:  {"\uFEFFHéllö", true, "Héllö", 10, nil},
		2:  {"\xFE\xFF\x00H\x00e\x00l\x00l\x00o", true, "Hello", 12, nil},
		3:  {"\xFF\xFEH\x00e\x00l\x00l\x00o\x00", true, "Hello", 12, nil},
		4:  {"\uFEFF", true, "", 3, nil},
		5:  {"\xFE\xFF", true, "", 2, nil},
		6:  {"\xFF\xFE", true, "", 2, nil},
		7:  {"\xEF\xBB", true, "\u2229\u2557", 2, nil},
		8:  {"\xEF", true, "\u2229", 1, nil},
		9:  {"", true, "", 0, nil},
		10: {"\xFE", true, "\u25a0", 1, nil},
		11: {"\xFF", true, "\u00a0", 1, nil},
		12: {"\xEF\xBB", false, "", 0, transform.ErrShortSrc},
		13: {"\xEF", false, "", 0, transform.ErrShortSrc},
		14: {"", false, "", 0, transform.ErrShortSrc},
		15: {"\xFE", false, "", 0, transform.ErrShortSrc},
		16: {"\xFF", false, "", 0, transform.ErrShortSrc},
	} {
		dec.Reset()
		nDst, nSrc, err := dec.Transform(dst, []byte(tc.src), tc.atEOF)
		got := string(dst[:nDst])
		if nSrc != tc.nSrc {
			t.Errorf("%d: nSrc: got %d; want %d", i, nSrc, tc.nSrc)
		}
		if got != tc.dst {
			t.Errorf("%d: got %+q; want %+q", i, got, tc.dst)
		}
		if err != tc.err {
			t.Errorf("%d: error: got %v; want %v", i, err, tc.err)
		}
	}
}

// testdataFiles are files in testdata/*.txt.
var testdataFiles = []struct {
	enc           encoding.Encoding
	basename, ext string
}{
	{charmap.Windows1252, "candide", "windows-1252"},
	{japanese.EUCJP, "rashomon", "euc-jp"},
	{japanese.ISO2022JP, "rashomon", "iso-2022-jp"},
	{japanese.ShiftJIS, "rashomon", "shift-jis"},
	{korean.EUCKR, "unsu-joh-eun-nal", "euc-kr"},
	{simplifiedchinese.GBK, "sunzi-bingfa-simplified", "gbk"},
	{simplifiedchinese.HZGB2312, "sunzi-bingfa-gb-levels-1-and-2", "hz-gb2312"},
	{traditionalchinese.Big5, "sunzi-bingfa-traditional", "big5"},
	{utf16LEIB, "candide", "utf-16le"},
	{unicode.UTF8, "candide", "utf-8"},

	// GB18030 is a superset of GBK and is nominally a Simplified Chinese
	// encoding, but it can also represent the entire Basic Multilingual
	// Plane, including codepoints like 'â' that aren't encodable by GBK.
	// GB18030 on Simplified Chinese should perform similarly to GBK on
	// Simplified Chinese. GB18030 on "candide" is more interesting.
	{simplifiedchinese.GB18030, "candide", "gb18030"},
}

// Transcoder is satisfied by both an encoding.Encoder and an encoding.Decoder.
type Transcoder interface {
	transform.Transformer
	Bytes([]byte) ([]byte, error)
	String(string) (string, error)
}

func load(direction string, enc encoding.Encoding) ([]byte, []byte, Transcoder, error) {
	basename, ext, count := "", "", 0
	for _, tf := range testdataFiles {
		if tf.enc == enc {
			basename, ext = tf.basename, tf.ext
			count++
		}
	}
	if count != 1 {
		if count == 0 {
			return nil, nil, nil, fmt.Errorf("no testdataFiles for %s", enc)
		}
		return nil, nil, nil, fmt.Errorf("too many testdataFiles for %s", enc)
	}
	dstFile := fmt.Sprintf("testdata/%s-%s.txt", basename, ext)
	srcFile := fmt.Sprintf("testdata/%s-utf-8.txt", basename)
	var coder Transcoder = encoding.ReplaceUnsupported(enc.NewEncoder())
	if direction == "Decode" {
		dstFile, srcFile = srcFile, dstFile
		coder = enc.NewDecoder()
	}
	dst, err := ioutil.ReadFile(dstFile)
	if err != nil {
		return nil, nil, nil, err
	}
	src, err := ioutil.ReadFile(srcFile)
	if err != nil {
		return nil, nil, nil, err
	}
	return dst, src, coder, nil
}

func TestFiles(t *testing.T) {
	for _, dir := range []string{"Decode", "Encode"} {
		for _, tf := range testdataFiles {
			dst, src, transformer, err := load(dir, tf.enc)
			if err != nil {
				t.Errorf("%s, %s: load: %v", dir, tf.enc, err)
				continue
			}
			buf, err := transformer.Bytes(src)
			if err != nil {
				t.Errorf("%s, %s: transform: %v", dir, tf.enc, err)
				continue
			}
			if !bytes.Equal(buf, dst) {
				t.Errorf("%s, %s: transformed bytes did not match golden file", dir, tf.enc)
				continue
			}
		}
	}
}

func benchmark(b *testing.B, direction string, enc encoding.Encoding) {
	_, src, transformer, err := load(direction, enc)
	if err != nil {
		b.Fatal(err)
	}
	b.SetBytes(int64(len(src)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		r := transform.NewReader(bytes.NewReader(src), transformer)
		io.Copy(ioutil.Discard, r)
	}
}

func BenchmarkBig5Decoder(b *testing.B)     { benchmark(b, "Decode", traditionalchinese.Big5) }
func BenchmarkBig5Encoder(b *testing.B)     { benchmark(b, "Encode", traditionalchinese.Big5) }
func BenchmarkCharmapDecoder(b *testing.B)  { benchmark(b, "Decode", charmap.Windows1252) }
func BenchmarkCharmapEncoder(b *testing.B)  { benchmark(b, "Encode", charmap.Windows1252) }
func BenchmarkEUCJPDecoder(b *testing.B)    { benchmark(b, "Decode", japanese.EUCJP) }
func BenchmarkEUCJPEncoder(b *testing.B)    { benchmark(b, "Encode", japanese.EUCJP) }
func BenchmarkEUCKRDecoder(b *testing.B)    { benchmark(b, "Decode", korean.EUCKR) }
func BenchmarkEUCKREncoder(b *testing.B)    { benchmark(b, "Encode", korean.EUCKR) }
func BenchmarkGB18030Decoder(b *testing.B)  { benchmark(b, "Decode", simplifiedchinese.GB18030) }
func BenchmarkGB18030Encoder(b *testing.B)  { benchmark(b, "Encode", simplifiedchinese.GB18030) }
func BenchmarkGBKDecoder(b *testing.B)      { benchmark(b, "Decode", simplifiedchinese.GBK) }
func BenchmarkGBKEncoder(b *testing.B)      { benchmark(b, "Encode", simplifiedchinese.GBK) }
func BenchmarkHZGB2312Decoder(b *testing.B) { benchmark(b, "Decode", simplifiedchinese.HZGB2312) }
func BenchmarkHZGB2312Encoder(b *testing.B) { benchmark(b, "Encode", simplifiedchinese.HZGB2312) }
func BenchmarkISO2022JPDecoder(b *testing.B) { benchmark(b, "Decode", japanese.ISO2022JP) }
func BenchmarkISO2022JPEncoder(b *testing.B) { benchmark(b, "Encode", japanese.ISO2022JP) }
func BenchmarkShiftJISDecoder(b *testing.B)  { benchmark(b, "Decode", japanese.ShiftJIS) }
func BenchmarkShiftJISEncoder(b *testing.B)  { benchmark(b, "Encode", japanese.ShiftJIS) }
func BenchmarkUTF8Decoder(b *testing.B)      { benchmark(b, "Decode", unicode.UTF8) }
func BenchmarkUTF8Encoder(b *testing.B)      { benchmark(b, "Encode", unicode.UTF8) }
func BenchmarkUTF16Decoder(b *testing.B)     { benchmark(b, "Decode", utf16LEIB) }
func BenchmarkUTF16Encoder(b *testing.B)     { benchmark(b, "Encode", utf16LEIB) }
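
// benchmark and TestFiles above exercise the read side through
// transform.NewReader and the one-shot Bytes helper. A hedged sketch of the
// two streaming wrappers (the TestSketchStreaming name and literals are
// illustrative additions, not upstream cases): Decoder.Reader for reading and
// transform.NewWriter around an Encoder for writing.
func TestSketchStreaming(t *testing.T) {
	// Read side: wrap an io.Reader so that Windows-1252 bytes come out as UTF-8.
	r := charmap.Windows1252.NewDecoder().Reader(strings.NewReader("H\xe9ll\xf4"))
	got, err := ioutil.ReadAll(r)
	if err != nil || string(got) != "Héllô" {
		t.Fatalf("read side: got %q, %v", got, err)
	}

	// Write side: wrap an io.Writer so that UTF-8 written in comes out as
	// Windows-1252; Close flushes anything still buffered by the transform.
	var buf bytes.Buffer
	w := transform.NewWriter(&buf, charmap.Windows1252.NewEncoder())
	if _, err := io.WriteString(w, "Héllô"); err != nil {
		t.Fatal(err)
	}
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}
	if want := "H\xe9ll\xf4"; buf.String() != want {
		t.Fatalf("write side: got %q; want %q", buf.String(), want)
	}
}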