github.com/flyinox/gosm@v0.0.0-20171117061539-16768cb62077/src/unicode/utf8/utf8_test.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package utf8_test 6 7 import ( 8 "bytes" 9 "testing" 10 "unicode" 11 . "unicode/utf8" 12 ) 13 14 // Validate the constants redefined from unicode. 15 func init() { 16 if MaxRune != unicode.MaxRune { 17 panic("utf8.MaxRune is wrong") 18 } 19 if RuneError != unicode.ReplacementChar { 20 panic("utf8.RuneError is wrong") 21 } 22 } 23 24 // Validate the constants redefined from unicode. 25 func TestConstants(t *testing.T) { 26 if MaxRune != unicode.MaxRune { 27 t.Errorf("utf8.MaxRune is wrong: %x should be %x", MaxRune, unicode.MaxRune) 28 } 29 if RuneError != unicode.ReplacementChar { 30 t.Errorf("utf8.RuneError is wrong: %x should be %x", RuneError, unicode.ReplacementChar) 31 } 32 } 33 34 type Utf8Map struct { 35 r rune 36 str string 37 } 38 39 var utf8map = []Utf8Map{ 40 {0x0000, "\x00"}, 41 {0x0001, "\x01"}, 42 {0x007e, "\x7e"}, 43 {0x007f, "\x7f"}, 44 {0x0080, "\xc2\x80"}, 45 {0x0081, "\xc2\x81"}, 46 {0x00bf, "\xc2\xbf"}, 47 {0x00c0, "\xc3\x80"}, 48 {0x00c1, "\xc3\x81"}, 49 {0x00c8, "\xc3\x88"}, 50 {0x00d0, "\xc3\x90"}, 51 {0x00e0, "\xc3\xa0"}, 52 {0x00f0, "\xc3\xb0"}, 53 {0x00f8, "\xc3\xb8"}, 54 {0x00ff, "\xc3\xbf"}, 55 {0x0100, "\xc4\x80"}, 56 {0x07ff, "\xdf\xbf"}, 57 {0x0400, "\xd0\x80"}, 58 {0x0800, "\xe0\xa0\x80"}, 59 {0x0801, "\xe0\xa0\x81"}, 60 {0x1000, "\xe1\x80\x80"}, 61 {0xd000, "\xed\x80\x80"}, 62 {0xd7ff, "\xed\x9f\xbf"}, // last code point before surrogate half. 63 {0xe000, "\xee\x80\x80"}, // first code point after surrogate half. 64 {0xfffe, "\xef\xbf\xbe"}, 65 {0xffff, "\xef\xbf\xbf"}, 66 {0x10000, "\xf0\x90\x80\x80"}, 67 {0x10001, "\xf0\x90\x80\x81"}, 68 {0x40000, "\xf1\x80\x80\x80"}, 69 {0x10fffe, "\xf4\x8f\xbf\xbe"}, 70 {0x10ffff, "\xf4\x8f\xbf\xbf"}, 71 {0xFFFD, "\xef\xbf\xbd"}, 72 } 73 74 var surrogateMap = []Utf8Map{ 75 {0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1) 76 {0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1) 77 } 78 79 var testStrings = []string{ 80 "", 81 "abcd", 82 "☺☻☹", 83 "日a本b語ç日ð本Ê語þ日¥本¼語i日©", 84 "日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©", 85 "\x80\x80\x80\x80", 86 } 87 88 func TestFullRune(t *testing.T) { 89 for _, m := range utf8map { 90 b := []byte(m.str) 91 if !FullRune(b) { 92 t.Errorf("FullRune(%q) (%U) = false, want true", b, m.r) 93 } 94 s := m.str 95 if !FullRuneInString(s) { 96 t.Errorf("FullRuneInString(%q) (%U) = false, want true", s, m.r) 97 } 98 b1 := b[0 : len(b)-1] 99 if FullRune(b1) { 100 t.Errorf("FullRune(%q) = true, want false", b1) 101 } 102 s1 := string(b1) 103 if FullRuneInString(s1) { 104 t.Errorf("FullRune(%q) = true, want false", s1) 105 } 106 } 107 for _, s := range []string{"\xc0", "\xc1"} { 108 b := []byte(s) 109 if !FullRune(b) { 110 t.Errorf("FullRune(%q) = false, want true", s) 111 } 112 if !FullRuneInString(s) { 113 t.Errorf("FullRuneInString(%q) = false, want true", s) 114 } 115 } 116 } 117 118 func TestEncodeRune(t *testing.T) { 119 for _, m := range utf8map { 120 b := []byte(m.str) 121 var buf [10]byte 122 n := EncodeRune(buf[0:], m.r) 123 b1 := buf[0:n] 124 if !bytes.Equal(b, b1) { 125 t.Errorf("EncodeRune(%#04x) = %q want %q", m.r, b1, b) 126 } 127 } 128 } 129 130 func TestDecodeRune(t *testing.T) { 131 for _, m := range utf8map { 132 b := []byte(m.str) 133 r, size := DecodeRune(b) 134 if r != m.r || size != len(b) { 135 t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b)) 136 } 137 s := m.str 138 r, size = DecodeRuneInString(s) 139 if r != m.r || size != len(b) { 140 t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b)) 141 } 142 143 // there's an extra byte that bytes left behind - make sure trailing byte works 144 r, size = DecodeRune(b[0:cap(b)]) 145 if r != m.r || size != len(b) { 146 t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b)) 147 } 148 s = m.str + "\x00" 149 r, size = DecodeRuneInString(s) 150 if r != m.r || size != len(b) { 151 t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b)) 152 } 153 154 // make sure missing bytes fail 155 wantsize := 1 156 if wantsize >= len(b) { 157 wantsize = 0 158 } 159 r, size = DecodeRune(b[0 : len(b)-1]) 160 if r != RuneError || size != wantsize { 161 t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b[0:len(b)-1], r, size, RuneError, wantsize) 162 } 163 s = m.str[0 : len(m.str)-1] 164 r, size = DecodeRuneInString(s) 165 if r != RuneError || size != wantsize { 166 t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, wantsize) 167 } 168 169 // make sure bad sequences fail 170 if len(b) == 1 { 171 b[0] = 0x80 172 } else { 173 b[len(b)-1] = 0x7F 174 } 175 r, size = DecodeRune(b) 176 if r != RuneError || size != 1 { 177 t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, RuneError, 1) 178 } 179 s = string(b) 180 r, size = DecodeRuneInString(s) 181 if r != RuneError || size != 1 { 182 t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, 1) 183 } 184 185 } 186 } 187 188 func TestDecodeSurrogateRune(t *testing.T) { 189 for _, m := range surrogateMap { 190 b := []byte(m.str) 191 r, size := DecodeRune(b) 192 if r != RuneError || size != 1 { 193 t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1) 194 } 195 s := m.str 196 r, size = DecodeRuneInString(s) 197 if r != RuneError || size != 1 { 198 t.Errorf("DecodeRuneInString(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1) 199 } 200 } 201 } 202 203 // Check that DecodeRune and DecodeLastRune correspond to 204 // the equivalent range loop. 205 func TestSequencing(t *testing.T) { 206 for _, ts := range testStrings { 207 for _, m := range utf8map { 208 for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} { 209 testSequence(t, s) 210 } 211 } 212 } 213 } 214 215 // Check that a range loop and a []int conversion visit the same runes. 216 // Not really a test of this package, but the assumption is used here and 217 // it's good to verify 218 func TestIntConversion(t *testing.T) { 219 for _, ts := range testStrings { 220 runes := []rune(ts) 221 if RuneCountInString(ts) != len(runes) { 222 t.Errorf("%q: expected %d runes; got %d", ts, len(runes), RuneCountInString(ts)) 223 break 224 } 225 i := 0 226 for _, r := range ts { 227 if r != runes[i] { 228 t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, i, runes[i], runes[i], r, r) 229 } 230 i++ 231 } 232 } 233 } 234 235 var invalidSequenceTests = []string{ 236 "\xed\xa0\x80\x80", // surrogate min 237 "\xed\xbf\xbf\x80", // surrogate max 238 239 // xx 240 "\x91\x80\x80\x80", 241 242 // s1 243 "\xC2\x7F\x80\x80", 244 "\xC2\xC0\x80\x80", 245 "\xDF\x7F\x80\x80", 246 "\xDF\xC0\x80\x80", 247 248 // s2 249 "\xE0\x9F\xBF\x80", 250 "\xE0\xA0\x7F\x80", 251 "\xE0\xBF\xC0\x80", 252 "\xE0\xC0\x80\x80", 253 254 // s3 255 "\xE1\x7F\xBF\x80", 256 "\xE1\x80\x7F\x80", 257 "\xE1\xBF\xC0\x80", 258 "\xE1\xC0\x80\x80", 259 260 //s4 261 "\xED\x7F\xBF\x80", 262 "\xED\x80\x7F\x80", 263 "\xED\x9F\xC0\x80", 264 "\xED\xA0\x80\x80", 265 266 // s5 267 "\xF0\x8F\xBF\xBF", 268 "\xF0\x90\x7F\xBF", 269 "\xF0\x90\x80\x7F", 270 "\xF0\xBF\xBF\xC0", 271 "\xF0\xBF\xC0\x80", 272 "\xF0\xC0\x80\x80", 273 274 // s6 275 "\xF1\x7F\xBF\xBF", 276 "\xF1\x80\x7F\xBF", 277 "\xF1\x80\x80\x7F", 278 "\xF1\xBF\xBF\xC0", 279 "\xF1\xBF\xC0\x80", 280 "\xF1\xC0\x80\x80", 281 282 // s7 283 "\xF4\x7F\xBF\xBF", 284 "\xF4\x80\x7F\xBF", 285 "\xF4\x80\x80\x7F", 286 "\xF4\x8F\xBF\xC0", 287 "\xF4\x8F\xC0\x80", 288 "\xF4\x90\x80\x80", 289 } 290 291 func runtimeDecodeRune(s string) rune { 292 for _, r := range s { 293 return r 294 } 295 return -1 296 } 297 298 func TestDecodeInvalidSequence(t *testing.T) { 299 for _, s := range invalidSequenceTests { 300 r1, _ := DecodeRune([]byte(s)) 301 if want := RuneError; r1 != want { 302 t.Errorf("DecodeRune(%#x) = %#04x, want %#04x", s, r1, want) 303 return 304 } 305 r2, _ := DecodeRuneInString(s) 306 if want := RuneError; r2 != want { 307 t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s, r2, want) 308 return 309 } 310 if r1 != r2 { 311 t.Errorf("DecodeRune(%#x) = %#04x mismatch with DecodeRuneInString(%q) = %#04x", s, r1, s, r2) 312 return 313 } 314 r3 := runtimeDecodeRune(s) 315 if r2 != r3 { 316 t.Errorf("DecodeRuneInString(%q) = %#04x mismatch with runtime.decoderune(%q) = %#04x", s, r2, s, r3) 317 return 318 } 319 } 320 } 321 322 func testSequence(t *testing.T, s string) { 323 type info struct { 324 index int 325 r rune 326 } 327 index := make([]info, len(s)) 328 b := []byte(s) 329 si := 0 330 j := 0 331 for i, r := range s { 332 if si != i { 333 t.Errorf("Sequence(%q) mismatched index %d, want %d", s, si, i) 334 return 335 } 336 index[j] = info{i, r} 337 j++ 338 r1, size1 := DecodeRune(b[i:]) 339 if r != r1 { 340 t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], r1, r) 341 return 342 } 343 r2, size2 := DecodeRuneInString(s[i:]) 344 if r != r2 { 345 t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], r2, r) 346 return 347 } 348 if size1 != size2 { 349 t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], size1, size2) 350 return 351 } 352 si += size1 353 } 354 j-- 355 for si = len(s); si > 0; { 356 r1, size1 := DecodeLastRune(b[0:si]) 357 r2, size2 := DecodeLastRuneInString(s[0:si]) 358 if size1 != size2 { 359 t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, si, size1, size2) 360 return 361 } 362 if r1 != index[j].r { 363 t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, r1, index[j].r) 364 return 365 } 366 if r2 != index[j].r { 367 t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, si, r2, index[j].r) 368 return 369 } 370 si -= size1 371 if si != index[j].index { 372 t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, si, index[j].index) 373 return 374 } 375 j-- 376 } 377 if si != 0 { 378 t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, si) 379 } 380 } 381 382 // Check that negative runes encode as U+FFFD. 383 func TestNegativeRune(t *testing.T) { 384 errorbuf := make([]byte, UTFMax) 385 errorbuf = errorbuf[0:EncodeRune(errorbuf, RuneError)] 386 buf := make([]byte, UTFMax) 387 buf = buf[0:EncodeRune(buf, -1)] 388 if !bytes.Equal(buf, errorbuf) { 389 t.Errorf("incorrect encoding [% x] for -1; expected [% x]", buf, errorbuf) 390 } 391 } 392 393 type RuneCountTest struct { 394 in string 395 out int 396 } 397 398 var runecounttests = []RuneCountTest{ 399 {"abcd", 4}, 400 {"☺☻☹", 3}, 401 {"1,2,3,4", 7}, 402 {"\xe2\x00", 2}, 403 {"\xe2\x80", 2}, 404 {"a\xe2\x80", 3}, 405 } 406 407 func TestRuneCount(t *testing.T) { 408 for _, tt := range runecounttests { 409 if out := RuneCountInString(tt.in); out != tt.out { 410 t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out) 411 } 412 if out := RuneCount([]byte(tt.in)); out != tt.out { 413 t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out) 414 } 415 } 416 } 417 418 type RuneLenTest struct { 419 r rune 420 size int 421 } 422 423 var runelentests = []RuneLenTest{ 424 {0, 1}, 425 {'e', 1}, 426 {'é', 2}, 427 {'☺', 3}, 428 {RuneError, 3}, 429 {MaxRune, 4}, 430 {0xD800, -1}, 431 {0xDFFF, -1}, 432 {MaxRune + 1, -1}, 433 {-1, -1}, 434 } 435 436 func TestRuneLen(t *testing.T) { 437 for _, tt := range runelentests { 438 if size := RuneLen(tt.r); size != tt.size { 439 t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, size, tt.size) 440 } 441 } 442 } 443 444 type ValidTest struct { 445 in string 446 out bool 447 } 448 449 var validTests = []ValidTest{ 450 {"", true}, 451 {"a", true}, 452 {"abc", true}, 453 {"Ж", true}, 454 {"ЖЖ", true}, 455 {"брэд-ЛГТМ", true}, 456 {"☺☻☹", true}, 457 {"aa\xe2", false}, 458 {string([]byte{66, 250}), false}, 459 {string([]byte{66, 250, 67}), false}, 460 {"a\uFFFDb", true}, 461 {string("\xF4\x8F\xBF\xBF"), true}, // U+10FFFF 462 {string("\xF4\x90\x80\x80"), false}, // U+10FFFF+1; out of range 463 {string("\xF7\xBF\xBF\xBF"), false}, // 0x1FFFFF; out of range 464 {string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range 465 {string("\xc0\x80"), false}, // U+0000 encoded in two bytes: incorrect 466 {string("\xed\xa0\x80"), false}, // U+D800 high surrogate (sic) 467 {string("\xed\xbf\xbf"), false}, // U+DFFF low surrogate (sic) 468 } 469 470 func TestValid(t *testing.T) { 471 for _, tt := range validTests { 472 if Valid([]byte(tt.in)) != tt.out { 473 t.Errorf("Valid(%q) = %v; want %v", tt.in, !tt.out, tt.out) 474 } 475 if ValidString(tt.in) != tt.out { 476 t.Errorf("ValidString(%q) = %v; want %v", tt.in, !tt.out, tt.out) 477 } 478 } 479 } 480 481 type ValidRuneTest struct { 482 r rune 483 ok bool 484 } 485 486 var validrunetests = []ValidRuneTest{ 487 {0, true}, 488 {'e', true}, 489 {'é', true}, 490 {'☺', true}, 491 {RuneError, true}, 492 {MaxRune, true}, 493 {0xD7FF, true}, 494 {0xD800, false}, 495 {0xDFFF, false}, 496 {0xE000, true}, 497 {MaxRune + 1, false}, 498 {-1, false}, 499 } 500 501 func TestValidRune(t *testing.T) { 502 for _, tt := range validrunetests { 503 if ok := ValidRune(tt.r); ok != tt.ok { 504 t.Errorf("ValidRune(%#U) = %t, want %t", tt.r, ok, tt.ok) 505 } 506 } 507 } 508 509 func BenchmarkRuneCountTenASCIIChars(b *testing.B) { 510 s := []byte("0123456789") 511 for i := 0; i < b.N; i++ { 512 RuneCount(s) 513 } 514 } 515 516 func BenchmarkRuneCountTenJapaneseChars(b *testing.B) { 517 s := []byte("日本語日本語日本語日") 518 for i := 0; i < b.N; i++ { 519 RuneCount(s) 520 } 521 } 522 523 func BenchmarkRuneCountInStringTenASCIIChars(b *testing.B) { 524 for i := 0; i < b.N; i++ { 525 RuneCountInString("0123456789") 526 } 527 } 528 529 func BenchmarkRuneCountInStringTenJapaneseChars(b *testing.B) { 530 for i := 0; i < b.N; i++ { 531 RuneCountInString("日本語日本語日本語日") 532 } 533 } 534 535 func BenchmarkValidTenASCIIChars(b *testing.B) { 536 s := []byte("0123456789") 537 for i := 0; i < b.N; i++ { 538 Valid(s) 539 } 540 } 541 542 func BenchmarkValidTenJapaneseChars(b *testing.B) { 543 s := []byte("日本語日本語日本語日") 544 for i := 0; i < b.N; i++ { 545 Valid(s) 546 } 547 } 548 549 func BenchmarkValidStringTenASCIIChars(b *testing.B) { 550 for i := 0; i < b.N; i++ { 551 ValidString("0123456789") 552 } 553 } 554 555 func BenchmarkValidStringTenJapaneseChars(b *testing.B) { 556 for i := 0; i < b.N; i++ { 557 ValidString("日本語日本語日本語日") 558 } 559 } 560 561 func BenchmarkEncodeASCIIRune(b *testing.B) { 562 buf := make([]byte, UTFMax) 563 for i := 0; i < b.N; i++ { 564 EncodeRune(buf, 'a') 565 } 566 } 567 568 func BenchmarkEncodeJapaneseRune(b *testing.B) { 569 buf := make([]byte, UTFMax) 570 for i := 0; i < b.N; i++ { 571 EncodeRune(buf, '本') 572 } 573 } 574 575 func BenchmarkDecodeASCIIRune(b *testing.B) { 576 a := []byte{'a'} 577 for i := 0; i < b.N; i++ { 578 DecodeRune(a) 579 } 580 } 581 582 func BenchmarkDecodeJapaneseRune(b *testing.B) { 583 nihon := []byte("本") 584 for i := 0; i < b.N; i++ { 585 DecodeRune(nihon) 586 } 587 } 588 589 func BenchmarkFullASCIIRune(b *testing.B) { 590 a := []byte{'a'} 591 for i := 0; i < b.N; i++ { 592 FullRune(a) 593 } 594 } 595 596 func BenchmarkFullJapaneseRune(b *testing.B) { 597 nihon := []byte("本") 598 for i := 0; i < b.N; i++ { 599 FullRune(nihon) 600 } 601 }