github.com/primecitizens/pcz/std@v0.2.1/text/unicode/utf8/utf8_test.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright 2023 The Prime Citizens 3 // 4 // Copyright 2009 The Go Authors. All rights reserved. 5 // Use of this source code is governed by a BSD-style 6 // license that can be found in the LICENSE file. 7 8 package utf8_test 9 10 import ( 11 "bytes" 12 "strings" 13 "testing" 14 15 stdstring "github.com/primecitizens/pcz/std/builtin/string" 16 . "github.com/primecitizens/pcz/std/text/unicode/common" 17 . "github.com/primecitizens/pcz/std/text/unicode/utf8" 18 ) 19 20 type Utf8Map struct { 21 r rune 22 str string 23 } 24 25 var utf8map = []Utf8Map{ 26 {0x0000, "\x00"}, 27 {0x0001, "\x01"}, 28 {0x007e, "\x7e"}, 29 {0x007f, "\x7f"}, 30 {0x0080, "\xc2\x80"}, 31 {0x0081, "\xc2\x81"}, 32 {0x00bf, "\xc2\xbf"}, 33 {0x00c0, "\xc3\x80"}, 34 {0x00c1, "\xc3\x81"}, 35 {0x00c8, "\xc3\x88"}, 36 {0x00d0, "\xc3\x90"}, 37 {0x00e0, "\xc3\xa0"}, 38 {0x00f0, "\xc3\xb0"}, 39 {0x00f8, "\xc3\xb8"}, 40 {0x00ff, "\xc3\xbf"}, 41 {0x0100, "\xc4\x80"}, 42 {0x07ff, "\xdf\xbf"}, 43 {0x0400, "\xd0\x80"}, 44 {0x0800, "\xe0\xa0\x80"}, 45 {0x0801, "\xe0\xa0\x81"}, 46 {0x1000, "\xe1\x80\x80"}, 47 {0xd000, "\xed\x80\x80"}, 48 {0xd7ff, "\xed\x9f\xbf"}, // last code point before surrogate half. 49 {0xe000, "\xee\x80\x80"}, // first code point after surrogate half. 50 {0xfffe, "\xef\xbf\xbe"}, 51 {0xffff, "\xef\xbf\xbf"}, 52 {0x10000, "\xf0\x90\x80\x80"}, 53 {0x10001, "\xf0\x90\x80\x81"}, 54 {0x40000, "\xf1\x80\x80\x80"}, 55 {0x10fffe, "\xf4\x8f\xbf\xbe"}, 56 {0x10ffff, "\xf4\x8f\xbf\xbf"}, 57 {0xFFFD, "\xef\xbf\xbd"}, 58 } 59 60 var surrogateMap = []Utf8Map{ 61 {0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1) 62 {0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1) 63 } 64 65 var testStrings = []string{ 66 "", 67 "abcd", 68 "☺☻☹", 69 "日a本b語ç日ð本Ê語þ日¥本¼語i日©", 70 "日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©", 71 "\x80\x80\x80\x80", 72 } 73 74 func TestFullRune(t *testing.T) { 75 for _, m := range utf8map { 76 b := []byte(m.str) 77 s := m.str 78 if !FullRune(s) { 79 t.Errorf("FullRuneInString(%q) (%U) = false, want true", s, m.r) 80 } 81 b1 := b[0 : len(b)-1] 82 s1 := string(b1) 83 if FullRune(s1) { 84 t.Errorf("FullRune(%q) = true, want false", s1) 85 } 86 } 87 for _, s := range []string{"\xc0", "\xc1"} { 88 if !FullRune(s) { 89 t.Errorf("FullRuneInString(%q) = false, want true", s) 90 } 91 } 92 } 93 94 func TestEncodeRune(t *testing.T) { 95 for _, m := range utf8map { 96 b := []byte(m.str) 97 var buf [10]byte 98 b1, _ := EncodeRune(buf[0:0], m.r) 99 if !bytes.Equal(b, b1) { 100 t.Errorf("EncodeRune(%#04x) = %q want %q", m.r, b1, b) 101 } 102 } 103 } 104 105 func TestAppendRune(t *testing.T) { 106 for _, m := range utf8map { 107 if buf := AppendRunes(nil, m.r); string(buf) != m.str { 108 t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, m.str) 109 } 110 if buf := AppendRunes([]byte("init"), m.r); string(buf) != "init"+m.str { 111 t.Errorf("AppendRune(init, %#04x) = %s, want %s", m.r, buf, "init"+m.str) 112 } 113 } 114 } 115 116 func TestDecodeRune(t *testing.T) { 117 for _, m := range utf8map { 118 b := []byte(m.str) 119 s := m.str 120 r, size := First(s) 121 if r != m.r || size != len(b) { 122 t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b)) 123 } 124 125 s = m.str + "\x00" 126 r, size = First(s) 127 if r != m.r || size != len(b) { 128 t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b)) 129 } 130 131 // make sure missing bytes fail 132 wantsize := 1 133 if wantsize >= len(b) { 134 wantsize = 0 135 } 136 s = m.str[0 : len(m.str)-1] 137 r, size = First(s) 138 if r != RuneError || size != wantsize { 139 t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, wantsize) 140 } 141 142 // make sure bad sequences fail 143 if len(b) == 1 { 144 b[0] = 0x80 145 } else { 146 b[len(b)-1] = 0x7F 147 } 148 s = string(b) 149 r, size = First(s) 150 if r != RuneError || size != 1 { 151 t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, 1) 152 } 153 154 } 155 } 156 157 func TestDecodeSurrogateRune(t *testing.T) { 158 for _, m := range surrogateMap { 159 b := []byte(m.str) 160 s := m.str 161 r, size := First(s) 162 if r != RuneError || size != 1 { 163 t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1) 164 } 165 } 166 } 167 168 // Check that DecodeRune and DecodeLastRune correspond to 169 // the equivalent range loop. 170 func TestSequencing(t *testing.T) { 171 for _, ts := range testStrings { 172 for _, m := range utf8map { 173 for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} { 174 testSequence(t, s) 175 } 176 } 177 } 178 } 179 180 func runtimeRuneCount(s string) int { 181 return len([]rune(s)) // Replaced by gc with call to runtime.countrunes(s). 182 } 183 184 // Check that a range loop, len([]rune(string)) optimization and 185 // []rune conversions visit the same runes. 186 // Not really a test of this package, but the assumption is used here and 187 // it's good to verify. 188 func TestRuntimeConversion(t *testing.T) { 189 for _, ts := range testStrings { 190 count := Count(ts) 191 if n := runtimeRuneCount(ts); n != count { 192 t.Errorf("%q: len([]rune()) counted %d runes; got %d from RuneCountInString", ts, n, count) 193 break 194 } 195 196 runes := []rune(ts) 197 if n := len(runes); n != count { 198 t.Errorf("%q: []rune() has length %d; got %d from RuneCountInString", ts, n, count) 199 break 200 } 201 i := 0 202 for _, r := range ts { 203 if r != runes[i] { 204 t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, i, runes[i], runes[i], r, r) 205 } 206 i++ 207 } 208 } 209 } 210 211 var invalidSequenceTests = []string{ 212 "\xed\xa0\x80\x80", // surrogate min 213 "\xed\xbf\xbf\x80", // surrogate max 214 215 // xx 216 "\x91\x80\x80\x80", 217 218 // s1 219 "\xC2\x7F\x80\x80", 220 "\xC2\xC0\x80\x80", 221 "\xDF\x7F\x80\x80", 222 "\xDF\xC0\x80\x80", 223 224 // s2 225 "\xE0\x9F\xBF\x80", 226 "\xE0\xA0\x7F\x80", 227 "\xE0\xBF\xC0\x80", 228 "\xE0\xC0\x80\x80", 229 230 // s3 231 "\xE1\x7F\xBF\x80", 232 "\xE1\x80\x7F\x80", 233 "\xE1\xBF\xC0\x80", 234 "\xE1\xC0\x80\x80", 235 236 //s4 237 "\xED\x7F\xBF\x80", 238 "\xED\x80\x7F\x80", 239 "\xED\x9F\xC0\x80", 240 "\xED\xA0\x80\x80", 241 242 // s5 243 "\xF0\x8F\xBF\xBF", 244 "\xF0\x90\x7F\xBF", 245 "\xF0\x90\x80\x7F", 246 "\xF0\xBF\xBF\xC0", 247 "\xF0\xBF\xC0\x80", 248 "\xF0\xC0\x80\x80", 249 250 // s6 251 "\xF1\x7F\xBF\xBF", 252 "\xF1\x80\x7F\xBF", 253 "\xF1\x80\x80\x7F", 254 "\xF1\xBF\xBF\xC0", 255 "\xF1\xBF\xC0\x80", 256 "\xF1\xC0\x80\x80", 257 258 // s7 259 "\xF4\x7F\xBF\xBF", 260 "\xF4\x80\x7F\xBF", 261 "\xF4\x80\x80\x7F", 262 "\xF4\x8F\xBF\xC0", 263 "\xF4\x8F\xC0\x80", 264 "\xF4\x90\x80\x80", 265 } 266 267 func runtimeDecodeRune(s string) rune { 268 for _, r := range s { 269 return r 270 } 271 return -1 272 } 273 274 func TestDecodeInvalidSequence(t *testing.T) { 275 for _, s := range invalidSequenceTests { 276 r2, _ := First(s) 277 if want := RuneError; r2 != want { 278 t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s, r2, want) 279 return 280 } 281 282 r3 := runtimeDecodeRune(s) 283 if r2 != r3 { 284 t.Errorf("DecodeRune(%q) = %#04x mismatch with runtime.decoderune(%q) = %#04x", s, r2, s, r3) 285 return 286 } 287 } 288 } 289 290 func testSequence(t *testing.T, s string) { 291 type info struct { 292 index int 293 r rune 294 } 295 index := make([]info, len(s)) 296 si := 0 297 j := 0 298 for i, r := range s { 299 if si != i { 300 t.Errorf("Sequence(%q) mismatched index %d, want %d", s, si, i) 301 return 302 } 303 index[j] = info{i, r} 304 j++ 305 r2, size2 := First(s[i:]) 306 if r != r2 { 307 t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], r2, r) 308 return 309 } 310 si += size2 311 } 312 j-- 313 for si = len(s); si > 0; { 314 r2, size2 := Last(s[0:si]) 315 if r2 != index[j].r { 316 t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, r2, index[j].r) 317 return 318 } 319 si -= size2 320 if si != index[j].index { 321 t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, si, index[j].index) 322 return 323 } 324 j-- 325 } 326 if si != 0 { 327 t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, si) 328 } 329 } 330 331 // Check that negative runes encode as U+FFFD. 332 func TestNegativeRune(t *testing.T) { 333 errorbuf := make([]byte, 0, MaxRuneLen) 334 errorbuf, _ = EncodeRune(errorbuf, RuneError) 335 buf := make([]byte, 0, MaxRuneLen) 336 buf, _ = EncodeRune(buf, -1) 337 if !bytes.Equal(buf, errorbuf) { 338 t.Errorf("incorrect encoding [% x] for -1; expected [% x]", buf, errorbuf) 339 } 340 } 341 342 type RuneCountTest struct { 343 in string 344 out int 345 } 346 347 var runecounttests = []RuneCountTest{ 348 {"abcd", 4}, 349 {"☺☻☹", 3}, 350 {"1,2,3,4", 7}, 351 {"\xe2\x00", 2}, 352 {"\xe2\x80", 2}, 353 {"a\xe2\x80", 3}, 354 } 355 356 func TestRuneCount(t *testing.T) { 357 for _, tt := range runecounttests { 358 if out := Count(tt.in); out != tt.out { 359 t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out) 360 } 361 } 362 } 363 364 type RuneLenTest struct { 365 r rune 366 size int 367 } 368 369 var runelentests = []RuneLenTest{ 370 {0, 1}, 371 {'e', 1}, 372 {'é', 2}, 373 {'☺', 3}, 374 {RuneError, 3}, 375 {MaxRune, 4}, 376 {0xD800, -1}, 377 {0xDFFF, -1}, 378 {MaxRune + 1, -1}, 379 {-1, -1}, 380 } 381 382 func TestRuneLen(t *testing.T) { 383 for _, tt := range runelentests { 384 if size := RuneLen(tt.r); size != tt.size { 385 t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, size, tt.size) 386 } 387 } 388 } 389 390 type ValidTest struct { 391 in string 392 out bool 393 } 394 395 var validTests = []ValidTest{ 396 {"", true}, 397 {"a", true}, 398 {"abc", true}, 399 {"Ж", true}, 400 {"ЖЖ", true}, 401 {"брэд-ЛГТМ", true}, 402 {"☺☻☹", true}, 403 {"aa\xe2", false}, 404 {string([]byte{66, 250}), false}, 405 {string([]byte{66, 250, 67}), false}, 406 {"a\uFFFDb", true}, 407 {string("\xF4\x8F\xBF\xBF"), true}, // U+10FFFF 408 {string("\xF4\x90\x80\x80"), false}, // U+10FFFF+1; out of range 409 {string("\xF7\xBF\xBF\xBF"), false}, // 0x1FFFFF; out of range 410 {string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range 411 {string("\xc0\x80"), false}, // U+0000 encoded in two bytes: incorrect 412 {string("\xed\xa0\x80"), false}, // U+D800 high surrogate (sic) 413 {string("\xed\xbf\xbf"), false}, // U+DFFF low surrogate (sic) 414 } 415 416 func TestValid(t *testing.T) { 417 for _, tt := range validTests { 418 if Valid(tt.in) != tt.out { 419 t.Errorf("ValidString(%q) = %v; want %v", tt.in, !tt.out, tt.out) 420 } 421 } 422 } 423 424 type ValidRuneTest struct { 425 r rune 426 ok bool 427 } 428 429 var validrunetests = []ValidRuneTest{ 430 {0, true}, 431 {'e', true}, 432 {'é', true}, 433 {'☺', true}, 434 {RuneError, true}, 435 {MaxRune, true}, 436 {0xD7FF, true}, 437 {0xD800, false}, 438 {0xDFFF, false}, 439 {0xE000, true}, 440 {MaxRune + 1, false}, 441 {-1, false}, 442 } 443 444 func TestValidRune(t *testing.T) { 445 for _, tt := range validrunetests { 446 if ok := RuneValid(tt.r); ok != tt.ok { 447 t.Errorf("ValidRune(%#U) = %t, want %t", tt.r, ok, tt.ok) 448 } 449 } 450 } 451 452 func BenchmarkRuneCountTenASCIIChars(b *testing.B) { 453 for i := 0; i < b.N; i++ { 454 Count("0123456789") 455 } 456 } 457 458 func BenchmarkRuneCountTenJapaneseChars(b *testing.B) { 459 for i := 0; i < b.N; i++ { 460 Count("日本語日本語日本語日") 461 } 462 } 463 464 var ascii100000 = strings.Repeat("0123456789", 10000) 465 466 func BenchmarkValidStringTenASCIIChars(b *testing.B) { 467 for i := 0; i < b.N; i++ { 468 Valid("0123456789") 469 } 470 } 471 472 func BenchmarkValidString100KASCIIChars(b *testing.B) { 473 for i := 0; i < b.N; i++ { 474 Valid(ascii100000) 475 } 476 } 477 478 func BenchmarkValidStringTenJapaneseChars(b *testing.B) { 479 for i := 0; i < b.N; i++ { 480 Valid("日本語日本語日本語日") 481 } 482 } 483 484 func BenchmarkValidStringLongMostlyASCII(b *testing.B) { 485 for i := 0; i < b.N; i++ { 486 Valid(longStringMostlyASCII) 487 } 488 } 489 490 func BenchmarkValidStringLongJapanese(b *testing.B) { 491 for i := 0; i < b.N; i++ { 492 Valid(longStringJapanese) 493 } 494 } 495 496 var longStringMostlyASCII string // ~100KB, ~97% ASCII 497 var longStringJapanese string // ~100KB, non-ASCII 498 499 func init() { 500 const japanese = "日本語日本語日本語日" 501 var b strings.Builder 502 for i := 0; b.Len() < 100_000; i++ { 503 if i%100 == 0 { 504 b.WriteString(japanese) 505 } else { 506 b.WriteString("0123456789") 507 } 508 } 509 longStringMostlyASCII = b.String() 510 longStringJapanese = strings.Repeat(japanese, 100_000/len(japanese)) 511 } 512 513 func BenchmarkEncodeASCIIRune(b *testing.B) { 514 buf := make([]byte, 0, MaxRuneLen) 515 for i := 0; i < b.N; i++ { 516 EncodeRune(buf, 'a') 517 } 518 } 519 520 func BenchmarkEncodeJapaneseRune(b *testing.B) { 521 buf := make([]byte, 0, MaxRuneLen) 522 for i := 0; i < b.N; i++ { 523 EncodeRune(buf, '本') 524 } 525 } 526 527 func BenchmarkAppendASCIIRune(b *testing.B) { 528 buf := make([]byte, MaxRuneLen) 529 for i := 0; i < b.N; i++ { 530 AppendRunes(buf[:0], 'a') 531 } 532 } 533 534 func BenchmarkAppendJapaneseRune(b *testing.B) { 535 buf := make([]byte, MaxRuneLen) 536 for i := 0; i < b.N; i++ { 537 AppendRunes(buf[:0], '本') 538 } 539 } 540 541 func BenchmarkDecodeASCIIRune(b *testing.B) { 542 a := "a" 543 for i := 0; i < b.N; i++ { 544 First(a) 545 } 546 } 547 548 func BenchmarkDecodeJapaneseRune(b *testing.B) { 549 nihon := "本" 550 for i := 0; i < b.N; i++ { 551 First(nihon) 552 } 553 } 554 555 // boolSink is used to reference the return value of benchmarked 556 // functions to avoid dead code elimination. 557 var boolSink bool 558 559 func BenchmarkFullRune(b *testing.B) { 560 benchmarks := []struct { 561 name string 562 data []byte 563 }{ 564 {"ASCII", []byte("a")}, 565 {"Incomplete", []byte("\xf0\x90\x80")}, 566 {"Japanese", []byte("本")}, 567 } 568 for _, bm := range benchmarks { 569 b.Run(bm.name, func(b *testing.B) { 570 for i := 0; i < b.N; i++ { 571 boolSink = FullRune(stdstring.FromBytes(bm.data)) 572 } 573 }) 574 } 575 }