github.com/primecitizens/pcz/std@v0.2.1/text/unicode/letter_test.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright 2023 The Prime Citizens 3 // 4 // Copyright 2009 The Go Authors. All rights reserved. 5 // Use of this source code is governed by a BSD-style 6 // license that can be found in the LICENSE file. 7 8 package unicode_test 9 10 import ( 11 "flag" 12 "fmt" 13 "runtime" 14 "sort" 15 "strings" 16 "testing" 17 . "unicode" 18 ) 19 20 var upperTest = []rune{ 21 0x41, 22 0xc0, 23 0xd8, 24 0x100, 25 0x139, 26 0x14a, 27 0x178, 28 0x181, 29 0x376, 30 0x3cf, 31 0x13bd, 32 0x1f2a, 33 0x2102, 34 0x2c00, 35 0x2c10, 36 0x2c20, 37 0xa650, 38 0xa722, 39 0xff3a, 40 0x10400, 41 0x1d400, 42 0x1d7ca, 43 } 44 45 var notupperTest = []rune{ 46 0x40, 47 0x5b, 48 0x61, 49 0x185, 50 0x1b0, 51 0x377, 52 0x387, 53 0x2150, 54 0xab7d, 55 0xffff, 56 0x10000, 57 } 58 59 var letterTest = []rune{ 60 0x41, 61 0x61, 62 0xaa, 63 0xba, 64 0xc8, 65 0xdb, 66 0xf9, 67 0x2ec, 68 0x535, 69 0x620, 70 0x6e6, 71 0x93d, 72 0xa15, 73 0xb99, 74 0xdc0, 75 0xedd, 76 0x1000, 77 0x1200, 78 0x1312, 79 0x1401, 80 0x2c00, 81 0xa800, 82 0xf900, 83 0xfa30, 84 0xffda, 85 0xffdc, 86 0x10000, 87 0x10300, 88 0x10400, 89 0x20000, 90 0x2f800, 91 0x2fa1d, 92 } 93 94 var notletterTest = []rune{ 95 0x20, 96 0x35, 97 0x375, 98 0x619, 99 0x700, 100 0x1885, 101 0xfffe, 102 0x1ffff, 103 0x10ffff, 104 } 105 106 // Contains all the special cased Latin-1 chars. 107 var spaceTest = []rune{ 108 0x09, 109 0x0a, 110 0x0b, 111 0x0c, 112 0x0d, 113 0x20, 114 0x85, 115 0xA0, 116 0x2000, 117 0x3000, 118 } 119 120 type caseT struct { 121 cas int 122 in, out rune 123 } 124 125 var caseTest = []caseT{ 126 // errors 127 {-1, '\n', 0xFFFD}, 128 {UpperCase, -1, -1}, 129 {UpperCase, 1 << 30, 1 << 30}, 130 131 // ASCII (special-cased so test carefully) 132 {UpperCase, '\n', '\n'}, 133 {UpperCase, 'a', 'A'}, 134 {UpperCase, 'A', 'A'}, 135 {UpperCase, '7', '7'}, 136 {LowerCase, '\n', '\n'}, 137 {LowerCase, 'a', 'a'}, 138 {LowerCase, 'A', 'a'}, 139 {LowerCase, '7', '7'}, 140 {TitleCase, '\n', '\n'}, 141 {TitleCase, 'a', 'A'}, 142 {TitleCase, 'A', 'A'}, 143 {TitleCase, '7', '7'}, 144 145 // Latin-1: easy to read the tests! 146 {UpperCase, 0x80, 0x80}, 147 {UpperCase, 'Å', 'Å'}, 148 {UpperCase, 'å', 'Å'}, 149 {LowerCase, 0x80, 0x80}, 150 {LowerCase, 'Å', 'å'}, 151 {LowerCase, 'å', 'å'}, 152 {TitleCase, 0x80, 0x80}, 153 {TitleCase, 'Å', 'Å'}, 154 {TitleCase, 'å', 'Å'}, 155 156 // 0131;LATIN SMALL LETTER DOTLESS I;Ll;0;L;;;;;N;;;0049;;0049 157 {UpperCase, 0x0131, 'I'}, 158 {LowerCase, 0x0131, 0x0131}, 159 {TitleCase, 0x0131, 'I'}, 160 161 // 0133;LATIN SMALL LIGATURE IJ;Ll;0;L;<compat> 0069 006A;;;;N;LATIN SMALL LETTER I J;;0132;;0132 162 {UpperCase, 0x0133, 0x0132}, 163 {LowerCase, 0x0133, 0x0133}, 164 {TitleCase, 0x0133, 0x0132}, 165 166 // 212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B; 167 {UpperCase, 0x212A, 0x212A}, 168 {LowerCase, 0x212A, 'k'}, 169 {TitleCase, 0x212A, 0x212A}, 170 171 // From an UpperLower sequence 172 // A640;CYRILLIC CAPITAL LETTER ZEMLYA;Lu;0;L;;;;;N;;;;A641; 173 {UpperCase, 0xA640, 0xA640}, 174 {LowerCase, 0xA640, 0xA641}, 175 {TitleCase, 0xA640, 0xA640}, 176 // A641;CYRILLIC SMALL LETTER ZEMLYA;Ll;0;L;;;;;N;;;A640;;A640 177 {UpperCase, 0xA641, 0xA640}, 178 {LowerCase, 0xA641, 0xA641}, 179 {TitleCase, 0xA641, 0xA640}, 180 // A64E;CYRILLIC CAPITAL LETTER NEUTRAL YER;Lu;0;L;;;;;N;;;;A64F; 181 {UpperCase, 0xA64E, 0xA64E}, 182 {LowerCase, 0xA64E, 0xA64F}, 183 {TitleCase, 0xA64E, 0xA64E}, 184 // A65F;CYRILLIC SMALL LETTER YN;Ll;0;L;;;;;N;;;A65E;;A65E 185 {UpperCase, 0xA65F, 0xA65E}, 186 {LowerCase, 0xA65F, 0xA65F}, 187 {TitleCase, 0xA65F, 0xA65E}, 188 189 // From another UpperLower sequence 190 // 0139;LATIN CAPITAL LETTER L WITH ACUTE;Lu;0;L;004C 0301;;;;N;LATIN CAPITAL LETTER L ACUTE;;;013A; 191 {UpperCase, 0x0139, 0x0139}, 192 {LowerCase, 0x0139, 0x013A}, 193 {TitleCase, 0x0139, 0x0139}, 194 // 013F;LATIN CAPITAL LETTER L WITH MIDDLE DOT;Lu;0;L;<compat> 004C 00B7;;;;N;;;;0140; 195 {UpperCase, 0x013f, 0x013f}, 196 {LowerCase, 0x013f, 0x0140}, 197 {TitleCase, 0x013f, 0x013f}, 198 // 0148;LATIN SMALL LETTER N WITH CARON;Ll;0;L;006E 030C;;;;N;LATIN SMALL LETTER N HACEK;;0147;;0147 199 {UpperCase, 0x0148, 0x0147}, 200 {LowerCase, 0x0148, 0x0148}, 201 {TitleCase, 0x0148, 0x0147}, 202 203 // Lowercase lower than uppercase. 204 // AB78;CHEROKEE SMALL LETTER GE;Ll;0;L;;;;;N;;;13A8;;13A8 205 {UpperCase, 0xab78, 0x13a8}, 206 {LowerCase, 0xab78, 0xab78}, 207 {TitleCase, 0xab78, 0x13a8}, 208 {UpperCase, 0x13a8, 0x13a8}, 209 {LowerCase, 0x13a8, 0xab78}, 210 {TitleCase, 0x13a8, 0x13a8}, 211 212 // Last block in the 5.1.0 table 213 // 10400;DESERET CAPITAL LETTER LONG I;Lu;0;L;;;;;N;;;;10428; 214 {UpperCase, 0x10400, 0x10400}, 215 {LowerCase, 0x10400, 0x10428}, 216 {TitleCase, 0x10400, 0x10400}, 217 // 10427;DESERET CAPITAL LETTER EW;Lu;0;L;;;;;N;;;;1044F; 218 {UpperCase, 0x10427, 0x10427}, 219 {LowerCase, 0x10427, 0x1044F}, 220 {TitleCase, 0x10427, 0x10427}, 221 // 10428;DESERET SMALL LETTER LONG I;Ll;0;L;;;;;N;;;10400;;10400 222 {UpperCase, 0x10428, 0x10400}, 223 {LowerCase, 0x10428, 0x10428}, 224 {TitleCase, 0x10428, 0x10400}, 225 // 1044F;DESERET SMALL LETTER EW;Ll;0;L;;;;;N;;;10427;;10427 226 {UpperCase, 0x1044F, 0x10427}, 227 {LowerCase, 0x1044F, 0x1044F}, 228 {TitleCase, 0x1044F, 0x10427}, 229 230 // First one not in the 5.1.0 table 231 // 10450;SHAVIAN LETTER PEEP;Lo;0;L;;;;;N;;;;; 232 {UpperCase, 0x10450, 0x10450}, 233 {LowerCase, 0x10450, 0x10450}, 234 {TitleCase, 0x10450, 0x10450}, 235 236 // Non-letters with case. 237 {LowerCase, 0x2161, 0x2171}, 238 {UpperCase, 0x0345, 0x0399}, 239 } 240 241 func TestIsLetter(t *testing.T) { 242 for _, r := range upperTest { 243 if !IsLetter(r) { 244 t.Errorf("IsLetter(U+%04X) = false, want true", r) 245 } 246 } 247 for _, r := range letterTest { 248 if !IsLetter(r) { 249 t.Errorf("IsLetter(U+%04X) = false, want true", r) 250 } 251 } 252 for _, r := range notletterTest { 253 if IsLetter(r) { 254 t.Errorf("IsLetter(U+%04X) = true, want false", r) 255 } 256 } 257 } 258 259 func TestIsUpper(t *testing.T) { 260 for _, r := range upperTest { 261 if !IsUpper(r) { 262 t.Errorf("IsUpper(U+%04X) = false, want true", r) 263 } 264 } 265 for _, r := range notupperTest { 266 if IsUpper(r) { 267 t.Errorf("IsUpper(U+%04X) = true, want false", r) 268 } 269 } 270 for _, r := range notletterTest { 271 if IsUpper(r) { 272 t.Errorf("IsUpper(U+%04X) = true, want false", r) 273 } 274 } 275 } 276 277 func caseString(c int) string { 278 switch c { 279 case UpperCase: 280 return "UpperCase" 281 case LowerCase: 282 return "LowerCase" 283 case TitleCase: 284 return "TitleCase" 285 } 286 return "ErrorCase" 287 } 288 289 func TestTo(t *testing.T) { 290 for _, c := range caseTest { 291 r := To(c.cas, c.in) 292 if c.out != r { 293 t.Errorf("To(U+%04X, %s) = U+%04X want U+%04X", c.in, caseString(c.cas), r, c.out) 294 } 295 } 296 } 297 298 func TestToUpperCase(t *testing.T) { 299 for _, c := range caseTest { 300 if c.cas != UpperCase { 301 continue 302 } 303 r := ToUpper(c.in) 304 if c.out != r { 305 t.Errorf("ToUpper(U+%04X) = U+%04X want U+%04X", c.in, r, c.out) 306 } 307 } 308 } 309 310 func TestToLowerCase(t *testing.T) { 311 for _, c := range caseTest { 312 if c.cas != LowerCase { 313 continue 314 } 315 r := ToLower(c.in) 316 if c.out != r { 317 t.Errorf("ToLower(U+%04X) = U+%04X want U+%04X", c.in, r, c.out) 318 } 319 } 320 } 321 322 func TestToTitleCase(t *testing.T) { 323 for _, c := range caseTest { 324 if c.cas != TitleCase { 325 continue 326 } 327 r := ToTitle(c.in) 328 if c.out != r { 329 t.Errorf("ToTitle(U+%04X) = U+%04X want U+%04X", c.in, r, c.out) 330 } 331 } 332 } 333 334 func TestIsSpace(t *testing.T) { 335 for _, c := range spaceTest { 336 if !IsSpace(c) { 337 t.Errorf("IsSpace(U+%04X) = false; want true", c) 338 } 339 } 340 for _, c := range letterTest { 341 if IsSpace(c) { 342 t.Errorf("IsSpace(U+%04X) = true; want false", c) 343 } 344 } 345 } 346 347 // Check that the optimizations for IsLetter etc. agree with the tables. 348 // We only need to check the Latin-1 range. 349 func TestLetterOptimizations(t *testing.T) { 350 for i := rune(0); i <= MaxLatin1; i++ { 351 if Is(Letter, i) != IsLetter(i) { 352 t.Errorf("IsLetter(U+%04X) disagrees with Is(Letter)", i) 353 } 354 if Is(Upper, i) != IsUpper(i) { 355 t.Errorf("IsUpper(U+%04X) disagrees with Is(Upper)", i) 356 } 357 if Is(Lower, i) != IsLower(i) { 358 t.Errorf("IsLower(U+%04X) disagrees with Is(Lower)", i) 359 } 360 if Is(Title, i) != IsTitle(i) { 361 t.Errorf("IsTitle(U+%04X) disagrees with Is(Title)", i) 362 } 363 if Is(White_Space, i) != IsSpace(i) { 364 t.Errorf("IsSpace(U+%04X) disagrees with Is(White_Space)", i) 365 } 366 if To(UpperCase, i) != ToUpper(i) { 367 t.Errorf("ToUpper(U+%04X) disagrees with To(Upper)", i) 368 } 369 if To(LowerCase, i) != ToLower(i) { 370 t.Errorf("ToLower(U+%04X) disagrees with To(Lower)", i) 371 } 372 if To(TitleCase, i) != ToTitle(i) { 373 t.Errorf("ToTitle(U+%04X) disagrees with To(Title)", i) 374 } 375 } 376 } 377 378 func TestTurkishCase(t *testing.T) { 379 lower := []rune("abcçdefgğhıijklmnoöprsştuüvyz") 380 upper := []rune("ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ") 381 for i, l := range lower { 382 u := upper[i] 383 if TurkishCase.ToLower(l) != l { 384 t.Errorf("lower(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToLower(l), l) 385 } 386 if TurkishCase.ToUpper(u) != u { 387 t.Errorf("upper(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToUpper(u), u) 388 } 389 if TurkishCase.ToUpper(l) != u { 390 t.Errorf("upper(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToUpper(l), u) 391 } 392 if TurkishCase.ToLower(u) != l { 393 t.Errorf("lower(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToLower(l), l) 394 } 395 if TurkishCase.ToTitle(u) != u { 396 t.Errorf("title(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToTitle(u), u) 397 } 398 if TurkishCase.ToTitle(l) != u { 399 t.Errorf("title(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToTitle(l), u) 400 } 401 } 402 } 403 404 var simpleFoldTests = []string{ 405 // SimpleFold(x) returns the next equivalent rune > x or wraps 406 // around to smaller values. 407 408 // Easy cases. 409 "Aa", 410 "δΔ", 411 412 // ASCII special cases. 413 "KkK", 414 "Ssſ", 415 416 // Non-ASCII special cases. 417 "ρϱΡ", 418 "ͅΙιι", 419 420 // Extra special cases: has lower/upper but no case fold. 421 "İ", 422 "ı", 423 424 // Upper comes before lower (Cherokee). 425 "\u13b0\uab80", 426 } 427 428 func TestSimpleFold(t *testing.T) { 429 for _, tt := range simpleFoldTests { 430 cycle := []rune(tt) 431 r := cycle[len(cycle)-1] 432 for _, out := range cycle { 433 if r := SimpleFold(r); r != out { 434 t.Errorf("SimpleFold(%#U) = %#U, want %#U", r, r, out) 435 } 436 r = out 437 } 438 } 439 440 if r := SimpleFold(-42); r != -42 { 441 t.Errorf("SimpleFold(-42) = %v, want -42", r) 442 } 443 } 444 445 // Running 'go test -calibrate' runs the calibration to find a plausible 446 // cutoff point for linear search of a range list vs. binary search. 447 // We create a fake table and then time how long it takes to do a 448 // sequence of searches within that table, for all possible inputs 449 // relative to the ranges (something before all, in each, between each, after all). 450 // This assumes that all possible runes are equally likely. 451 // In practice most runes are ASCII so this is a conservative estimate 452 // of an effective cutoff value. In practice we could probably set it higher 453 // than what this function recommends. 454 455 var calibrate = flag.Bool("calibrate", false, "compute crossover for linear vs. binary search") 456 457 func TestCalibrate(t *testing.T) { 458 if !*calibrate { 459 return 460 } 461 462 if runtime.GOARCH == "amd64" { 463 fmt.Printf("warning: running calibration on %s\n", runtime.GOARCH) 464 } 465 466 // Find the point where binary search wins by more than 10%. 467 // The 10% bias gives linear search an edge when they're close, 468 // because on predominantly ASCII inputs linear search is even 469 // better than our benchmarks measure. 470 n := sort.Search(64, func(n int) bool { 471 tab := fakeTable(n) 472 blinear := func(b *testing.B) { 473 tab := tab 474 max := n*5 + 20 475 for i := 0; i < b.N; i++ { 476 for j := 0; j <= max; j++ { 477 linear(tab, uint16(j)) 478 } 479 } 480 } 481 bbinary := func(b *testing.B) { 482 tab := tab 483 max := n*5 + 20 484 for i := 0; i < b.N; i++ { 485 for j := 0; j <= max; j++ { 486 binary(tab, uint16(j)) 487 } 488 } 489 } 490 bmlinear := testing.Benchmark(blinear) 491 bmbinary := testing.Benchmark(bbinary) 492 fmt.Printf("n=%d: linear=%d binary=%d\n", n, bmlinear.NsPerOp(), bmbinary.NsPerOp()) 493 return bmlinear.NsPerOp()*100 > bmbinary.NsPerOp()*110 494 }) 495 fmt.Printf("calibration: linear cutoff = %d\n", n) 496 } 497 498 func fakeTable(n int) []Range16 { 499 var r16 []Range16 500 for i := 0; i < n; i++ { 501 r16 = append(r16, Range16{uint16(i*5 + 10), uint16(i*5 + 12), 1}) 502 } 503 return r16 504 } 505 506 func linear(ranges []Range16, r uint16) bool { 507 for i := range ranges { 508 range_ := &ranges[i] 509 if r < range_.Lo { 510 return false 511 } 512 if r <= range_.Hi { 513 return (r-range_.Lo)%range_.Stride == 0 514 } 515 } 516 return false 517 } 518 519 func binary(ranges []Range16, r uint16) bool { 520 // binary search over ranges 521 lo := 0 522 hi := len(ranges) 523 for lo < hi { 524 m := lo + (hi-lo)/2 525 range_ := &ranges[m] 526 if range_.Lo <= r && r <= range_.Hi { 527 return (r-range_.Lo)%range_.Stride == 0 528 } 529 if r < range_.Lo { 530 hi = m 531 } else { 532 lo = m + 1 533 } 534 } 535 return false 536 } 537 538 func TestLatinOffset(t *testing.T) { 539 var maps = []map[string]*RangeTable{ 540 Categories, 541 FoldCategory, 542 FoldScript, 543 Properties, 544 Scripts, 545 } 546 for _, m := range maps { 547 for name, tab := range m { 548 i := 0 549 for i < len(tab.R16) && tab.R16[i].Hi <= MaxLatin1 { 550 i++ 551 } 552 if tab.LatinOffset != i { 553 t.Errorf("%s: LatinOffset=%d, want %d", name, tab.LatinOffset, i) 554 } 555 } 556 } 557 } 558 559 func TestSpecialCaseNoMapping(t *testing.T) { 560 // Issue 25636 561 // no change for rune 'A', zero delta, under upper/lower/title case change. 562 var noChangeForCapitalA = CaseRange{'A', 'A', [MaxCase]rune{0, 0, 0}} 563 got := strings.ToLowerSpecial(SpecialCase([]CaseRange{noChangeForCapitalA}), "ABC") 564 want := "Abc" 565 if got != want { 566 t.Errorf("got %q; want %q", got, want) 567 } 568 } 569 570 func TestNegativeRune(t *testing.T) { 571 // Issue 43254 572 // These tests cover negative rune handling by testing values which, 573 // when cast to uint8 or uint16, look like a particular valid rune. 574 // This package has Latin-1-specific optimizations, so we test all of 575 // Latin-1 and representative non-Latin-1 values in the character 576 // categories covered by IsGraphic, etc. 577 nonLatin1 := []uint32{ 578 // Lu: LATIN CAPITAL LETTER A WITH MACRON 579 0x0100, 580 // Ll: LATIN SMALL LETTER A WITH MACRON 581 0x0101, 582 // Lt: LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON 583 0x01C5, 584 // M: COMBINING GRAVE ACCENT 585 0x0300, 586 // Nd: ARABIC-INDIC DIGIT ZERO 587 0x0660, 588 // P: GREEK QUESTION MARK 589 0x037E, 590 // S: MODIFIER LETTER LEFT ARROWHEAD 591 0x02C2, 592 // Z: OGHAM SPACE MARK 593 0x1680, 594 } 595 for i := 0; i < MaxLatin1+len(nonLatin1); i++ { 596 base := uint32(i) 597 if i >= MaxLatin1 { 598 base = nonLatin1[i-MaxLatin1] 599 } 600 601 // Note r is negative, but uint8(r) == uint8(base) and 602 // uint16(r) == uint16(base). 603 r := rune(base - 1<<31) 604 if Is(Letter, r) { 605 t.Errorf("Is(Letter, 0x%x - 1<<31) = true, want false", base) 606 } 607 if IsControl(r) { 608 t.Errorf("IsControl(0x%x - 1<<31) = true, want false", base) 609 } 610 if IsDigit(r) { 611 t.Errorf("IsDigit(0x%x - 1<<31) = true, want false", base) 612 } 613 if IsGraphic(r) { 614 t.Errorf("IsGraphic(0x%x - 1<<31) = true, want false", base) 615 } 616 if IsLetter(r) { 617 t.Errorf("IsLetter(0x%x - 1<<31) = true, want false", base) 618 } 619 if IsLower(r) { 620 t.Errorf("IsLower(0x%x - 1<<31) = true, want false", base) 621 } 622 if IsMark(r) { 623 t.Errorf("IsMark(0x%x - 1<<31) = true, want false", base) 624 } 625 if IsNumber(r) { 626 t.Errorf("IsNumber(0x%x - 1<<31) = true, want false", base) 627 } 628 if IsPrint(r) { 629 t.Errorf("IsPrint(0x%x - 1<<31) = true, want false", base) 630 } 631 if IsPunct(r) { 632 t.Errorf("IsPunct(0x%x - 1<<31) = true, want false", base) 633 } 634 if IsSpace(r) { 635 t.Errorf("IsSpace(0x%x - 1<<31) = true, want false", base) 636 } 637 if IsSymbol(r) { 638 t.Errorf("IsSymbol(0x%x - 1<<31) = true, want false", base) 639 } 640 if IsTitle(r) { 641 t.Errorf("IsTitle(0x%x - 1<<31) = true, want false", base) 642 } 643 if IsUpper(r) { 644 t.Errorf("IsUpper(0x%x - 1<<31) = true, want false", base) 645 } 646 } 647 }