golang.org/x/text@v0.14.0/unicode/norm/normalize_test.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

import (
	"bytes"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strings"
	"testing"
	"unicode/utf8"

	"golang.org/x/text/internal/testtext"
	"golang.org/x/text/transform"
)

var (
	testn = flag.Int("testn", -1, "specific test number to run or -1 for all")
)

// pc replaces any rune r that is repeated n times, for n > 1, with r{n}.
func pc(s string) []byte {
	b := bytes.NewBuffer(make([]byte, 0, len(s)))
	for i := 0; i < len(s); {
		r, sz := utf8.DecodeRuneInString(s[i:])
		n := 0
		if sz == 1 {
			// Special-case one-byte case to handle repetition for invalid UTF-8.
			for c := s[i]; i+n < len(s) && s[i+n] == c; n++ {
			}
		} else {
			for _, r2 := range s[i:] {
				if r2 != r {
					break
				}
				n++
			}
		}
		b.WriteString(s[i : i+sz])
		if n > 1 {
			fmt.Fprintf(b, "{%d}", n)
		}
		i += sz * n
	}
	return b.Bytes()
}

// pidx finds the index from which two strings start to differ, plus context.
// It returns the index and ellipsis if the index is greater than 0.
func pidx(a, b string) (i int, prefix string) {
	for ; i < len(a) && i < len(b) && a[i] == b[i]; i++ {
	}
	if i < 8 {
		return 0, ""
	}
	i -= 3 // ensure taking at least one full rune before the difference.
	for k := i - 7; i > k && !utf8.RuneStart(a[i]); i-- {
	}
	return i, "..."
}

type PositionTest struct {
	input  string
	pos    int
	buffer string // expected contents of reorderBuffer, if applicable
}

type positionFunc func(rb *reorderBuffer, s string) (int, []byte)

func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
	rb := reorderBuffer{}
	rb.init(f, nil)
	for i, test := range tests {
		rb.reset()
		rb.src = inputString(test.input)
		rb.nsrc = len(test.input)
		pos, out := fn(&rb, test.input)
		if pos != test.pos {
			t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
		}
		if outs := string(out); outs != test.buffer {
			k, pfx := pidx(outs, test.buffer)
			t.Errorf("%s:%d: buffer \nwas %s%+q; \nwant %s%+q", name, i, pfx, pc(outs[k:]), pfx, pc(test.buffer[k:]))
		}
	}
}

func grave(n int) string {
	return rep(0x0300, n)
}

func rep(r rune, n int) string {
	return strings.Repeat(string(r), n)
}

const segSize = maxByteBufferSize

var cgj = GraphemeJoiner

var decomposeSegmentTests = []PositionTest{
	// illegal runes
	{"\xC2", 0, ""},
	{"\xC0", 1, "\xC0"},
	{"\u00E0\x80", 2, "\u0061\u0300"},
	// starter
	{"a", 1, "a"},
	{"ab", 1, "a"},
	// starter + composing
	{"a\u0300", 3, "a\u0300"},
	{"a\u0300b", 3, "a\u0300"},
	// with decomposition
	{"\u00C0", 2, "A\u0300"},
	{"\u00C0b", 2, "A\u0300"},
	// long
	{grave(31), 60, grave(30) + cgj},
	{"a" + grave(31), 61, "a" + grave(30) + cgj},

	// Stability tests: see https://www.unicode.org/review/pr-29.html.
	// U+0300 COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;
	// U+0B47 ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
	// U+0B3E ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
	// U+1100 HANGUL CHOSEONG KIYEOK;Lo;0;L;;;;;N;;;;;
	// U+1161 HANGUL JUNGSEONG A;Lo;0;L;;;;;N;;;;;
	{"\u0B47\u0300\u0B3E", 8, "\u0B47\u0300\u0B3E"},
	{"\u1100\u0300\u1161", 8, "\u1100\u0300\u1161"},
	{"\u0B47\u0B3E", 6, "\u0B47\u0B3E"},
	{"\u1100\u1161", 6, "\u1100\u1161"},

	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	// Sequence of decomposing characters that are starters and modifiers.
	{"\u0d4a" + strings.Repeat("\u0d3e", 31), 90, "\u0d46" + strings.Repeat("\u0d3e", 30) + cgj},

	{grave(30), 60, grave(30)},
	// U+FF9E is a starter, but decomposes to U+3099, which is not.
	{grave(30) + "\uff9e", 60, grave(30) + cgj},
	// ends with incomplete UTF-8 encoding
	{"\xCC", 0, ""},
	{"\u0300\xCC", 2, "\u0300"},
}

func decomposeSegmentF(rb *reorderBuffer, s string) (int, []byte) {
	rb.initString(NFD, s)
	rb.setFlusher(nil, appendFlush)
	p := decomposeSegment(rb, 0, true)
	return p, rb.out
}

func TestDecomposeSegment(t *testing.T) {
	runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
}

var firstBoundaryTests = []PositionTest{
	// no boundary
	{"", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	// illegal runes
	{"\xff", 0, ""},
	{"\u0300\xff", 2, ""},
	{"\u0300\xc0\x80\x80", 2, ""},
	// boundaries
	{"a", 0, ""},
	{"\u0300a", 2, ""},
	// Hangul
	{"\u1103\u1161", 0, ""},
	{"\u110B\u1173\u11B7", 0, ""},
	{"\u1161\u110B\u1173\u11B7", 3, ""},
	{"\u1173\u11B7\u1103\u1161", 6, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	{grave(maxNonStarters), 60, ""},
	{grave(maxNonStarters + 1), 60, ""},
}

func firstBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.FirstBoundary([]byte(s)), nil
}

func firstBoundaryStringF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.FirstBoundaryInString(s), nil
}

func TestFirstBoundary(t *testing.T) {
	runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
	runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
}

func TestNextBoundary(t *testing.T) {
	testCases := []struct {
		input string
		atEOF bool
		want  int
	}{
		// no boundary
		{"", true, 0},
		{"", false, -1},
		{"\u0300", true, 2},
		{"\u0300", false, -1},
		{"\x80\x80", true, 1},
		{"\x80\x80", false, 1},
		// illegal runes
		{"\xff", false, 1},
		{"\u0300\xff", false, 2},
		{"\u0300\xc0\x80\x80", false, 2},
		{"\xc2\x80\x80", false, 2},
		{"\xc2", false, -1},
		{"\xc2", true, 1},
		{"a\u0300\xc2", false, -1},
		{"a\u0300\xc2", true, 3},
		// boundaries
		{"a", true, 1},
		{"a", false, -1},
		{"aa", false, 1},
		{"\u0300", true, 2},
		{"\u0300", false, -1},
		{"\u0300a", false, 2},
		// Hangul
		{"\u1103\u1161", true, 6},
		{"\u1103\u1161", false, -1},
		{"\u110B\u1173\u11B7", false, -1},
		{"\u110B\u1173\u11B7\u110B\u1173\u11B7", false, 9},
		{"\u1161\u110B\u1173\u11B7", false, 3},
		{"\u1173\u11B7\u1103\u1161", false, 6},
		// too many combining characters.
		{grave(maxNonStarters - 1), false, -1},
		{grave(maxNonStarters), false, 60},
		{grave(maxNonStarters + 1), false, 60},
	}

	for _, tc := range testCases {
		if got := NFC.NextBoundary([]byte(tc.input), tc.atEOF); got != tc.want {
			t.Errorf("NextBoundary(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
		}
		if got := NFC.NextBoundaryInString(tc.input, tc.atEOF); got != tc.want {
			t.Errorf("NextBoundaryInString(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
		}
	}
}

var decomposeToLastTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	{"a\u0301\u0635", 5, ""},
	// ends with non-inert starter
	{"a", 0, "a"},
	{"a\u0301a", 3, "a"},
	{"a\u0301\u03B9", 3, "\u03B9"},
	{"a\u0327", 0, "a\u0327"},
	// illegal runes
	{"\xFF", 1, ""},
	{"aa\xFF", 3, ""},
	{"\xC0\x80\x80", 3, ""},
	{"\xCC\x80\x80", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"a\xCC", 2, ""},
	// ends with combining characters
	{"\u0300\u0301", 0, "\u0300\u0301"},
	{"a\u0300\u0301", 0, "a\u0300\u0301"},
	{"a\u0301\u0308", 0, "a\u0301\u0308"},
	{"a\u0308\u0301", 0, "a\u0308\u0301"},
	{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
	{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
	{"\u00C0", 0, "A\u0300"},
	{"a\u00C0", 1, "A\u0300"},
	// decomposing
	{"a\u0300\u00E0", 3, "a\u0300"},
	// multisegment decompositions (flushes leading segments)
	{"a\u0300\uFDC0", 7, "\u064A"},
	{"\uFDC0" + grave(29), 4, "\u064A" + grave(29)},
	{"\uFDC0" + grave(30), 4, "\u064A" + grave(30)},
	{"\uFDC0" + grave(31), 5, grave(30)},
	{"\uFDFA" + grave(14), 31, "\u0645" + grave(14)},
	// Overflow
	{"\u00E0" + grave(29), 0, "a" + grave(30)},
	{"\u00E0" + grave(30), 2, grave(30)},
	// Hangul
	{"a\u1103", 1, "\u1103"},
	{"a\u110B", 1, "\u110B"},
	{"a\u110B\u1173", 1, "\u110B\u1173"},
	// See comment in composition.go:compBoundaryAfter.
	{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
	{"a\uC73C", 1, "\u110B\u1173"},
	{"다음", 3, "\u110B\u1173\u11B7"},
	{"다", 0, "\u1103\u1161"},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
	{"다음음", 6, "\u110B\u1173\u11B7"},
	{"음다다", 6, "\u1103\u1161"},
	// maximized buffer
	{"a" + grave(30), 0, "a" + grave(30)},
	// Buffer overflow
	{"a" + grave(31), 3, grave(30)},
	// weird UTF-8
	{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
}

func decomposeToLast(rb *reorderBuffer, s string) (int, []byte) {
	rb.setFlusher([]byte(s), appendFlush)
	decomposeToLastBoundary(rb)
	buf := rb.flush(nil)
	return len(rb.out), buf
}

func TestDecomposeToLastBoundary(t *testing.T) {
	runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
}

var lastBoundaryTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	// ends with non-inert starter
	{"a", 0, ""},
	// illegal runes
	{"\xff", 1, ""},
	{"aa\xff", 3, ""},
	{"a\xff\u0300", 1, ""}, // TODO: should probably be 2.
	{"\xc0\x80\x80", 3, ""},
	{"\xc0\x80\x80\u0300", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"\xCC", -1, ""},
	{"\xE0\x80", -1, ""},
	{"\xF0\x80\x80", -1, ""},
	{"a\xCC", 0, ""},
	{"\x80\xCC", 1, ""},
	{"\xCC\xCC", 1, ""},
	// ends with combining characters
	{"a\u0300\u0301", 0, ""},
	{"aaaa\u0300\u0301", 3, ""},
	{"\u0300a\u0300\u0301", 2, ""},
	{"\u00C2", 0, ""},
	{"a\u00C2", 1, ""},
	// decomposition may recombine
	{"\u0226", 0, ""},
	// no boundary
	{"", -1, ""},
	{"\u0300\u0301", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	{"\x80\x80\u0301", -1, ""},
	// Hangul
	{"다음", 3, ""},
	{"다", 0, ""},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	// May still be preceded by a non-starter.
	{grave(maxNonStarters), -1, ""},
	// May still need to insert a cgj after the last combiner.
	{grave(maxNonStarters + 1), 2, ""},
	{grave(maxNonStarters + 2), 4, ""},

	{"a" + grave(maxNonStarters-1), 0, ""},
	{"a" + grave(maxNonStarters), 0, ""},
	// May still need to insert a cgj after the last combiner.
	{"a" + grave(maxNonStarters+1), 3, ""},
	{"a" + grave(maxNonStarters+2), 5, ""},
}

func lastBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.LastBoundary([]byte(s)), nil
}

func TestLastBoundary(t *testing.T) {
	runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
}

type spanTest struct {
	input string
	atEOF bool
	n     int
	err   error
}

var quickSpanTests = []spanTest{
	{"", true, 0, nil},
	// starters
	{"a", true, 1, nil},
	{"abc", true, 3, nil},
	{"\u043Eb", true, 3, nil},
	// incomplete last rune.
	{"\xCC", true, 1, nil},
	{"\xCC", false, 0, transform.ErrShortSrc},
	{"a\xCC", true, 2, nil},
	{"a\xCC", false, 0, transform.ErrShortSrc}, // TODO: could be 1 for NFD
	// incorrectly ordered combining characters
	{"\u0300\u0316", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316", false, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", false, 0, transform.ErrEndOfSpan},
	// have a maximum number of combining characters.
	{rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"a" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"Ɵ" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"aa" + rep(0x035D, 30) + "\u035B", true, 1, transform.ErrEndOfSpan},
	{rep(0x035D, 30) + cgj + "\u035B", true, 64, nil},
	{"a" + rep(0x035D, 30) + cgj + "\u035B", true, 65, nil},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},

	{"a" + rep(0x035D, 30) + cgj + "\u035B", false, 61, transform.ErrShortSrc},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
}

var quickSpanNFDTests = []spanTest{
	// needs decomposing
	{"\u00C0", true, 0, transform.ErrEndOfSpan},
	{"abc\u00C0", true, 3, transform.ErrEndOfSpan},
	// correctly ordered combining characters
	{"\u0300", true, 2, nil},
	{"ab\u0300", true, 4, nil},
	{"ab\u0300cd", true, 6, nil},
	{"\u0300cd", true, 4, nil},
	{"\u0316\u0300", true, 4, nil},
	{"ab\u0316\u0300", true, 6, nil},
	{"ab\u0316\u0300cd", true, 8, nil},
	{"ab\u0316\u0300\u00C0", true, 6, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 6, nil},
	{"\u043E\u0308b", true, 5, nil},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan}, // TODO: we could skip 'b' as well.
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 0, transform.ErrEndOfSpan},
}

var quickSpanNFCTests = []spanTest{
	// okay composed
	{"\u00C0", true, 2, nil},
	{"abc\u00C0", true, 5, nil},
	// correctly ordered combining characters
	// TODO: b may combine with modifiers, which is why this fails. We could
	// make a more precise test that actually checks whether the last
	// character combines. Probably not worth it.
	{"ab\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"\u00C0\u035D", true, 4, nil},
	// we do not special case leading combining characters
	{"\u0300cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 0, transform.ErrEndOfSpan},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 6, nil},
	{"같은", false, 3, transform.ErrShortSrc},
	// We return the start of the violating segment in case of overflow.
	{grave(30) + "\uff9e", true, 0, transform.ErrEndOfSpan},
	{grave(30), true, 0, transform.ErrEndOfSpan},
}

func runSpanTests(t *testing.T, name string, f Form, testCases []spanTest) {
	for i, tc := range testCases {
		s := fmt.Sprintf("Bytes/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
		ok := testtext.Run(t, s, func(t *testing.T) {
			n, err := f.Span([]byte(tc.input), tc.atEOF)
			if n != tc.n || err != tc.err {
				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
			}
		})
		if !ok {
			continue // Don't do the String variant if the Bytes variant failed.
		}
		s = fmt.Sprintf("String/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
		testtext.Run(t, s, func(t *testing.T) {
			n, err := f.SpanString(tc.input, tc.atEOF)
			if n != tc.n || err != tc.err {
				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
			}
		})
	}
}

func TestSpan(t *testing.T) {
	runSpanTests(t, "NFD", NFD, quickSpanTests)
	runSpanTests(t, "NFD", NFD, quickSpanNFDTests)
	runSpanTests(t, "NFC", NFC, quickSpanTests)
	runSpanTests(t, "NFC", NFC, quickSpanNFCTests)
}

var isNormalTests = []PositionTest{
	{"", 1, ""},
	// illegal runes
	{"\xff", 1, ""},
	// starters
	{"a", 1, ""},
	{"abc", 1, ""},
	{"\u043Eb", 1, ""},
	// incorrectly ordered combining characters
	{"\u0300\u0316", 0, ""},
	{"ab\u0300\u0316", 0, ""},
	{"ab\u0300\u0316cd", 0, ""},
	{"\u0300\u0316cd", 0, ""},
}
var isNormalNFDTests = []PositionTest{
	// needs decomposing
	{"\u00C0", 0, ""},
	{"abc\u00C0", 0, ""},
	// correctly ordered combining characters
	{"\u0300", 1, ""},
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"\u0300cd", 1, ""},
	{"\u0316\u0300", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	{"\u043E\u0308b", 1, ""},
	// Hangul
	{"같은", 0, ""},
}
var isNormalNFCTests = []PositionTest{
	// okay composed
	{"\u00C0", 1, ""},
	{"abc\u00C0", 1, ""},
	// need reordering
	{"a\u0300", 0, ""},
	{"a\u0300cd", 0, ""},
	{"a\u0316\u0300", 0, ""},
	{"a\u0316\u0300cd", 0, ""},
	// correctly ordered combining characters
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u00C0\u035D", 1, ""},
	{"\u0300", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	// Hangul
	{"같은", 1, ""},
}

var isNormalNFKXTests = []PositionTest{
	// Special case.
	{"\u00BC", 0, ""},
}

func isNormalF(rb *reorderBuffer, s string) (int, []byte) {
	if rb.f.form.IsNormal([]byte(s)) {
		return 1, nil
	}
	return 0, nil
}

func isNormalStringF(rb *reorderBuffer, s string) (int, []byte) {
	if rb.f.form.IsNormalString(s) {
		return 1, nil
	}
	return 0, nil
}

func TestIsNormal(t *testing.T) {
	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
	runPosTests(t, "TestIsNormalNFKD1", NFKD, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFKD2", NFKD, isNormalF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFKD3", NFKD, isNormalF, isNormalNFKXTests)
	runPosTests(t, "TestIsNormalNFKC1", NFKC, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFKC2", NFKC, isNormalF, isNormalNFCTests)
	runPosTests(t, "TestIsNormalNFKC3", NFKC, isNormalF, isNormalNFKXTests)
}

func TestIsNormalString(t *testing.T) {
	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalStringF, isNormalTests)
	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalStringF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalStringF, isNormalTests)
	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalStringF, isNormalNFCTests)
}

type AppendTest struct {
	left  string
	right string
	out   string
}

type appendFunc func(f Form, out []byte, s string) []byte

var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}

func runNormTests(t *testing.T, name string, fn appendFunc) {
	for f := NFC; f <= NFKD; f++ {
		runAppendTests(t, name, f, fn, normTests[f])
	}
}

func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
	for i, test := range tests {
		t.Run(fmt.Sprintf("%s/%d", fstr[f], i), func(t *testing.T) {
			id := pc(test.left + test.right)
			if *testn >= 0 && i != *testn {
				return
			}
			t.Run("fn", func(t *testing.T) {
				out := []byte(test.left)
				have := string(fn(f, out, test.right))
				if len(have) != len(test.out) {
					t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(test.out), pc(have), pc(test.out))
				}
				if have != test.out {
					k, pf := pidx(have, test.out)
					t.Errorf("%+q:\nwas %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(test.out[k:]))
				}
			})

			// Bootstrap by normalizing input. Ensures that the various variants
			// behave the same.
			for g := NFC; g <= NFKD; g++ {
				if f == g {
					continue
				}
				t.Run(fstr[g], func(t *testing.T) {
					want := g.String(test.left + test.right)
					have := string(fn(g, g.AppendString(nil, test.left), test.right))
					if len(have) != len(want) {
						t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(want), pc(have), pc(want))
					}
					if have != want {
						k, pf := pidx(have, want)
						t.Errorf("%+q:\nwas %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(want[k:]))
					}
				})
			}
		})
	}
}

var normTests = [][]AppendTest{
	appendTestsNFC,
	appendTestsNFD,
	appendTestsNFKC,
	appendTestsNFKD,
}

var appendTestsNFC = []AppendTest{
	{"", ascii, ascii},
	{"", txt_all, txt_all},
	{"\uff9e", grave(30), "\uff9e" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\uff9e"},

	// Tests designed for Iter.
	{ // ordering of non-composing combining characters
		"",
		"\u0305\u0316",
		"\u0316\u0305",
	},
	{ // segment overflow
		"",
		"a" + rep(0x0305, maxNonStarters+4) + "\u0316",
		"a" + rep(0x0305, maxNonStarters) + cgj + "\u0316" + rep(0x305, 4),
	},

	{ // Combine across non-blocking non-starters.
		// U+0327 COMBINING CEDILLA;Mn;202;NSM;;;;;N;NON-SPACING CEDILLA;;;;
		// U+0325 COMBINING RING BELOW;Mn;220;NSM;;;;;N;NON-SPACING RING BELOW;;;;
		"", "a\u0327\u0325", "\u1e01\u0327",
	},

	{ // Jamo V+T does not combine.
		"",
		"\u1161\u11a8",
		"\u1161\u11a8",
	},

	// Stability tests: see https://www.unicode.org/review/pr-29.html.
	{"", "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"},
	{"", "\u1100\u0300\u1161", "\u1100\u0300\u1161"},
	{"", "\u0b47\u0b3e", "\u0b4b"},
	{"", "\u1100\u1161", "\uac00"},

	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	{ // 0d4a starts a new segment.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
	},

	{ // Split combining characters.
		// TODO: don't insert CGJ before starters.
		"",
		"\u0d46" + strings.Repeat("\u0d3e", 31),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ // Split combining characters.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 30),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ // https://golang.org/issues/20079
		"",
		"\xeb\u0344",
		"\xeb\u0308\u0301",
	},

	{ // https://golang.org/issues/20079
		"",
		"\uac00" + strings.Repeat("\u0300", 30),
		"\uac00" + strings.Repeat("\u0300", 29) + "\u034f\u0300",
	},

	{ // https://golang.org/issues/20079
		"",
		"\xeb" + strings.Repeat("\u0300", 31),
		"\xeb" + strings.Repeat("\u0300", 30) + "\u034f\u0300",
	},
}

var appendTestsNFD = []AppendTest{
	// TODO: Move some of the tests here.
}

var appendTestsNFKC = []AppendTest{
	// empty buffers
	{"", "", ""},
	{"a", "", "a"},
	{"", "a", "a"},
	{"", "\u0041\u0307\u0304", "\u01E0"},
	// segment split across buffers
	{"", "a\u0300b", "\u00E0b"},
	{"a", "\u0300b", "\u00E0b"},
	{"a", "\u0300\u0316", "\u00E0\u0316"},
	{"a", "\u0316\u0300", "\u00E0\u0316"},
	{"a", "\u0300a\u0300", "\u00E0\u00E0"},
	{"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"},
	{"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"},
	{"a\u0300", "\u0327", "\u00E0\u0327"},
	{"a\u0327", "\u0300", "\u00E0\u0327"},
	{"a\u0316", "\u0300", "\u00E0\u0316"},
	{"\u0041\u0307", "\u0304", "\u01E0"},
	// Hangul
	{"", "\u110B\u1173", "\uC73C"},
	{"", "\u1103\u1161", "\uB2E4"},
	{"", "\u110B\u1173\u11B7", "\uC74C"},
	{"", "\u320E", "\x28\uAC00\x29"},
	{"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"},
	{"\u1103", "\u1161", "\uB2E4"},
	{"\u110B", "\u1173\u11B7", "\uC74C"},
	{"\u110B\u1173", "\u11B7", "\uC74C"},
	{"\uC73C", "\u11B7", "\uC74C"},
	// UTF-8 encoding split across buffers
	{"a\xCC", "\x80", "\u00E0"},
	{"a\xCC", "\x80b", "\u00E0b"},
	{"a\xCC", "\x80a\u0300", "\u00E0\u00E0"},
	{"a\xCC", "\x80\x80", "\u00E0\x80"},
	{"a\xCC", "\x80\xCC", "\u00E0\xCC"},
	{"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"},
	// ending in incomplete UTF-8 encoding
	{"", "\xCC", "\xCC"},
	{"a", "\xCC", "a\xCC"},
	{"a", "b\xCC", "ab\xCC"},
	{"\u0226", "\xCC", "\u0226\xCC"},
	// illegal runes
	{"", "\x80", "\x80"},
	{"", "\x80\x80\x80", "\x80\x80\x80"},
	{"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"},
	{"", "a\x80", "a\x80"},
	{"", "a\x80\x80\x80", "a\x80\x80\x80"},
	{"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"},
	{"a", "\x80\x80\x80", "a\x80\x80\x80"},
	// overflow
	{"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), "", strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)},
	// overflow of combining characters
	{"", grave(34), grave(30) + cgj + grave(4)},
	{"", grave(36), grave(30) + cgj + grave(6)},
	{grave(29), grave(5), grave(30) + cgj + grave(4)},
	{grave(30), grave(4), grave(30) + cgj + grave(4)},
	{grave(30), grave(3), grave(30) + cgj + grave(3)},
	{grave(30) + "\xCC", "\x80", grave(30) + cgj + grave(1)},
	{"", "\uFDFA" + grave(14), "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645" + grave(14)},
	{"", "\uFDFA" + grave(28) + "\u0316", "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645\u0316" + grave(28)},
	// - First rune has a trailing non-starter.
	{"\u00d5", grave(30), "\u00d5" + grave(29) + cgj + grave(1)},
	// - U+FF9E decomposes into a non-starter in compatibility mode. A CGJ must be
	//   inserted even when FF9E starts a new segment.
	{"\uff9e", grave(30), "\u3099" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\u3099"},
	// - Many non-starter decompositions in a row causing overflow.
	{"", rep(0x340, 31), rep(0x300, 30) + cgj + "\u0300"},
	{"", rep(0xFF9E, 31), rep(0x3099, 30) + cgj + "\u3099"},

	{"", "\u0644\u0625" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + "\u0300\u0300"},
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},

	// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
	{"", "\u0f7f" + rep(0xf71, 29) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80"},
	{"", "\u0f7f" + rep(0xf71, 28) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + "\u0f80"},
	{"", "\u0f7f" + rep(0xf81, 16), "\u0f7f" + rep(0xf71, 15) + rep(0xf80, 15) + cgj + "\u0f71\u0f80"},

	// weird UTF-8
	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
	{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
	{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
	{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
	{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
	{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
	{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},

	{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
	// large input.
	{"", strings.Repeat("a\u0300\u0316", 31), strings.Repeat("\u00E0\u0316", 31)},
	{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
	{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
	{"", "\u0041\u0307\u0304", "\u01E0"},
}

var appendTestsNFKD = []AppendTest{
	{"", "a" + grave(64), "a" + grave(30) + cgj + grave(30) + cgj + grave(4)},

	{ // segment overflow on unchanged character
		"",
		"a" + grave(64) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(4),
	},
	{ // segment overflow on unchanged character + start value
		"",
		"a" + grave(98) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(8),
	},
	{ // segment overflow on decomposition. (U+0340 decomposes to U+0300.)
		"",
		"a" + grave(59) + "\u0340",
		"a" + grave(30) + cgj + grave(30),
	},
	{ // segment overflow on non-starter decomposition
		"",
		"a" + grave(33) + "\u0340" + grave(30) + "\u0320",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after ASCII overflow
		"",
		rep('a', segSize) + grave(32) + "\u0320",
		rep('a', segSize) + grave(30) + cgj + "\u0320" + grave(2),
	},
	{ // Jamo overflow
		"",
		"\u1100\u1161" + grave(30) + "\u0320" + grave(2),
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul
		"",
		"\uac00",
		"\u1100\u1161",
	},
	{ // Hangul overflow
		"",
		"\uac00" + grave(32) + "\u0320",
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		"\uac00\uac00" + grave(32) + "\u0320",
		"\u1100\u1161\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		strings.Repeat("\uac00", 3) + grave(32) + "\u0320",
		strings.Repeat("\u1100\u1161", 3) + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // start value after cc=0
		"",
		"您您" + grave(34) + "\u0320",
		"您您" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after normalization
		"",
		"\u0300\u0320a" + grave(34) + "\u0320",
		"\u0320\u0300a" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{
		// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
		"",
		"a\u0f7f" + rep(0xf71, 29) + "\u0f81",
		"a\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80",
	},
}

func TestAppend(t *testing.T) {
	runNormTests(t, "Append", func(f Form, out []byte, s string) []byte {
		return f.Append(out, []byte(s)...)
	})
}

func TestAppendString(t *testing.T) {
	runNormTests(t, "AppendString", func(f Form, out []byte, s string) []byte {
		return f.AppendString(out, s)
	})
}

func TestBytes(t *testing.T) {
	runNormTests(t, "Bytes", func(f Form, out []byte, s string) []byte {
		buf := []byte{}
		buf = append(buf, out...)
		buf = append(buf, s...)
		return f.Bytes(buf)
	})
}

func TestString(t *testing.T) {
	runNormTests(t, "String", func(f Form, out []byte, s string) []byte {
		outs := string(out) + s
		return []byte(f.String(outs))
	})
}

func runNM(code string) (string, error) {
	// Write the file.
	tmpdir, err := os.MkdirTemp(os.TempDir(), "normalize_test")
	if err != nil {
		return "", fmt.Errorf("failed to create tmpdir: %v", err)
	}
	defer os.RemoveAll(tmpdir)
	goTool := filepath.Join(runtime.GOROOT(), "bin", "go")
	filename := filepath.Join(tmpdir, "main.go")
	if err := os.WriteFile(filename, []byte(code), 0644); err != nil {
		return "", fmt.Errorf("failed to write main.go: %v", err)
	}
	outputFile := filepath.Join(tmpdir, "main")

	// Build the binary.
	out, err := exec.Command(goTool, "build", "-o", outputFile, filename).CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("failed to execute command: %v", err)
	}

	// Get the symbols.
	out, err = exec.Command(goTool, "tool", "nm", outputFile).CombinedOutput()
	return string(out), err
}

func TestLinking(t *testing.T) {
	const prog = `
package main
import "fmt"
import "golang.org/x/text/unicode/norm"
func main() { fmt.Println(norm.%s) }
`

	baseline, errB := runNM(fmt.Sprintf(prog, "MaxSegmentSize"))
	withTables, errT := runNM(fmt.Sprintf(prog, `NFC.String("")`))
	if errB != nil || errT != nil {
		t.Skipf("TestLinking failed: %v and %v", errB, errT)
	}

	symbols := []string{"norm.formTable", "norm.nfkcValues", "norm.decomps"}
	for _, symbol := range symbols {
		if strings.Contains(baseline, symbol) {
			t.Errorf("found: %q unexpectedly", symbol)
		}
		if !strings.Contains(withTables, symbol) {
			t.Errorf("didn't find: %q unexpectedly", symbol)
		}
	}
}

func appendBench(f Form, in []byte) func() {
	buf := make([]byte, 0, 4*len(in))
	return func() {
		f.Append(buf, in...)
	}
}

func bytesBench(f Form, in []byte) func() {
	return func() {
		f.Bytes(in)
	}
}

func iterBench(f Form, in []byte) func() {
	iter := Iter{}
	return func() {
		iter.Init(f, in)
		for !iter.Done() {
			iter.Next()
		}
	}
}

func transformBench(f Form, in []byte) func() {
	buf := make([]byte, 4*len(in))
	return func() {
		if _, n, err := f.Transform(buf, in, true); err != nil || len(in) != n {
			log.Panic(n, len(in), err)
		}
	}
}

func readerBench(f Form, in []byte) func() {
	buf := make([]byte, 4*len(in))
	return func() {
		r := f.Reader(bytes.NewReader(in))
		var err error
		for err == nil {
			_, err = r.Read(buf)
		}
		if err != io.EOF {
			panic("")
		}
	}
}

func writerBench(f Form, in []byte) func() {
	buf := make([]byte, 0, 4*len(in))
	return func() {
		r := f.Writer(bytes.NewBuffer(buf))
		if _, err := r.Write(in); err != nil {
			panic("")
		}
	}
}

func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
	bm = append(bm, appendBench(f, in))
	bm = append(bm, iterBench(f, in))
	bm = append(bm, transformBench(f, in))
	bm = append(bm, readerBench(f, in))
	bm = append(bm, writerBench(f, in))
	return bm
}

func doFormBenchmark(b *testing.B, inf, f Form, s string) {
	b.StopTimer()
	in := inf.Bytes([]byte(s))
	bm := appendBenchmarks(nil, f, in)
	b.SetBytes(int64(len(in) * len(bm)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		for _, fn := range bm {
			fn()
		}
	}
}

func doSingle(b *testing.B, f func(Form, []byte) func(), s []byte) {
	b.StopTimer()
	fn := f(NFC, s)
	b.SetBytes(int64(len(s)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		fn()
	}
}

var (
	smallNoChange = []byte("nörmalization")
	smallChange   = []byte("No\u0308rmalization")
	ascii         = strings.Repeat("There is nothing to change here! ", 500)
)

func lowerBench(f Form, in []byte) func() {
	// Use package strings instead of bytes as it doesn't allocate memory
	// if there aren't any changes.
	s := string(in)
	return func() {
		strings.ToLower(s)
	}
}

func BenchmarkLowerCaseNoChange(b *testing.B) {
	doSingle(b, lowerBench, smallNoChange)
}
func BenchmarkLowerCaseChange(b *testing.B) {
	doSingle(b, lowerBench, smallChange)
}

func quickSpanBench(f Form, in []byte) func() {
	return func() {
		f.QuickSpan(in)
	}
}

func BenchmarkQuickSpanChangeNFC(b *testing.B) {
	doSingle(b, quickSpanBench, smallNoChange)
}

func BenchmarkBytesNoChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallNoChange)
}
func BenchmarkBytesChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallChange)
}

func BenchmarkAppendNoChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallNoChange)
}
func BenchmarkAppendChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallChange)
}
func BenchmarkAppendLargeNFC(b *testing.B) {
	doSingle(b, appendBench, txt_all_bytes)
}

func BenchmarkIterNoChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallNoChange)
}
func BenchmarkIterChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallChange)
}
func BenchmarkIterLargeNFC(b *testing.B) {
	doSingle(b, iterBench, txt_all_bytes)
}

func BenchmarkTransformNoChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallNoChange)
}
func BenchmarkTransformChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallChange)
}
func BenchmarkTransformLargeNFC(b *testing.B) {
	doSingle(b, transformBench, txt_all_bytes)
}

func BenchmarkNormalizeAsciiNFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, ascii)
}
func BenchmarkNormalizeAsciiNFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, ascii)
}
func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
	doFormBenchmark(b, NFC, NFKC, ascii)
}
func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
	doFormBenchmark(b, NFC, NFKD, ascii)
}

func BenchmarkNormalizeNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_all)
}
func BenchmarkNormalizeNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_all)
}
func BenchmarkNormalizeNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_all)
}
func BenchmarkNormalizeNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_all)
}

// Hangul is often special-cased, so we test it separately.
func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_kr)
}
func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_kr)
}
func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_kr)
}
func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_kr)
}

var forms = []Form{NFC, NFD, NFKC, NFKD}

func doTextBenchmark(b *testing.B, s string) {
	b.StopTimer()
	in := []byte(s)
	bm := []func(){}
	for _, f := range forms {
		bm = appendBenchmarks(bm, f, in)
	}
	b.SetBytes(int64(len(s) * len(bm)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		for _, f := range bm {
			f()
		}
	}
}

func BenchmarkCanonicalOrdering(b *testing.B) {
	doTextBenchmark(b, txt_canon)
}
func BenchmarkExtendedLatin(b *testing.B) {
	doTextBenchmark(b, txt_vn)
}
func BenchmarkMiscTwoByteUtf8(b *testing.B) {
	doTextBenchmark(b, twoByteUtf8)
}
func BenchmarkMiscThreeByteUtf8(b *testing.B) {
	doTextBenchmark(b, threeByteUtf8)
}
func BenchmarkHangul(b *testing.B) {
	doTextBenchmark(b, txt_kr)
}
func BenchmarkJapanese(b *testing.B) {
	doTextBenchmark(b, txt_jp)
}
func BenchmarkChinese(b *testing.B) {
	doTextBenchmark(b, txt_cn)
}
func BenchmarkOverflow(b *testing.B) {
	doTextBenchmark(b, overflow)
}

var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"

// Tests sampled from the Canonical ordering tests (Part 2) of
// https://unicode.org/Public/UNIDATA/NormalizationTest.txt
const txt_canon = `\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062
\u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062
\u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062
\u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062
\u0061\u059A\u0316\u302A\u0339 \u0061\u0341\u0315\u0300\u05AE\u0062
\u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062
\u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062
\u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062
\u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062
\u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062
\u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062
\u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062
\u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062
\u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062
\u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062
\u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062
\u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062
\u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả.
Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ
nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc
một giấy phép khác có các điều khoản tương tự như giấy phép này
cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
chúng theo quy định của pháp luật thì tình trạng của nó không
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`

// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
const txt_ru = `При обязательном соблюдении следующих условий:
Attribution — Вы должны атрибутировать произведение (указывать
автора и источник) в порядке, предусмотренном автором или
лицензиаром (но только так, чтобы никоим образом не подразумевалось,
что они поддерживают вас или использование вами данного произведения).
Υπό τις ακόλουθες προϋποθέσεις:`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
(χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
παρόμοια άδεια.`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
لهذا الترخيص.`

// Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`

const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il

// Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
(Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다).
동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
라이선스와 동일한 라이선스를 적용해야 합니다.`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`

const threeByteUtf8 = txt_th

// Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
const txt_jp = `あなたの従うべき条件は以下の通りです。
表示 — あなたは原著作者のクレジットを表示しなければなりません。
継承 — もしあなたがこの作品を改変、変形または加工した場合、
あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
頒布することができます。`

// http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件:
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`

const txt_cjk = txt_cn + txt_jp + txt_kr
const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk

var txt_all_bytes = []byte(txt_all)
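
// exampleFormUsageSketch is an illustrative sketch of the exported Form API
// that the tests and benchmarks above exercise (String, Bytes, Append,
// IsNormalString, QuickSpan). The function name and the sample inputs are
// assumptions chosen for demonstration only; it is not itself a test.
func exampleFormUsageSketch() {
	s := "e\u0301" // 'e' followed by U+0301 COMBINING ACUTE ACCENT

	_ = NFC.String(s)                 // composes to "\u00E9"
	_ = NFD.Bytes([]byte("\u00E9"))   // decomposes to "e\u0301"
	_ = NFC.IsNormalString(s)         // false: the input is not composed
	_ = NFC.Append(nil, []byte(s)...) // appends the NFC form to a destination slice
	_ = NFC.QuickSpan([]byte(s))      // length of the prefix already in NFC form
}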