github.com/go-xe2/third@v1.0.3/golang.org/x/text/unicode/norm/normalize_test.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

import (
	"bytes"
	"flag"
	"fmt"
	"io"
	"log"
	"strings"
	"testing"
	"unicode/utf8"

	"github.com/go-xe2/third/golang.org/x/text/internal/testtext"
	"github.com/go-xe2/third/golang.org/x/text/transform"
)

var (
	testn = flag.Int("testn", -1, "specific test number to run or -1 for all")
)

// pc replaces any rune r that is repeated n times, for n > 1, with r{n}.
func pc(s string) []byte {
	b := bytes.NewBuffer(make([]byte, 0, len(s)))
	for i := 0; i < len(s); {
		r, sz := utf8.DecodeRuneInString(s[i:])
		n := 0
		if sz == 1 {
			// Special-case one-byte case to handle repetition for invalid UTF-8.
			for c := s[i]; i+n < len(s) && s[i+n] == c; n++ {
			}
		} else {
			for _, r2 := range s[i:] {
				if r2 != r {
					break
				}
				n++
			}
		}
		b.WriteString(s[i : i+sz])
		if n > 1 {
			fmt.Fprintf(b, "{%d}", n)
		}
		i += sz * n
	}
	return b.Bytes()
}

// pidx finds the index from which two strings start to differ, plus context.
// It returns the index and ellipsis if the index is greater than 0.
func pidx(a, b string) (i int, prefix string) {
	for ; i < len(a) && i < len(b) && a[i] == b[i]; i++ {
	}
	if i < 8 {
		return 0, ""
	}
	i -= 3 // ensure taking at least one full rune before the difference.
	for k := i - 7; i > k && !utf8.RuneStart(a[i]); i-- {
	}
	return i, "..."
}

type PositionTest struct {
	input  string
	pos    int
	buffer string // expected contents of reorderBuffer, if applicable
}

type positionFunc func(rb *reorderBuffer, s string) (int, []byte)

func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
	rb := reorderBuffer{}
	rb.init(f, nil)
	for i, test := range tests {
		rb.reset()
		rb.src = inputString(test.input)
		rb.nsrc = len(test.input)
		pos, out := fn(&rb, test.input)
		if pos != test.pos {
			t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
		}
		if outs := string(out); outs != test.buffer {
			k, pfx := pidx(outs, test.buffer)
			t.Errorf("%s:%d: buffer \nwas %s%+q; \nwant %s%+q", name, i, pfx, pc(outs[k:]), pfx, pc(test.buffer[k:]))
		}
	}
}

func grave(n int) string {
	return rep(0x0300, n)
}

func rep(r rune, n int) string {
	return strings.Repeat(string(r), n)
}

const segSize = maxByteBufferSize

var cgj = GraphemeJoiner

var decomposeSegmentTests = []PositionTest{
	// illegal runes
	{"\xC2", 0, ""},
	{"\xC0", 1, "\xC0"},
	{"\u00E0\x80", 2, "\u0061\u0300"},
	// starter
	{"a", 1, "a"},
	{"ab", 1, "a"},
	// starter + composing
	{"a\u0300", 3, "a\u0300"},
	{"a\u0300b", 3, "a\u0300"},
	// with decomposition
	{"\u00C0", 2, "A\u0300"},
	{"\u00C0b", 2, "A\u0300"},
	// long
	{grave(31), 60, grave(30) + cgj},
	{"a" + grave(31), 61, "a" + grave(30) + cgj},

	// Stability tests: see http://www.unicode.org/review/pr-29.html.
	// U+0300 COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;
	// U+0B47 ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
	// U+0B3E ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
	// U+1100 HANGUL CHOSEONG KIYEOK;Lo;0;L;;;;;N;;;;;
	// U+1161 HANGUL JUNGSEONG A;Lo;0;L;;;;;N;;;;;
	{"\u0B47\u0300\u0B3E", 8, "\u0B47\u0300\u0B3E"},
	{"\u1100\u0300\u1161", 8, "\u1100\u0300\u1161"},
	{"\u0B47\u0B3E", 6, "\u0B47\u0B3E"},
	{"\u1100\u1161", 6, "\u1100\u1161"},

	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	// Sequence of decomposing characters that are starters and modifiers.
	{"\u0d4a" + strings.Repeat("\u0d3e", 31), 90, "\u0d46" + strings.Repeat("\u0d3e", 30) + cgj},

	{grave(30), 60, grave(30)},
	// U+FF9E is a starter, but decomposes to U+3099, which is not.
	{grave(30) + "\uff9e", 60, grave(30) + cgj},
	// ends with incomplete UTF-8 encoding
	{"\xCC", 0, ""},
	{"\u0300\xCC", 2, "\u0300"},
}

func decomposeSegmentF(rb *reorderBuffer, s string) (int, []byte) {
	rb.initString(NFD, s)
	rb.setFlusher(nil, appendFlush)
	p := decomposeSegment(rb, 0, true)
	return p, rb.out
}

func TestDecomposeSegment(t *testing.T) {
	runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
}

var firstBoundaryTests = []PositionTest{
	// no boundary
	{"", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	// illegal runes
	{"\xff", 0, ""},
	{"\u0300\xff", 2, ""},
	{"\u0300\xc0\x80\x80", 2, ""},
	// boundaries
	{"a", 0, ""},
	{"\u0300a", 2, ""},
	// Hangul
	{"\u1103\u1161", 0, ""},
	{"\u110B\u1173\u11B7", 0, ""},
	{"\u1161\u110B\u1173\u11B7", 3, ""},
	{"\u1173\u11B7\u1103\u1161", 6, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	{grave(maxNonStarters), 60, ""},
	{grave(maxNonStarters + 1), 60, ""},
}

func firstBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.FirstBoundary([]byte(s)), nil
}

func firstBoundaryStringF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.FirstBoundaryInString(s), nil
}

func TestFirstBoundary(t *testing.T) {
	runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
	runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
}

func TestNextBoundary(t *testing.T) {
	testCases := []struct {
		input string
		atEOF bool
		want  int
	}{
		// no boundary
		{"", true, 0},
		{"", false, -1},
		{"\u0300", true, 2},
		{"\u0300", false, -1},
		{"\x80\x80", true, 1},
		{"\x80\x80", false, 1},
		// illegal runes
		{"\xff", false, 1},
		{"\u0300\xff", false, 2},
		{"\u0300\xc0\x80\x80", false, 2},
		{"\xc2\x80\x80", false, 2},
		{"\xc2", false, -1},
		{"\xc2", true, 1},
		{"a\u0300\xc2", false, -1},
		{"a\u0300\xc2", true, 3},
		// boundaries
		{"a", true, 1},
		{"a", false, -1},
		{"aa", false, 1},
		{"\u0300", true, 2},
		{"\u0300", false, -1},
		{"\u0300a", false, 2},
		// Hangul
		{"\u1103\u1161", true, 6},
		{"\u1103\u1161", false, -1},
		{"\u110B\u1173\u11B7", false, -1},
		{"\u110B\u1173\u11B7\u110B\u1173\u11B7", false, 9},
		{"\u1161\u110B\u1173\u11B7", false, 3},
		{"\u1173\u11B7\u1103\u1161", false, 6},
		// too many combining characters.
		{grave(maxNonStarters - 1), false, -1},
		{grave(maxNonStarters), false, 60},
		{grave(maxNonStarters + 1), false, 60},
	}

	for _, tc := range testCases {
		if got := NFC.NextBoundary([]byte(tc.input), tc.atEOF); got != tc.want {
			t.Errorf("NextBoundary(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
		}
		if got := NFC.NextBoundaryInString(tc.input, tc.atEOF); got != tc.want {
			t.Errorf("NextBoundaryInString(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
		}
	}
}

var decomposeToLastTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	{"a\u0301\u0635", 5, ""},
	// ends with non-inert starter
	{"a", 0, "a"},
	{"a\u0301a", 3, "a"},
	{"a\u0301\u03B9", 3, "\u03B9"},
	{"a\u0327", 0, "a\u0327"},
	// illegal runes
	{"\xFF", 1, ""},
	{"aa\xFF", 3, ""},
	{"\xC0\x80\x80", 3, ""},
	{"\xCC\x80\x80", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"a\xCC", 2, ""},
	// ends with combining characters
	{"\u0300\u0301", 0, "\u0300\u0301"},
	{"a\u0300\u0301", 0, "a\u0300\u0301"},
	{"a\u0301\u0308", 0, "a\u0301\u0308"},
	{"a\u0308\u0301", 0, "a\u0308\u0301"},
	{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
	{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
	{"\u00C0", 0, "A\u0300"},
	{"a\u00C0", 1, "A\u0300"},
	// decomposing
	{"a\u0300\u00E0", 3, "a\u0300"},
	// multisegment decompositions (flushes leading segments)
	{"a\u0300\uFDC0", 7, "\u064A"},
	{"\uFDC0" + grave(29), 4, "\u064A" + grave(29)},
	{"\uFDC0" + grave(30), 4, "\u064A" + grave(30)},
	{"\uFDC0" + grave(31), 5, grave(30)},
	{"\uFDFA" + grave(14), 31, "\u0645" + grave(14)},
	// Overflow
	{"\u00E0" + grave(29), 0, "a" + grave(30)},
	{"\u00E0" + grave(30), 2, grave(30)},
	// Hangul
	{"a\u1103", 1, "\u1103"},
	{"a\u110B", 1, "\u110B"},
	{"a\u110B\u1173", 1, "\u110B\u1173"},
	// See comment in composition.go:compBoundaryAfter.
	{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
	{"a\uC73C", 1, "\u110B\u1173"},
	{"다음", 3, "\u110B\u1173\u11B7"},
	{"다", 0, "\u1103\u1161"},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
	{"다음음", 6, "\u110B\u1173\u11B7"},
	{"음다다", 6, "\u1103\u1161"},
	// maximized buffer
	{"a" + grave(30), 0, "a" + grave(30)},
	// Buffer overflow
	{"a" + grave(31), 3, grave(30)},
	// weird UTF-8
	{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
}

func decomposeToLast(rb *reorderBuffer, s string) (int, []byte) {
	rb.setFlusher([]byte(s), appendFlush)
	decomposeToLastBoundary(rb)
	buf := rb.flush(nil)
	return len(rb.out), buf
}

func TestDecomposeToLastBoundary(t *testing.T) {
	runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
}

var lastBoundaryTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	// ends with non-inert starter
	{"a", 0, ""},
	// illegal runes
	{"\xff", 1, ""},
	{"aa\xff", 3, ""},
	{"a\xff\u0300", 1, ""}, // TODO: should probably be 2.
	{"\xc0\x80\x80", 3, ""},
	{"\xc0\x80\x80\u0300", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"\xCC", -1, ""},
	{"\xE0\x80", -1, ""},
	{"\xF0\x80\x80", -1, ""},
	{"a\xCC", 0, ""},
	{"\x80\xCC", 1, ""},
	{"\xCC\xCC", 1, ""},
	// ends with combining characters
	{"a\u0300\u0301", 0, ""},
	{"aaaa\u0300\u0301", 3, ""},
	{"\u0300a\u0300\u0301", 2, ""},
	{"\u00C2", 0, ""},
	{"a\u00C2", 1, ""},
	// decomposition may recombine
	{"\u0226", 0, ""},
	// no boundary
	{"", -1, ""},
	{"\u0300\u0301", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	{"\x80\x80\u0301", -1, ""},
	// Hangul
	{"다음", 3, ""},
	{"다", 0, ""},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	// May still be preceded with a non-starter.
	{grave(maxNonStarters), -1, ""},
	// May still need to insert a cgj after the last combiner.
	{grave(maxNonStarters + 1), 2, ""},
	{grave(maxNonStarters + 2), 4, ""},

	{"a" + grave(maxNonStarters-1), 0, ""},
	{"a" + grave(maxNonStarters), 0, ""},
	// May still need to insert a cgj after the last combiner.
	{"a" + grave(maxNonStarters+1), 3, ""},
	{"a" + grave(maxNonStarters+2), 5, ""},
}

func lastBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.LastBoundary([]byte(s)), nil
}

func TestLastBoundary(t *testing.T) {
	runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
}

type spanTest struct {
	input string
	atEOF bool
	n     int
	err   error
}

var quickSpanTests = []spanTest{
	{"", true, 0, nil},
	// starters
	{"a", true, 1, nil},
	{"abc", true, 3, nil},
	{"\u043Eb", true, 3, nil},
	// incomplete last rune.
	{"\xCC", true, 1, nil},
	{"\xCC", false, 0, transform.ErrShortSrc},
	{"a\xCC", true, 2, nil},
	{"a\xCC", false, 0, transform.ErrShortSrc}, // TODO: could be 1 for NFD
	// incorrectly ordered combining characters
	{"\u0300\u0316", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316", false, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", false, 0, transform.ErrEndOfSpan},
	// have a maximum number of combining characters.
	{rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"a" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"Ɵ" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"aa" + rep(0x035D, 30) + "\u035B", true, 1, transform.ErrEndOfSpan},
	{rep(0x035D, 30) + cgj + "\u035B", true, 64, nil},
	{"a" + rep(0x035D, 30) + cgj + "\u035B", true, 65, nil},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},

	{"a" + rep(0x035D, 30) + cgj + "\u035B", false, 61, transform.ErrShortSrc},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
}

var quickSpanNFDTests = []spanTest{
	// needs decomposing
	{"\u00C0", true, 0, transform.ErrEndOfSpan},
	{"abc\u00C0", true, 3, transform.ErrEndOfSpan},
	// correctly ordered combining characters
	{"\u0300", true, 2, nil},
	{"ab\u0300", true, 4, nil},
	{"ab\u0300cd", true, 6, nil},
	{"\u0300cd", true, 4, nil},
	{"\u0316\u0300", true, 4, nil},
	{"ab\u0316\u0300", true, 6, nil},
	{"ab\u0316\u0300cd", true, 8, nil},
	{"ab\u0316\u0300\u00C0", true, 6, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 6, nil},
	{"\u043E\u0308b", true, 5, nil},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan}, // TODO: we could skip 'b' as well.
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 0, transform.ErrEndOfSpan},
}

var quickSpanNFCTests = []spanTest{
	// okay composed
	{"\u00C0", true, 2, nil},
	{"abc\u00C0", true, 5, nil},
	// correctly ordered combining characters
	// TODO: b may combine with modifiers, which is why this fails. We could
	// make a more precise test that actually checks whether the last
	// character combines. Probably not worth it.
	{"ab\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"\u00C0\u035D", true, 4, nil},
	// we do not special case leading combining characters
	{"\u0300cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 0, transform.ErrEndOfSpan},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 6, nil},
	{"같은", false, 3, transform.ErrShortSrc},
	// We return the start of the violating segment in case of overflow.
	{grave(30) + "\uff9e", true, 0, transform.ErrEndOfSpan},
	{grave(30), true, 0, transform.ErrEndOfSpan},
}

func runSpanTests(t *testing.T, name string, f Form, testCases []spanTest) {
	for i, tc := range testCases {
		s := fmt.Sprintf("Bytes/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
		ok := testtext.Run(t, s, func(t *testing.T) {
			n, err := f.Span([]byte(tc.input), tc.atEOF)
			if n != tc.n || err != tc.err {
				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
			}
		})
		if !ok {
			continue // Don't do the String variant if the Bytes variant failed.
		}
		s = fmt.Sprintf("String/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
		testtext.Run(t, s, func(t *testing.T) {
			n, err := f.SpanString(tc.input, tc.atEOF)
			if n != tc.n || err != tc.err {
				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
			}
		})
	}
}

func TestSpan(t *testing.T) {
	runSpanTests(t, "NFD", NFD, quickSpanTests)
	runSpanTests(t, "NFD", NFD, quickSpanNFDTests)
	runSpanTests(t, "NFC", NFC, quickSpanTests)
	runSpanTests(t, "NFC", NFC, quickSpanNFCTests)
}

var isNormalTests = []PositionTest{
	{"", 1, ""},
	// illegal runes
	{"\xff", 1, ""},
	// starters
	{"a", 1, ""},
	{"abc", 1, ""},
	{"\u043Eb", 1, ""},
	// incorrectly ordered combining characters
	{"\u0300\u0316", 0, ""},
	{"ab\u0300\u0316", 0, ""},
	{"ab\u0300\u0316cd", 0, ""},
	{"\u0300\u0316cd", 0, ""},
}
var isNormalNFDTests = []PositionTest{
	// needs decomposing
	{"\u00C0", 0, ""},
	{"abc\u00C0", 0, ""},
	// correctly ordered combining characters
	{"\u0300", 1, ""},
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"\u0300cd", 1, ""},
	{"\u0316\u0300", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	{"\u043E\u0308b", 1, ""},
	// Hangul
	{"같은", 0, ""},
}
var isNormalNFCTests = []PositionTest{
	// okay composed
	{"\u00C0", 1, ""},
	{"abc\u00C0", 1, ""},
	// need reordering
	{"a\u0300", 0, ""},
	{"a\u0300cd", 0, ""},
	{"a\u0316\u0300", 0, ""},
	{"a\u0316\u0300cd", 0, ""},
	// correctly ordered combining characters
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u00C0\u035D", 1, ""},
	{"\u0300", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	// Hangul
	{"같은", 1, ""},
}

var isNormalNFKXTests = []PositionTest{
	// Special case.
	{"\u00BC", 0, ""},
}

func isNormalF(rb *reorderBuffer, s string) (int, []byte) {
	if rb.f.form.IsNormal([]byte(s)) {
		return 1, nil
	}
	return 0, nil
}

func isNormalStringF(rb *reorderBuffer, s string) (int, []byte) {
	if rb.f.form.IsNormalString(s) {
		return 1, nil
	}
	return 0, nil
}

func TestIsNormal(t *testing.T) {
	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
	runPosTests(t, "TestIsNormalNFKD1", NFKD, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFKD2", NFKD, isNormalF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFKD3", NFKD, isNormalF, isNormalNFKXTests)
	runPosTests(t, "TestIsNormalNFKC1", NFKC, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFKC2", NFKC, isNormalF, isNormalNFCTests)
	runPosTests(t, "TestIsNormalNFKC3", NFKC, isNormalF, isNormalNFKXTests)
}

func TestIsNormalString(t *testing.T) {
	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalStringF, isNormalTests)
	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalStringF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalStringF, isNormalTests)
	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalStringF, isNormalNFCTests)
}

type AppendTest struct {
	left  string
	right string
	out   string
}

type appendFunc func(f Form, out []byte, s string) []byte

var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}

func runNormTests(t *testing.T, name string, fn appendFunc) {
	for f := NFC; f <= NFKD; f++ {
		runAppendTests(t, name, f, fn, normTests[f])
	}
}

func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
	for i, test := range tests {
		t.Run(fmt.Sprintf("%s/%d", fstr[f], i), func(t *testing.T) {
			id := pc(test.left + test.right)
			if *testn >= 0 && i != *testn {
				return
			}
			t.Run("fn", func(t *testing.T) {
				out := []byte(test.left)
				have := string(fn(f, out, test.right))
				if len(have) != len(test.out) {
					t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(test.out), pc(have), pc(test.out))
				}
				if have != test.out {
					k, pf := pidx(have, test.out)
					t.Errorf("%+q:\nwas %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(test.out[k:]))
				}
			})

			// Bootstrap by normalizing input. Ensures that the various variants
			// behave the same.
			for g := NFC; g <= NFKD; g++ {
				if f == g {
					continue
				}
				t.Run(fstr[g], func(t *testing.T) {
					want := g.String(test.left + test.right)
					have := string(fn(g, g.AppendString(nil, test.left), test.right))
					if len(have) != len(want) {
						t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(want), pc(have), pc(want))
					}
					if have != want {
						k, pf := pidx(have, want)
						t.Errorf("%+q:\nwas %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(want[k:]))
					}
				})
			}
		})
	}
}

var normTests = [][]AppendTest{
	appendTestsNFC,
	appendTestsNFD,
	appendTestsNFKC,
	appendTestsNFKD,
}

var appendTestsNFC = []AppendTest{
	{"", ascii, ascii},
	{"", txt_all, txt_all},
	{"\uff9e", grave(30), "\uff9e" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\uff9e"},

	// Tests designed for Iter.
	{ // ordering of non-composing combining characters
		"",
		"\u0305\u0316",
		"\u0316\u0305",
	},
	{ // segment overflow
		"",
		"a" + rep(0x0305, maxNonStarters+4) + "\u0316",
		"a" + rep(0x0305, maxNonStarters) + cgj + "\u0316" + rep(0x305, 4),
	},

	{ // Combine across non-blocking non-starters.
		// U+0327 COMBINING CEDILLA;Mn;202;NSM;;;;;N;NON-SPACING CEDILLA;;;;
		// U+0325 COMBINING RING BELOW;Mn;220;NSM;;;;;N;NON-SPACING RING BELOW;;;;
		"", "a\u0327\u0325", "\u1e01\u0327",
	},

	{ // Jamo V+T does not combine.
		"",
		"\u1161\u11a8",
		"\u1161\u11a8",
	},

	// Stability tests: see http://www.unicode.org/review/pr-29.html.
	{"", "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"},
	{"", "\u1100\u0300\u1161", "\u1100\u0300\u1161"},
	{"", "\u0b47\u0b3e", "\u0b4b"},
	{"", "\u1100\u1161", "\uac00"},

	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	{ // 0d4a starts a new segment.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
	},

	{ // Split combining characters.
		// TODO: don't insert CGJ before starters.
		"",
		"\u0d46" + strings.Repeat("\u0d3e", 31),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ // Split combining characters.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 30),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ // https://golang.org/issues/20079
		"",
		"\xeb\u0344",
		"\xeb\u0308\u0301",
	},

	{ // https://golang.org/issues/20079
		"",
		"\uac00" + strings.Repeat("\u0300", 30),
		"\uac00" + strings.Repeat("\u0300", 29) + "\u034f\u0300",
	},

	{ // https://golang.org/issues/20079
		"",
		"\xeb" + strings.Repeat("\u0300", 31),
		"\xeb" + strings.Repeat("\u0300", 30) + "\u034f\u0300",
	},
}

var appendTestsNFD = []AppendTest{
	// TODO: Move some of the tests here.
}

var appendTestsNFKC = []AppendTest{
	// empty buffers
	{"", "", ""},
	{"a", "", "a"},
	{"", "a", "a"},
	{"", "\u0041\u0307\u0304", "\u01E0"},
	// segment split across buffers
	{"", "a\u0300b", "\u00E0b"},
	{"a", "\u0300b", "\u00E0b"},
	{"a", "\u0300\u0316", "\u00E0\u0316"},
	{"a", "\u0316\u0300", "\u00E0\u0316"},
	{"a", "\u0300a\u0300", "\u00E0\u00E0"},
	{"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"},
	{"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"},
	{"a\u0300", "\u0327", "\u00E0\u0327"},
	{"a\u0327", "\u0300", "\u00E0\u0327"},
	{"a\u0316", "\u0300", "\u00E0\u0316"},
	{"\u0041\u0307", "\u0304", "\u01E0"},
	// Hangul
	{"", "\u110B\u1173", "\uC73C"},
	{"", "\u1103\u1161", "\uB2E4"},
	{"", "\u110B\u1173\u11B7", "\uC74C"},
	{"", "\u320E", "\x28\uAC00\x29"},
	{"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"},
	{"\u1103", "\u1161", "\uB2E4"},
	{"\u110B", "\u1173\u11B7", "\uC74C"},
	{"\u110B\u1173", "\u11B7", "\uC74C"},
	{"\uC73C", "\u11B7", "\uC74C"},
	// UTF-8 encoding split across buffers
	{"a\xCC", "\x80", "\u00E0"},
	{"a\xCC", "\x80b", "\u00E0b"},
	{"a\xCC", "\x80a\u0300", "\u00E0\u00E0"},
	{"a\xCC", "\x80\x80", "\u00E0\x80"},
	{"a\xCC", "\x80\xCC", "\u00E0\xCC"},
	{"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"},
	// ending in incomplete UTF-8 encoding
	{"", "\xCC", "\xCC"},
	{"a", "\xCC", "a\xCC"},
	{"a", "b\xCC", "ab\xCC"},
	{"\u0226", "\xCC", "\u0226\xCC"},
	// illegal runes
	{"", "\x80", "\x80"},
	{"", "\x80\x80\x80", "\x80\x80\x80"},
	{"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"},
	{"", "a\x80", "a\x80"},
	{"", "a\x80\x80\x80", "a\x80\x80\x80"},
	{"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"},
	{"a", "\x80\x80\x80", "a\x80\x80\x80"},
	// overflow
	{"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), "", strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)},
	// overflow of combining characters
	{"", grave(34), grave(30) + cgj + grave(4)},
	{"", grave(36), grave(30) + cgj + grave(6)},
	{grave(29), grave(5), grave(30) + cgj + grave(4)},
	{grave(30), grave(4), grave(30) + cgj + grave(4)},
	{grave(30), grave(3), grave(30) + cgj + grave(3)},
	{grave(30) + "\xCC", "\x80", grave(30) + cgj + grave(1)},
	{"", "\uFDFA" + grave(14), "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645" + grave(14)},
	{"", "\uFDFA" + grave(28) + "\u0316", "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645\u0316" + grave(28)},
	// - First rune has a trailing non-starter.
	{"\u00d5", grave(30), "\u00d5" + grave(29) + cgj + grave(1)},
	// - U+FF9E decomposes into a non-starter in compatibility mode. A CGJ must be
	//   inserted even when FF9E starts a new segment.
	{"\uff9e", grave(30), "\u3099" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\u3099"},
	// - Many non-starter decompositions in a row causing overflow.
	{"", rep(0x340, 31), rep(0x300, 30) + cgj + "\u0300"},
	{"", rep(0xFF9E, 31), rep(0x3099, 30) + cgj + "\u3099"},

	{"", "\u0644\u0625" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + "\u0300\u0300"},
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},

	// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
	{"", "\u0f7f" + rep(0xf71, 29) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80"},
	{"", "\u0f7f" + rep(0xf71, 28) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + "\u0f80"},
	{"", "\u0f7f" + rep(0xf81, 16), "\u0f7f" + rep(0xf71, 15) + rep(0xf80, 15) + cgj + "\u0f71\u0f80"},

	// weird UTF-8
	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
	{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
	{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
	{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
	{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
	{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
	{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},

	{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
	// large input.
	{"", strings.Repeat("a\u0300\u0316", 31), strings.Repeat("\u00E0\u0316", 31)},
	{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
	{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
	{"", "\u0041\u0307\u0304", "\u01E0"},
}

var appendTestsNFKD = []AppendTest{
	{"", "a" + grave(64), "a" + grave(30) + cgj + grave(30) + cgj + grave(4)},

	{ // segment overflow on unchanged character
		"",
		"a" + grave(64) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(4),
	},
	{ // segment overflow on unchanged character + start value
		"",
		"a" + grave(98) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(8),
	},
	{ // segment overflow on decomposition. (U+0340 decomposes to U+0300.)
		"",
		"a" + grave(59) + "\u0340",
		"a" + grave(30) + cgj + grave(30),
	},
	{ // segment overflow on non-starter decomposition
		"",
		"a" + grave(33) + "\u0340" + grave(30) + "\u0320",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after ASCII overflow
		"",
		rep('a', segSize) + grave(32) + "\u0320",
		rep('a', segSize) + grave(30) + cgj + "\u0320" + grave(2),
	},
	{ // Jamo overflow
		"",
		"\u1100\u1161" + grave(30) + "\u0320" + grave(2),
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul
		"",
		"\uac00",
		"\u1100\u1161",
	},
	{ // Hangul overflow
		"",
		"\uac00" + grave(32) + "\u0320",
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		"\uac00\uac00" + grave(32) + "\u0320",
		"\u1100\u1161\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		strings.Repeat("\uac00", 3) + grave(32) + "\u0320",
		strings.Repeat("\u1100\u1161", 3) + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // start value after cc=0
		"",
		"您您" + grave(34) + "\u0320",
		"您您" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after normalization
		"",
		"\u0300\u0320a" + grave(34) + "\u0320",
		"\u0320\u0300a" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{
		// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
		"",
		"a\u0f7f" + rep(0xf71, 29) + "\u0f81",
		"a\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80",
	},
}

func TestAppend(t *testing.T) {
	runNormTests(t, "Append", func(f Form, out []byte, s string) []byte {
		return f.Append(out, []byte(s)...)
	})
}

func TestAppendString(t *testing.T) {
	runNormTests(t, "AppendString", func(f Form, out []byte, s string) []byte {
		return f.AppendString(out, s)
	})
}

func TestBytes(t *testing.T) {
	runNormTests(t, "Bytes", func(f Form, out []byte, s string) []byte {
		buf := []byte{}
		buf = append(buf, out...)
		buf = append(buf, s...)
		return f.Bytes(buf)
	})
}

func TestString(t *testing.T) {
	runNormTests(t, "String", func(f Form, out []byte, s string) []byte {
		outs := string(out) + s
		return []byte(f.String(outs))
	})
}

func TestLinking(t *testing.T) {
	const prog = `
	package main
	import "fmt"
	import "github.com/go-xe2/third/golang.org/x/text/unicode/norm"
	func main() { fmt.Println(norm.%s) }
	`
	baseline, errB := testtext.CodeSize(fmt.Sprintf(prog, "MaxSegmentSize"))
	withTables, errT := testtext.CodeSize(fmt.Sprintf(prog, `NFC.String("")`))
	if errB != nil || errT != nil {
		t.Skipf("code size failed: %v and %v", errB, errT)
	}
	// Tables are at least 50K
	if d := withTables - baseline; d < 50*1024 {
		t.Errorf("tables appear not to be dropped: %d - %d = %d",
			withTables, baseline, d)
	}
}

func appendBench(f Form, in []byte) func() {
	buf := make([]byte, 0, 4*len(in))
	return func() {
		f.Append(buf, in...)
	}
}

func bytesBench(f Form, in []byte) func() {
	return func() {
		f.Bytes(in)
	}
}

func iterBench(f Form, in []byte) func() {
	iter := Iter{}
	return func() {
		iter.Init(f, in)
		for !iter.Done() {
			iter.Next()
		}
	}
}

func transformBench(f Form, in []byte) func() {
	buf := make([]byte, 4*len(in))
	return func() {
		if _, n, err := f.Transform(buf, in, true); err != nil || len(in) != n {
			log.Panic(n, len(in), err)
		}
	}
}

func readerBench(f Form, in []byte) func() {
	buf := make([]byte, 4*len(in))
	return func() {
		r := f.Reader(bytes.NewReader(in))
		var err error
		for err == nil {
			_, err = r.Read(buf)
		}
		if err != io.EOF {
			panic("")
		}
	}
}

func writerBench(f Form, in []byte) func() {
	buf := make([]byte, 0, 4*len(in))
	return func() {
		r := f.Writer(bytes.NewBuffer(buf))
		if _, err := r.Write(in); err != nil {
			panic("")
		}
	}
}

func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
	bm = append(bm, appendBench(f, in))
	bm = append(bm, iterBench(f, in))
	bm = append(bm, transformBench(f, in))
	bm = append(bm, readerBench(f, in))
	bm = append(bm, writerBench(f, in))
	return bm
}

func doFormBenchmark(b *testing.B, inf, f Form, s string) {
	b.StopTimer()
	in := inf.Bytes([]byte(s))
	bm := appendBenchmarks(nil, f, in)
	b.SetBytes(int64(len(in) * len(bm)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		for _, fn := range bm {
			fn()
		}
	}
}

func doSingle(b *testing.B, f func(Form, []byte) func(), s []byte) {
	b.StopTimer()
	fn := f(NFC, s)
	b.SetBytes(int64(len(s)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		fn()
	}
}

var (
	smallNoChange = []byte("nörmalization")
	smallChange   = []byte("No\u0308rmalization")
	ascii         = strings.Repeat("There is nothing to change here! ", 500)
)

func lowerBench(f Form, in []byte) func() {
	// Use package strings instead of bytes as it doesn't allocate memory
	// if there aren't any changes.
	s := string(in)
	return func() {
		strings.ToLower(s)
	}
}

func BenchmarkLowerCaseNoChange(b *testing.B) {
	doSingle(b, lowerBench, smallNoChange)
}
func BenchmarkLowerCaseChange(b *testing.B) {
	doSingle(b, lowerBench, smallChange)
}

func quickSpanBench(f Form, in []byte) func() {
	return func() {
		f.QuickSpan(in)
	}
}

func BenchmarkQuickSpanChangeNFC(b *testing.B) {
	doSingle(b, quickSpanBench, smallNoChange)
}

func BenchmarkBytesNoChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallNoChange)
}
func BenchmarkBytesChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallChange)
}

func BenchmarkAppendNoChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallNoChange)
}
func BenchmarkAppendChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallChange)
}
func BenchmarkAppendLargeNFC(b *testing.B) {
	doSingle(b, appendBench, txt_all_bytes)
}

func BenchmarkIterNoChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallNoChange)
}
func BenchmarkIterChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallChange)
}
func BenchmarkIterLargeNFC(b *testing.B) {
	doSingle(b, iterBench, txt_all_bytes)
}

func BenchmarkTransformNoChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallNoChange)
}
func BenchmarkTransformChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallChange)
}
func BenchmarkTransformLargeNFC(b *testing.B) {
	doSingle(b, transformBench, txt_all_bytes)
}

func BenchmarkNormalizeAsciiNFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, ascii)
}
func BenchmarkNormalizeAsciiNFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, ascii)
}
func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
	doFormBenchmark(b, NFC, NFKC, ascii)
}
func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
	doFormBenchmark(b, NFC, NFKD, ascii)
}

func BenchmarkNormalizeNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_all)
}
func BenchmarkNormalizeNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_all)
}
func BenchmarkNormalizeNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_all)
}
func BenchmarkNormalizeNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_all)
}

// Hangul is often special-cased, so we test it separately.
func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_kr)
}
func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_kr)
}
func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_kr)
}
func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_kr)
}

var forms = []Form{NFC, NFD, NFKC, NFKD}

func doTextBenchmark(b *testing.B, s string) {
	b.StopTimer()
	in := []byte(s)
	bm := []func(){}
	for _, f := range forms {
		bm = appendBenchmarks(bm, f, in)
	}
	b.SetBytes(int64(len(s) * len(bm)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		for _, f := range bm {
			f()
		}
	}
}

func BenchmarkCanonicalOrdering(b *testing.B) {
	doTextBenchmark(b, txt_canon)
}
func BenchmarkExtendedLatin(b *testing.B) {
	doTextBenchmark(b, txt_vn)
}
func BenchmarkMiscTwoByteUtf8(b *testing.B) {
	doTextBenchmark(b, twoByteUtf8)
}
func BenchmarkMiscThreeByteUtf8(b *testing.B) {
	doTextBenchmark(b, threeByteUtf8)
}
func BenchmarkHangul(b *testing.B) {
	doTextBenchmark(b, txt_kr)
}
func BenchmarkJapanese(b *testing.B) {
	doTextBenchmark(b, txt_jp)
}
func BenchmarkChinese(b *testing.B) {
	doTextBenchmark(b, txt_cn)
}
func BenchmarkOverflow(b *testing.B) {
	doTextBenchmark(b, overflow)
}

var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"

// Tests sampled from the Canonical ordering tests (Part 2) of
// http://unicode.org/Public/UNIDATA/NormalizationTest.txt
const txt_canon = `\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062
\u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062
\u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062
\u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062
\u0061\u059A\u0316\u302A\u0339 \u0061\u0341\u0315\u0300\u05AE\u0062
\u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062
\u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062
\u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062
\u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062
\u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062
\u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062
\u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062
\u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062
\u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062
\u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062
\u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062
\u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062
\u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả.
Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ
nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc
một giấy phép khác có các điều khoản tương tự như giấy phép này
cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
chúng theo quy định của pháp luật thì tình trạng của nó không
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`

// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
const txt_ru = `При обязательном соблюдении следующих условий:
Attribution — Вы должны атрибутировать произведение (указывать
автора и источник) в порядке, предусмотренном автором или
лицензиаром (но только так, чтобы никоим образом не подразумевалось,
что они поддерживают вас или использование вами данного произведения).
Υπό τις ακόλουθες προϋποθέσεις:`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
(χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
παρόμοια άδεια.`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
لهذا الترخيص.`

// Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`

const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il

// Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
(Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다).
동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
라이선스와 동일한 라이선스를 적용해야 합니다.`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`

const threeByteUtf8 = txt_th

// Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
const txt_jp = `あなたの従うべき条件は以下の通りです。
表示 — あなたは原著作者のクレジットを表示しなければなりません。
継承 — もしあなたがこの作品を改変、変形または加工した場合、
あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
頒布することができます。`

// http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件:
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`

const txt_cjk = txt_cn + txt_jp + txt_kr
const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk

var txt_all_bytes = []byte(txt_all)
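
// Note (illustrative addition, not part of the upstream test file): the
// -testn flag declared near the top of this file restricts runAppendTests to
// a single table entry. Assuming the usual pass-through of unknown flags from
// "go test" to the test binary via -args, a single Append case could be run
// with, for example:
//
//	go test -run TestAppend -args -testn=3
//
// where 3 is the index of the AppendTest entry to exercise; -testn=-1 (the
// default) runs the full table.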