golang.org/x/text@v0.14.0/cases/map_test.go (about) 1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package cases 6 7 import ( 8 "bytes" 9 "fmt" 10 "path" 11 "strings" 12 "testing" 13 "unicode/utf8" 14 15 "golang.org/x/text/internal/testtext" 16 "golang.org/x/text/language" 17 "golang.org/x/text/transform" 18 "golang.org/x/text/unicode/norm" 19 ) 20 21 type testCase struct { 22 lang string 23 src interface{} // string, []string, or nil to skip test 24 title interface{} // string, []string, or nil to skip test 25 lower interface{} // string, []string, or nil to skip test 26 upper interface{} // string, []string, or nil to skip test 27 opts options 28 } 29 30 var testCases = []testCase{ 31 0: { 32 lang: "und", 33 src: "abc aBc ABC abC İsıI ΕΣΆΣ", 34 title: "Abc Abc Abc Abc İsıi Εσάσ", 35 lower: "abc abc abc abc i\u0307sıi εσάσ", 36 upper: "ABC ABC ABC ABC İSII ΕΣΆΣ", 37 opts: getOpts(HandleFinalSigma(false)), 38 }, 39 40 1: { 41 lang: "und", 42 src: "abc aBc ABC abC İsıI ΕΣΆΣ Σ _Σ -Σ", 43 title: "Abc Abc Abc Abc İsıi Εσάς Σ _Σ -Σ", 44 lower: "abc abc abc abc i\u0307sıi εσάς σ _σ -σ", 45 upper: "ABC ABC ABC ABC İSII ΕΣΆΣ Σ _Σ -Σ", 46 opts: getOpts(HandleFinalSigma(true)), 47 }, 48 49 2: { // Title cased runes. 50 lang: supported, 51 src: "DžA", 52 title: "Dža", 53 lower: "dža", 54 upper: "DŽA", 55 }, 56 57 3: { 58 // Title breaking. 59 lang: supported, 60 src: []string{ 61 "FOO CASE TEST", 62 "DON'T DO THiS", 63 "χωΡΊΣ χωΡΊΣ^a χωΡΊΣ:a χωΡΊΣ:^a χωΡΊΣ^ όμΩΣ Σ", 64 "with-hyphens", 65 "49ers 49ers", 66 `"capitalize a^a -hyphen 0X _u a_u:a`, 67 "MidNumLet a.b\u2018c\u2019d\u2024e\ufe52f\uff07f\uff0eg", 68 "MidNum a,b;c\u037ed\u0589e\u060cf\u2044g\ufe50h", 69 "\u0345 x\u3031x x\u05d0x \u05d0x a'.a a.a a4,a", 70 }, 71 title: []string{ 72 "Foo Case Test", 73 "Don't Do This", 74 "Χωρίς Χωρίσ^A Χωρίσ:a Χωρίσ:^A Χωρίς^ Όμως Σ", 75 "With-Hyphens", 76 // Note that 49Ers is correct according to the spec. 77 // TODO: provide some option to the user to treat different 78 // characters as cased. 79 "49Ers 49Ers", 80 `"Capitalize A^A -Hyphen 0X _U A_u:a`, 81 "Midnumlet A.b\u2018c\u2019d\u2024e\ufe52f\uff07f\uff0eg", 82 "Midnum A,B;C\u037eD\u0589E\u060cF\u2044G\ufe50H", 83 "\u0399 X\u3031X X\u05d0x \u05d0X A'.A A.a A4,A", 84 }, 85 }, 86 87 // TODO: These are known deviations from the options{} Unicode Word Breaking 88 // Algorithm. 89 // { 90 // "und", 91 // "x_\u3031_x a4,4a", 92 // "X_\u3031_x A4,4a", // Currently is "X_\U3031_X A4,4A". 93 // "x_\u3031_x a4,4a", 94 // "X_\u3031_X A4,4A", 95 // options{}, 96 // }, 97 98 4: { 99 // Tests title options 100 lang: "und", 101 src: "abc aBc ABC abC İsıI o'Brien", 102 title: "Abc ABc ABC AbC İsıI O'Brien", 103 opts: getOpts(NoLower), 104 }, 105 106 5: { 107 lang: "el", 108 src: "aBc ΟΔΌΣ Οδός Σο ΣΟ Σ oΣ ΟΣ σ ἕξ \u03ac", 109 title: "Abc Οδός Οδός Σο Σο Σ Oς Ος Σ Ἕξ \u0386", 110 lower: "abc οδός οδός σο σο σ oς ος σ ἕξ \u03ac", 111 upper: "ABC ΟΔΟΣ ΟΔΟΣ ΣΟ ΣΟ Σ OΣ ΟΣ Σ ΕΞ \u0391", // Uppercase removes accents 112 }, 113 114 6: { 115 lang: "tr az", 116 src: "Isiİ İsıI I\u0307sIiİ İsıI\u0307 I\u0300\u0307", 117 title: "Isii İsıı I\u0307sıii İsıi I\u0300\u0307", 118 lower: "ısii isıı isıii isıi \u0131\u0300\u0307", 119 upper: "ISİİ İSII I\u0307SIİİ İSII\u0307 I\u0300\u0307", 120 }, 121 122 7: { 123 lang: "lt", 124 src: "I Ï J J̈ Į Į̈ Ì Í Ĩ xi̇̈ xj̇̈ xį̇̈ xi̇̀ xi̇́ xi̇̃ XI XÏ XJ XJ̈ XĮ XĮ̈ XI̟̤", 125 title: "I Ï J J̈ Į Į̈ Ì Í Ĩ Xi̇̈ Xj̇̈ Xį̇̈ Xi̇̀ Xi̇́ Xi̇̃ Xi Xi̇̈ Xj Xj̇̈ Xį Xį̇̈ Xi̟̤", 126 lower: "i i̇̈ j j̇̈ į į̇̈ i̇̀ i̇́ i̇̃ xi̇̈ xj̇̈ xį̇̈ xi̇̀ xi̇́ xi̇̃ xi xi̇̈ xj xj̇̈ xį xį̇̈ xi̟̤", 127 upper: "I Ï J J̈ Į Į̈ Ì Í Ĩ XÏ XJ̈ XĮ̈ XÌ XÍ XĨ XI XÏ XJ XJ̈ XĮ XĮ̈ XI̟̤", 128 }, 129 130 8: { 131 lang: "lt", 132 src: "\u012e\u0300 \u00cc i\u0307\u0300 i\u0307\u0301 i\u0307\u0303 i\u0307\u0308 i\u0300\u0307", 133 title: "\u012e\u0300 \u00cc \u00cc \u00cd \u0128 \u00cf I\u0300\u0307", 134 lower: "\u012f\u0307\u0300 i\u0307\u0300 i\u0307\u0300 i\u0307\u0301 i\u0307\u0303 i\u0307\u0308 i\u0300\u0307", 135 upper: "\u012e\u0300 \u00cc \u00cc \u00cd \u0128 \u00cf I\u0300\u0307", 136 }, 137 138 9: { 139 lang: "nl", 140 src: "ijs IJs Ij Ijs İJ İJs aa aA 'ns 'S", 141 title: "IJs IJs IJ IJs İj İjs Aa Aa 'ns 's", 142 }, 143 144 // Note: this specification is not currently part of CLDR. The same holds 145 // for the leading apostrophe handling for Dutch. 146 // See https://unicode.org/cldr/trac/ticket/7078. 147 10: { 148 lang: "af", 149 src: "wag 'n bietjie", 150 title: "Wag 'n Bietjie", 151 lower: "wag 'n bietjie", 152 upper: "WAG 'N BIETJIE", 153 }, 154 } 155 156 func TestCaseMappings(t *testing.T) { 157 for i, tt := range testCases { 158 src, ok := tt.src.([]string) 159 if !ok { 160 src = strings.Split(tt.src.(string), " ") 161 } 162 163 for _, lang := range strings.Split(tt.lang, " ") { 164 tag := language.MustParse(lang) 165 testEntry := func(name string, mk func(language.Tag, options) transform.SpanningTransformer, gold interface{}) { 166 c := Caser{mk(tag, tt.opts)} 167 if gold != nil { 168 wants, ok := gold.([]string) 169 if !ok { 170 wants = strings.Split(gold.(string), " ") 171 } 172 for j, want := range wants { 173 if got := c.String(src[j]); got != want { 174 t.Errorf("%d:%s:\n%s.String(%+q):\ngot %+q;\nwant %+q", i, lang, name, src[j], got, want) 175 } 176 } 177 } 178 dst := make([]byte, 256) // big enough to hold any result 179 src := []byte(strings.Join(src, " ")) 180 v := testtext.AllocsPerRun(20, func() { 181 c.Transform(dst, src, true) 182 }) 183 if v > 1.1 { 184 t.Errorf("%d:%s:\n%s: number of allocs was %f; want 0", i, lang, name, v) 185 } 186 } 187 testEntry("Upper", makeUpper, tt.upper) 188 testEntry("Lower", makeLower, tt.lower) 189 testEntry("Title", makeTitle, tt.title) 190 } 191 } 192 } 193 194 // TestAlloc tests that some mapping methods should not cause any allocation. 195 func TestAlloc(t *testing.T) { 196 dst := make([]byte, 256) // big enough to hold any result 197 src := []byte(txtNonASCII) 198 199 for i, f := range []func() Caser{ 200 func() Caser { return Upper(language.Und) }, 201 func() Caser { return Lower(language.Und) }, 202 func() Caser { return Lower(language.Und, HandleFinalSigma(false)) }, 203 // TODO: use a shared copy for these casers as well, in order of 204 // importance, starting with the most important: 205 // func() Caser { return Title(language.Und) }, 206 // func() Caser { return Title(language.Und, HandleFinalSigma(false)) }, 207 } { 208 testtext.Run(t, "", func(t *testing.T) { 209 var c Caser 210 v := testtext.AllocsPerRun(10, func() { 211 c = f() 212 }) 213 if v > 0 { 214 // TODO: Right now only Upper has 1 allocation. Special-case Lower 215 // and Title as well to have less allocations for the root locale. 216 t.Errorf("%d:init: number of allocs was %f; want 0", i, v) 217 } 218 v = testtext.AllocsPerRun(2, func() { 219 c.Transform(dst, src, true) 220 }) 221 if v > 0 { 222 t.Errorf("%d:transform: number of allocs was %f; want 0", i, v) 223 } 224 }) 225 } 226 } 227 228 func testHandover(t *testing.T, c Caser, src string) { 229 want := c.String(src) 230 // Find the common prefix. 231 pSrc := 0 232 for ; pSrc < len(src) && pSrc < len(want) && want[pSrc] == src[pSrc]; pSrc++ { 233 } 234 235 // Test handover for each substring of the prefix. 236 for i := 0; i < pSrc; i++ { 237 testtext.Run(t, fmt.Sprint("interleave/", i), func(t *testing.T) { 238 dst := make([]byte, 4*len(src)) 239 c.Reset() 240 nSpan, _ := c.Span([]byte(src[:i]), false) 241 copy(dst, src[:nSpan]) 242 nTransform, _, _ := c.Transform(dst[nSpan:], []byte(src[nSpan:]), true) 243 got := string(dst[:nSpan+nTransform]) 244 if got != want { 245 t.Errorf("full string: got %q; want %q", got, want) 246 } 247 }) 248 } 249 } 250 251 func TestHandover(t *testing.T) { 252 testCases := []struct { 253 desc string 254 t Caser 255 first, second string 256 }{{ 257 "title/nosigma/single midword", 258 Title(language.Und, HandleFinalSigma(false)), 259 "A.", "a", 260 }, { 261 "title/nosigma/single midword", 262 Title(language.Und, HandleFinalSigma(false)), 263 "A", ".a", 264 }, { 265 "title/nosigma/double midword", 266 Title(language.Und, HandleFinalSigma(false)), 267 "A..", "a", 268 }, { 269 "title/nosigma/double midword", 270 Title(language.Und, HandleFinalSigma(false)), 271 "A.", ".a", 272 }, { 273 "title/nosigma/double midword", 274 Title(language.Und, HandleFinalSigma(false)), 275 "A", "..a", 276 }, { 277 "title/sigma/single midword", 278 Title(language.Und), 279 "ΟΣ.", "a", 280 }, { 281 "title/sigma/single midword", 282 Title(language.Und), 283 "ΟΣ", ".a", 284 }, { 285 "title/sigma/double midword", 286 Title(language.Und), 287 "ΟΣ..", "a", 288 }, { 289 "title/sigma/double midword", 290 Title(language.Und), 291 "ΟΣ.", ".a", 292 }, { 293 "title/sigma/double midword", 294 Title(language.Und), 295 "ΟΣ", "..a", 296 }, { 297 "title/af/leading apostrophe", 298 Title(language.Afrikaans), 299 "'", "n bietje", 300 }} 301 for _, tc := range testCases { 302 testtext.Run(t, tc.desc, func(t *testing.T) { 303 src := tc.first + tc.second 304 want := tc.t.String(src) 305 tc.t.Reset() 306 n, _ := tc.t.Span([]byte(tc.first), false) 307 308 dst := make([]byte, len(want)) 309 copy(dst, tc.first[:n]) 310 311 nDst, _, _ := tc.t.Transform(dst[n:], []byte(src[n:]), true) 312 got := string(dst[:n+nDst]) 313 if got != want { 314 t.Errorf("got %q; want %q", got, want) 315 } 316 }) 317 } 318 } 319 320 // minBufSize is the size of the buffer by which the casing operation in 321 // this package are guaranteed to make progress. 322 const minBufSize = norm.MaxSegmentSize 323 324 type bufferTest struct { 325 desc, src, want string 326 firstErr error 327 dstSize, srcSize int 328 t transform.SpanningTransformer 329 } 330 331 var bufferTests []bufferTest 332 333 func init() { 334 bufferTests = []bufferTest{{ 335 desc: "und/upper/short dst", 336 src: "abcdefg", 337 want: "ABCDEFG", 338 firstErr: transform.ErrShortDst, 339 dstSize: 3, 340 srcSize: minBufSize, 341 t: Upper(language.Und), 342 }, { 343 desc: "und/upper/short src", 344 src: "123é56", 345 want: "123É56", 346 firstErr: transform.ErrShortSrc, 347 dstSize: 4, 348 srcSize: 4, 349 t: Upper(language.Und), 350 }, { 351 desc: "und/upper/no error on short", 352 src: "12", 353 want: "12", 354 firstErr: nil, 355 dstSize: 1, 356 srcSize: 1, 357 t: Upper(language.Und), 358 }, { 359 desc: "und/lower/short dst", 360 src: "ABCDEFG", 361 want: "abcdefg", 362 firstErr: transform.ErrShortDst, 363 dstSize: 3, 364 srcSize: minBufSize, 365 t: Lower(language.Und), 366 }, { 367 desc: "und/lower/short src", 368 src: "123É56", 369 want: "123é56", 370 firstErr: transform.ErrShortSrc, 371 dstSize: 4, 372 srcSize: 4, 373 t: Lower(language.Und), 374 }, { 375 desc: "und/lower/no error on short", 376 src: "12", 377 want: "12", 378 firstErr: nil, 379 dstSize: 1, 380 srcSize: 1, 381 t: Lower(language.Und), 382 }, { 383 desc: "und/lower/simple (no final sigma)", 384 src: "ΟΣ ΟΣΣ", 385 want: "οσ οσσ", 386 dstSize: minBufSize, 387 srcSize: minBufSize, 388 t: Lower(language.Und, HandleFinalSigma(false)), 389 }, { 390 desc: "und/title/simple (no final sigma)", 391 src: "ΟΣ ΟΣΣ", 392 want: "Οσ Οσσ", 393 dstSize: minBufSize, 394 srcSize: minBufSize, 395 t: Title(language.Und, HandleFinalSigma(false)), 396 }, { 397 desc: "und/title/final sigma: no error", 398 src: "ΟΣ", 399 want: "Ος", 400 dstSize: minBufSize, 401 srcSize: minBufSize, 402 t: Title(language.Und), 403 }, { 404 desc: "und/title/final sigma: short source", 405 src: "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ", 406 want: "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς", 407 firstErr: transform.ErrShortSrc, 408 dstSize: minBufSize, 409 srcSize: 10, 410 t: Title(language.Und), 411 }, { 412 desc: "und/title/final sigma: short destination 1", 413 src: "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ", 414 want: "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς", 415 firstErr: transform.ErrShortDst, 416 dstSize: 10, 417 srcSize: minBufSize, 418 t: Title(language.Und), 419 }, { 420 desc: "und/title/final sigma: short destination 2", 421 src: "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ", 422 want: "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς", 423 firstErr: transform.ErrShortDst, 424 dstSize: 9, 425 srcSize: minBufSize, 426 t: Title(language.Und), 427 }, { 428 desc: "und/title/final sigma: short destination 3", 429 src: "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ", 430 want: "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς", 431 firstErr: transform.ErrShortDst, 432 dstSize: 8, 433 srcSize: minBufSize, 434 t: Title(language.Und), 435 }, { 436 desc: "und/title/clipped UTF-8 rune", 437 src: "σσσσσσσσσσσ", 438 want: "Σσσσσσσσσσσ", 439 firstErr: transform.ErrShortSrc, 440 dstSize: minBufSize, 441 srcSize: 5, 442 t: Title(language.Und), 443 }, { 444 desc: "und/title/clipped UTF-8 rune atEOF", 445 src: "σσσ" + string([]byte{0xCF}), 446 want: "Σσσ" + string([]byte{0xCF}), 447 dstSize: minBufSize, 448 srcSize: minBufSize, 449 t: Title(language.Und), 450 }, { 451 // Note: the choice to change the final sigma at the end in case of 452 // too many case ignorables is arbitrary. The main reason for this 453 // choice is that it results in simpler code. 454 desc: "und/title/final sigma: max ignorables", 455 src: "ΟΣ" + strings.Repeat(".", maxIgnorable) + "a", 456 want: "Οσ" + strings.Repeat(".", maxIgnorable) + "A", 457 dstSize: minBufSize, 458 srcSize: minBufSize, 459 t: Title(language.Und), 460 }, { 461 // Note: the choice to change the final sigma at the end in case of 462 // too many case ignorables is arbitrary. The main reason for this 463 // choice is that it results in simpler code. 464 desc: "und/title/long string", 465 src: "AA" + strings.Repeat(".", maxIgnorable+1) + "a", 466 want: "Aa" + strings.Repeat(".", maxIgnorable+1) + "A", 467 dstSize: minBufSize, 468 srcSize: len("AA" + strings.Repeat(".", maxIgnorable+1)), 469 t: Title(language.Und), 470 }, { 471 // Note: the choice to change the final sigma at the end in case of 472 // too many case ignorables is arbitrary. The main reason for this 473 // choice is that it results in simpler code. 474 desc: "und/title/final sigma: too many ignorables", 475 src: "ΟΣ" + strings.Repeat(".", maxIgnorable+1) + "a", 476 want: "Ος" + strings.Repeat(".", maxIgnorable+1) + "A", 477 dstSize: minBufSize, 478 srcSize: len("ΟΣ" + strings.Repeat(".", maxIgnorable+1)), 479 t: Title(language.Und), 480 }, { 481 desc: "und/title/final sigma: apostrophe", 482 src: "ΟΣ''a", 483 want: "Οσ''A", 484 dstSize: minBufSize, 485 srcSize: minBufSize, 486 t: Title(language.Und), 487 }, { 488 desc: "el/upper/max ignorables", 489 src: "ο" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0313", 490 want: "Ο" + strings.Repeat("\u0321", maxIgnorable-1), 491 dstSize: minBufSize, 492 srcSize: minBufSize, 493 t: Upper(language.Greek), 494 }, { 495 desc: "el/upper/too many ignorables", 496 src: "ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313", 497 want: "Ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313", 498 dstSize: minBufSize, 499 srcSize: len("ο" + strings.Repeat("\u0321", maxIgnorable)), 500 t: Upper(language.Greek), 501 }, { 502 desc: "el/upper/short dst", 503 src: "123ο", 504 want: "123Ο", 505 firstErr: transform.ErrShortDst, 506 dstSize: 3, 507 srcSize: minBufSize, 508 t: Upper(language.Greek), 509 }, { 510 desc: "lt/lower/max ignorables", 511 src: "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300", 512 want: "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300", 513 dstSize: minBufSize, 514 srcSize: minBufSize, 515 t: Lower(language.Lithuanian), 516 }, { 517 desc: "lt/lower/too many ignorables", 518 src: "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0300", 519 want: "i" + strings.Repeat("\u0321", maxIgnorable) + "\u0300", 520 dstSize: minBufSize, 521 srcSize: len("I" + strings.Repeat("\u0321", maxIgnorable)), 522 t: Lower(language.Lithuanian), 523 }, { 524 desc: "lt/lower/decomposition with short dst buffer 1", 525 src: "aaaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE 526 firstErr: transform.ErrShortDst, 527 want: "aaaaai\u0307\u0300", 528 dstSize: 5, 529 srcSize: minBufSize, 530 t: Lower(language.Lithuanian), 531 }, { 532 desc: "lt/lower/decomposition with short dst buffer 2", 533 src: "aaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE 534 firstErr: transform.ErrShortDst, 535 want: "aaaai\u0307\u0300", 536 dstSize: 5, 537 srcSize: minBufSize, 538 t: Lower(language.Lithuanian), 539 }, { 540 desc: "lt/upper/max ignorables", 541 src: "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300", 542 want: "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300", 543 dstSize: minBufSize, 544 srcSize: minBufSize, 545 t: Upper(language.Lithuanian), 546 }, { 547 desc: "lt/upper/too many ignorables", 548 src: "i" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300", 549 want: "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300", 550 dstSize: minBufSize, 551 srcSize: len("i" + strings.Repeat("\u0321", maxIgnorable)), 552 t: Upper(language.Lithuanian), 553 }, { 554 desc: "lt/upper/short dst", 555 src: "12i\u0307\u0300", 556 want: "12\u00cc", 557 firstErr: transform.ErrShortDst, 558 dstSize: 3, 559 srcSize: minBufSize, 560 t: Upper(language.Lithuanian), 561 }, { 562 desc: "aztr/lower/max ignorables", 563 src: "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300", 564 want: "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300", 565 dstSize: minBufSize, 566 srcSize: minBufSize, 567 t: Lower(language.Turkish), 568 }, { 569 desc: "aztr/lower/too many ignorables", 570 src: "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300", 571 want: "\u0131" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300", 572 dstSize: minBufSize, 573 srcSize: len("I" + strings.Repeat("\u0321", maxIgnorable)), 574 t: Lower(language.Turkish), 575 }, { 576 desc: "nl/title/pre-IJ cutoff", 577 src: " ij", 578 want: " IJ", 579 firstErr: transform.ErrShortDst, 580 dstSize: 2, 581 srcSize: minBufSize, 582 t: Title(language.Dutch), 583 }, { 584 desc: "nl/title/mid-IJ cutoff", 585 src: " ij", 586 want: " IJ", 587 firstErr: transform.ErrShortDst, 588 dstSize: 3, 589 srcSize: minBufSize, 590 t: Title(language.Dutch), 591 }, { 592 desc: "af/title/apostrophe", 593 src: "'n bietje", 594 want: "'n Bietje", 595 firstErr: transform.ErrShortDst, 596 dstSize: 3, 597 srcSize: minBufSize, 598 t: Title(language.Afrikaans), 599 }} 600 } 601 602 func TestShortBuffersAndOverflow(t *testing.T) { 603 for i, tt := range bufferTests { 604 testtext.Run(t, tt.desc, func(t *testing.T) { 605 buf := make([]byte, tt.dstSize) 606 got := []byte{} 607 var nSrc, nDst int 608 var err error 609 for p := 0; p < len(tt.src); p += nSrc { 610 q := p + tt.srcSize 611 if q > len(tt.src) { 612 q = len(tt.src) 613 } 614 nDst, nSrc, err = tt.t.Transform(buf, []byte(tt.src[p:q]), q == len(tt.src)) 615 got = append(got, buf[:nDst]...) 616 617 if p == 0 && err != tt.firstErr { 618 t.Errorf("%d:%s:\n error was %v; want %v", i, tt.desc, err, tt.firstErr) 619 break 620 } 621 } 622 if string(got) != tt.want { 623 t.Errorf("%d:%s:\ngot %+q;\nwant %+q", i, tt.desc, got, tt.want) 624 } 625 testHandover(t, Caser{tt.t}, tt.src) 626 }) 627 } 628 } 629 630 func TestSpan(t *testing.T) { 631 for _, tt := range []struct { 632 desc string 633 src string 634 want string 635 atEOF bool 636 err error 637 t Caser 638 }{{ 639 desc: "und/upper/basic", 640 src: "abcdefg", 641 want: "", 642 atEOF: true, 643 err: transform.ErrEndOfSpan, 644 t: Upper(language.Und), 645 }, { 646 desc: "und/upper/short src", 647 src: "123É"[:4], 648 want: "123", 649 atEOF: false, 650 err: transform.ErrShortSrc, 651 t: Upper(language.Und), 652 }, { 653 desc: "und/upper/no error on short", 654 src: "12", 655 want: "12", 656 atEOF: false, 657 t: Upper(language.Und), 658 }, { 659 desc: "und/lower/basic", 660 src: "ABCDEFG", 661 want: "", 662 atEOF: true, 663 err: transform.ErrEndOfSpan, 664 t: Lower(language.Und), 665 }, { 666 desc: "und/lower/short src num", 667 src: "123é"[:4], 668 want: "123", 669 atEOF: false, 670 err: transform.ErrShortSrc, 671 t: Lower(language.Und), 672 }, { 673 desc: "und/lower/short src greek", 674 src: "αβγé"[:7], 675 want: "αβγ", 676 atEOF: false, 677 err: transform.ErrShortSrc, 678 t: Lower(language.Und), 679 }, { 680 desc: "und/lower/no error on short", 681 src: "12", 682 want: "12", 683 atEOF: false, 684 t: Lower(language.Und), 685 }, { 686 desc: "und/lower/simple (no final sigma)", 687 src: "ος οσσ", 688 want: "οσ οσσ", 689 atEOF: true, 690 t: Lower(language.Und, HandleFinalSigma(false)), 691 }, { 692 desc: "und/title/simple (no final sigma)", 693 src: "Οσ Οσσ", 694 want: "Οσ Οσσ", 695 atEOF: true, 696 t: Title(language.Und, HandleFinalSigma(false)), 697 }, { 698 desc: "und/lower/final sigma: no error", 699 src: "οΣ", // Oς 700 want: "ο", // Oς 701 err: transform.ErrEndOfSpan, 702 t: Lower(language.Und), 703 }, { 704 desc: "und/title/final sigma: no error", 705 src: "ΟΣ", // Oς 706 want: "Ο", // Oς 707 err: transform.ErrEndOfSpan, 708 t: Title(language.Und), 709 }, { 710 desc: "und/title/final sigma: no short source!", 711 src: "ΟσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσΣ", 712 want: "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσ", 713 err: transform.ErrEndOfSpan, 714 t: Title(language.Und), 715 }, { 716 desc: "und/title/clipped UTF-8 rune", 717 src: "Σσ" + string([]byte{0xCF}), 718 want: "Σσ", 719 atEOF: false, 720 err: transform.ErrShortSrc, 721 t: Title(language.Und), 722 }, { 723 desc: "und/title/clipped UTF-8 rune atEOF", 724 src: "Σσσ" + string([]byte{0xCF}), 725 want: "Σσσ" + string([]byte{0xCF}), 726 atEOF: true, 727 t: Title(language.Und), 728 }, { 729 // Note: the choice to change the final sigma at the end in case of 730 // too many case ignorables is arbitrary. The main reason for this 731 // choice is that it results in simpler code. 732 desc: "und/title/long string", 733 src: "A" + strings.Repeat("a", maxIgnorable+5), 734 want: "A" + strings.Repeat("a", maxIgnorable+5), 735 t: Title(language.Und), 736 }, { 737 // Note: the choice to change the final sigma at the end in case of 738 // too many case ignorables is arbitrary. The main reason for this 739 // choice is that it results in simpler code. 740 desc: "und/title/cyrillic", 741 src: "При", 742 want: "При", 743 atEOF: true, 744 t: Title(language.Und, HandleFinalSigma(false)), 745 }, { 746 // Note: the choice to change the final sigma at the end in case of 747 // too many case ignorables is arbitrary. The main reason for this 748 // choice is that it results in simpler code. 749 desc: "und/title/final sigma: max ignorables", 750 src: "Οσ" + strings.Repeat(".", maxIgnorable) + "A", 751 want: "Οσ" + strings.Repeat(".", maxIgnorable) + "A", 752 t: Title(language.Und), 753 }, { 754 desc: "el/upper/max ignorables - not implemented", 755 src: "Ο" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0313", 756 want: "", 757 err: transform.ErrEndOfSpan, 758 t: Upper(language.Greek), 759 }, { 760 desc: "el/upper/too many ignorables - not implemented", 761 src: "Ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313", 762 want: "", 763 err: transform.ErrEndOfSpan, 764 t: Upper(language.Greek), 765 }, { 766 desc: "el/upper/short dst", 767 src: "123ο", 768 want: "", 769 err: transform.ErrEndOfSpan, 770 t: Upper(language.Greek), 771 }, { 772 desc: "lt/lower/max ignorables", 773 src: "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300", 774 want: "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300", 775 t: Lower(language.Lithuanian), 776 }, { 777 desc: "lt/lower/isLower", 778 src: "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0300", 779 want: "", 780 err: transform.ErrEndOfSpan, 781 t: Lower(language.Lithuanian), 782 }, { 783 desc: "lt/lower/not identical", 784 src: "aaaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE 785 err: transform.ErrEndOfSpan, 786 want: "aaaaa", 787 t: Lower(language.Lithuanian), 788 }, { 789 desc: "lt/lower/identical", 790 src: "aaaai\u0307\u0300", // U+00CC LATIN CAPITAL LETTER I GRAVE 791 want: "aaaai\u0307\u0300", 792 t: Lower(language.Lithuanian), 793 }, { 794 desc: "lt/upper/not implemented", 795 src: "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300", 796 want: "", 797 err: transform.ErrEndOfSpan, 798 t: Upper(language.Lithuanian), 799 }, { 800 desc: "lt/upper/not implemented, ascii", 801 src: "AB", 802 want: "", 803 err: transform.ErrEndOfSpan, 804 t: Upper(language.Lithuanian), 805 }, { 806 desc: "nl/title/pre-IJ cutoff", 807 src: " IJ", 808 want: " IJ", 809 t: Title(language.Dutch), 810 }, { 811 desc: "nl/title/mid-IJ cutoff", 812 src: " Ia", 813 want: " Ia", 814 t: Title(language.Dutch), 815 }, { 816 desc: "af/title/apostrophe", 817 src: "'n Bietje", 818 want: "'n Bietje", 819 t: Title(language.Afrikaans), 820 }, { 821 desc: "af/title/apostrophe-incorrect", 822 src: "'N Bietje", 823 // The Single_Quote (a MidWord), needs to be retained as unspanned so 824 // that a successive call to Transform can detect that N should not be 825 // capitalized. 826 want: "", 827 err: transform.ErrEndOfSpan, 828 t: Title(language.Afrikaans), 829 }} { 830 testtext.Run(t, tt.desc, func(t *testing.T) { 831 for p := 0; p < len(tt.want); p += utf8.RuneLen([]rune(tt.src[p:])[0]) { 832 tt.t.Reset() 833 n, err := tt.t.Span([]byte(tt.src[:p]), false) 834 if err != nil && err != transform.ErrShortSrc { 835 t.Errorf("early failure:Span(%+q): %v (%d < %d)", tt.src[:p], err, n, len(tt.want)) 836 break 837 } 838 } 839 tt.t.Reset() 840 n, err := tt.t.Span([]byte(tt.src), tt.atEOF) 841 if n != len(tt.want) || err != tt.err { 842 t.Errorf("Span(%+q, %v): got %d, %v; want %d, %v", tt.src, tt.atEOF, n, err, len(tt.want), tt.err) 843 } 844 testHandover(t, tt.t, tt.src) 845 }) 846 } 847 } 848 849 var txtASCII = strings.Repeat("The quick brown fox jumps over the lazy dog. ", 50) 850 851 // Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/ 852 const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả. Nếu bạn sử 853 dụng, chuyển đổi, hoặc xây dựng dự án từ nội dung được chia sẻ này, bạn phải áp 854 dụng giấy phép này hoặc một giấy phép khác có các điều khoản tương tự như giấy 855 phép này cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào trên đây 856 cũng có thể được miễn bỏ nếu bạn được sự cho phép của người sở hữu bản quyền. 857 Phạm vi công chúng — Khi tác phẩm hoặc bất kỳ chương nào của tác phẩm đã trong 858 vùng dành cho công chúng theo quy định của pháp luật thì tình trạng của nó không 859 bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.` 860 861 // http://creativecommons.org/licenses/by-sa/2.5/cn/ 862 const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、 863 广播或通过信息网络传播本作品 创作演绎作品 864 对本作品进行商业性使用 惟须遵守下列条件: 865 署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。 866 相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作, 867 您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。` 868 869 // Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru 870 const txt_ru = `При обязательном соблюдении следующих условий: Attribution — Вы 871 должны атрибутировать произведение (указывать автора и источник) в порядке, 872 предусмотренном автором или лицензиаром (но только так, чтобы никоим образом не 873 подразумевалось, что они поддерживают вас или использование вами данного 874 произведения). Υπό τις ακόλουθες προϋποθέσεις:` 875 876 // Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/ 877 const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με 878 τον τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια (χωρίς 879 όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή τη χρήση του έργου 880 από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε, τροποποιήσετε ή δημιουργήσετε 881 περαιτέρω βασισμένοι στο έργο θα μπορείτε να διανέμετε το έργο που θα προκύψει 882 μόνο με την ίδια ή παρόμοια άδεια.` 883 884 const txtNonASCII = txt_vn + txt_cn + txt_ru + txt_gr 885 886 // TODO: Improve ASCII performance. 887 888 func BenchmarkCasers(b *testing.B) { 889 for _, s := range []struct{ name, text string }{ 890 {"ascii", txtASCII}, 891 {"nonASCII", txtNonASCII}, 892 {"short", "При"}, 893 } { 894 src := []byte(s.text) 895 // Measure case mappings in bytes package for comparison. 896 for _, f := range []struct { 897 name string 898 fn func(b []byte) []byte 899 }{ 900 {"lower", bytes.ToLower}, 901 {"title", bytes.ToTitle}, 902 {"upper", bytes.ToUpper}, 903 } { 904 testtext.Bench(b, path.Join(s.name, "bytes", f.name), func(b *testing.B) { 905 b.SetBytes(int64(len(src))) 906 for i := 0; i < b.N; i++ { 907 f.fn(src) 908 } 909 }) 910 } 911 for _, t := range []struct { 912 name string 913 caser transform.SpanningTransformer 914 }{ 915 {"fold/default", Fold()}, 916 {"upper/default", Upper(language.Und)}, 917 {"lower/sigma", Lower(language.Und)}, 918 {"lower/simple", Lower(language.Und, HandleFinalSigma(false))}, 919 {"title/sigma", Title(language.Und)}, 920 {"title/simple", Title(language.Und, HandleFinalSigma(false))}, 921 } { 922 c := Caser{t.caser} 923 dst := make([]byte, len(src)) 924 testtext.Bench(b, path.Join(s.name, t.name, "transform"), func(b *testing.B) { 925 b.SetBytes(int64(len(src))) 926 for i := 0; i < b.N; i++ { 927 c.Reset() 928 c.Transform(dst, src, true) 929 } 930 }) 931 // No need to check span for simple cases, as they will be the same 932 // as sigma. 933 if strings.HasSuffix(t.name, "/simple") { 934 continue 935 } 936 spanSrc := c.Bytes(src) 937 testtext.Bench(b, path.Join(s.name, t.name, "span"), func(b *testing.B) { 938 c.Reset() 939 if n, _ := c.Span(spanSrc, true); n < len(spanSrc) { 940 b.Fatalf("spanner is not recognizing text %q as done (at %d)", spanSrc, n) 941 } 942 b.SetBytes(int64(len(spanSrc))) 943 for i := 0; i < b.N; i++ { 944 c.Reset() 945 c.Span(spanSrc, true) 946 } 947 }) 948 } 949 } 950 }