github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/cases/map.go (about) 1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package cases 6 7 // This file contains the definitions of case mappings for all supported 8 // languages. The rules for the language-specific tailorings were taken and 9 // modified from the CLDR transform definitions in common/transforms. 10 11 import ( 12 "strings" 13 "unicode" 14 "unicode/utf8" 15 16 "golang.org/x/text/language" 17 "golang.org/x/text/transform" 18 "golang.org/x/text/unicode/norm" 19 ) 20 21 // A mapFunc takes a context set to the current rune and writes the mapped 22 // version to the same context. It may advance the context to the next rune. It 23 // returns whether a checkpoint is possible: whether the pDst bytes written to 24 // dst so far won't need changing as we see more source bytes. 25 type mapFunc func(*context) bool 26 27 // maxIgnorable defines the maximum number of ignorables to consider for 28 // lookahead operations. 29 const maxIgnorable = 30 30 31 // supported lists the language tags for which we have tailorings. 32 const supported = "und af az el lt nl tr" 33 34 func init() { 35 tags := []language.Tag{} 36 for _, s := range strings.Split(supported, " ") { 37 tags = append(tags, language.MustParse(s)) 38 } 39 matcher = language.NewMatcher(tags) 40 Supported = language.NewCoverage(tags) 41 } 42 43 var ( 44 matcher language.Matcher 45 46 Supported language.Coverage 47 48 // We keep the following lists separate, instead of having a single per- 49 // language struct, to give the compiler a chance to remove unused code. 50 51 // Some uppercase mappers are stateless, so we can precompute the 52 // Transformers and save a bit on runtime allocations. 53 upperFunc = []mapFunc{ 54 nil, // und 55 nil, // af 56 aztrUpper(upper), // az 57 elUpper, // el 58 ltUpper(upper), // lt 59 nil, // nl 60 aztrUpper(upper), // tr 61 } 62 63 undUpper transform.Transformer = &undUpperCaser{} 64 65 lowerFunc = []mapFunc{ 66 lower, // und 67 lower, // af 68 aztrLower, // az 69 lower, // el 70 ltLower, // lt 71 lower, // nl 72 aztrLower, // tr 73 } 74 75 titleInfos = []struct { 76 title, lower mapFunc 77 rewrite func(*context) 78 }{ 79 {title, lower, nil}, // und 80 {title, lower, afnlRewrite}, // af 81 {aztrUpper(title), aztrLower, nil}, // az 82 {title, lower, nil}, // el 83 {ltUpper(title), ltLower, nil}, // lt 84 {nlTitle, lower, afnlRewrite}, // nl 85 {aztrUpper(title), aztrLower, nil}, // tr 86 } 87 ) 88 89 func makeUpper(t language.Tag, o options) transform.Transformer { 90 _, i, _ := matcher.Match(t) 91 f := upperFunc[i] 92 if f == nil { 93 return undUpper 94 } 95 return &simpleCaser{f: f} 96 } 97 98 func makeLower(t language.Tag, o options) transform.Transformer { 99 _, i, _ := matcher.Match(t) 100 f := lowerFunc[i] 101 if o.noFinalSigma { 102 return &simpleCaser{f: f} 103 } 104 return &lowerCaser{ 105 first: f, 106 midWord: finalSigma(f), 107 } 108 } 109 110 func makeTitle(t language.Tag, o options) transform.Transformer { 111 _, i, _ := matcher.Match(t) 112 x := &titleInfos[i] 113 lower := x.lower 114 if o.noLower { 115 lower = (*context).copy 116 } else if !o.noFinalSigma { 117 lower = finalSigma(lower) 118 } 119 return &titleCaser{ 120 title: x.title, 121 lower: lower, 122 rewrite: x.rewrite, 123 } 124 } 125 126 // TODO: consider a similar special case for the fast majority lower case. This 127 // is a bit more involved so will require some more precise benchmarking to 128 // justify it. 129 130 type undUpperCaser struct{ transform.NopResetter } 131 132 // undUpperCaser implements the Transformer interface for doing an upper case 133 // mapping for the root locale (und). It eliminates the need for an allocation 134 // as it prevents escaping by not using function pointers. 135 func (t *undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 136 c := context{dst: dst, src: src, atEOF: atEOF} 137 for c.next() { 138 upper(&c) 139 c.checkpoint() 140 } 141 return c.ret() 142 } 143 144 type simpleCaser struct { 145 context 146 f mapFunc 147 } 148 149 // simpleCaser implements the Transformer interface for doing a case operation 150 // on a rune-by-rune basis. 151 func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 152 t.context = context{dst: dst, src: src, atEOF: atEOF} 153 c := &t.context 154 for c.next() && t.f(c) { 155 c.checkpoint() 156 } 157 return c.ret() 158 } 159 160 // lowerCaser implements the Transformer interface. The default Unicode lower 161 // casing requires different treatment for the first and subsequent characters 162 // of a word, most notably to handle the Greek final Sigma. 163 type lowerCaser struct { 164 context 165 166 first, midWord mapFunc 167 } 168 169 func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 170 t.context = context{dst: dst, src: src, atEOF: atEOF} 171 c := &t.context 172 173 for isInterWord := true; c.next(); { 174 if isInterWord { 175 if c.info.isCased() { 176 if !t.first(c) { 177 break 178 } 179 isInterWord = false 180 } else if !c.copy() { 181 break 182 } 183 } else { 184 if c.info.isNotCasedAndNotCaseIgnorable() { 185 if !c.copy() { 186 break 187 } 188 isInterWord = true 189 } else if !t.midWord(c) { 190 break 191 } 192 } 193 c.checkpoint() 194 } 195 return c.ret() 196 } 197 198 // titleCaser implements the Transformer interface. Title casing algorithms 199 // distinguish between the first letter of a word and subsequent letters of the 200 // same word. It uses state to avoid requiring a potentially infinite lookahead. 201 type titleCaser struct { 202 context 203 204 // rune mappings used by the actual casing algorithms. 205 title, lower mapFunc 206 207 rewrite func(*context) 208 } 209 210 // Transform implements the standard Unicode title case algorithm as defined in 211 // Chapter 3 of The Unicode Standard: 212 // toTitlecase(X): Find the word boundaries in X according to Unicode Standard 213 // Annex #29, "Unicode Text Segmentation." For each word boundary, find the 214 // first cased character F following the word boundary. If F exists, map F to 215 // Titlecase_Mapping(F); then map all characters C between F and the following 216 // word boundary to Lowercase_Mapping(C). 217 func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 218 t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord} 219 c := &t.context 220 221 if !c.next() { 222 return c.ret() 223 } 224 225 for { 226 p := c.info 227 if t.rewrite != nil { 228 t.rewrite(c) 229 } 230 231 wasMid := p.isCaseIgnorableAndNonBreakStarter() 232 // Break out of this loop on failure to ensure we do not modify the 233 // state incorrectly. 234 if p.isCased() && !p.isCaseIgnorableAndNotCased() { 235 if !c.isMidWord { 236 if !t.title(c) { 237 break 238 } 239 c.isMidWord = true 240 } else if !t.lower(c) { 241 break 242 } 243 } else if !c.copy() { 244 break 245 } 246 247 // TODO: make this an "else if" if we can prove that no rune that does 248 // not match the first condition of the if statement can be a break. 249 if p.isBreak() { 250 c.isMidWord = false 251 } 252 253 // As we save the state of the transformer, it is safe to call 254 // checkpoint after any successful write. 255 c.checkpoint() 256 257 if !c.next() { 258 break 259 } 260 if wasMid && c.info.isCaseIgnorableAndNonBreakStarter() { 261 c.isMidWord = false 262 } 263 } 264 return c.ret() 265 } 266 267 // lower writes the lowercase version of the current rune to dst. 268 func lower(c *context) bool { 269 if c.info&hasMappingMask == 0 || c.caseType() == cLower { 270 return c.copy() 271 } 272 if c.info&exceptionBit == 0 { 273 return c.copyXOR() 274 } 275 e := exceptions[c.info>>exceptionShift+1:] 276 if nLower := (e[0] >> lengthBits) & lengthMask; nLower != noChange { 277 return c.writeString(e[1 : 1+nLower]) 278 } 279 return c.copy() 280 } 281 282 // upper writes the uppercase version of the current rune to dst. 283 func upper(c *context) bool { 284 ct := c.caseType() 285 if c.info&hasMappingMask == 0 || ct == cUpper { 286 return c.copy() 287 } 288 if c.info&exceptionBit == 0 { 289 return c.copyXOR() 290 } 291 e := exceptions[c.info>>exceptionShift+1:] 292 // Get length of first special case mapping. 293 n := (e[0] >> lengthBits) & lengthMask 294 if ct == cTitle { 295 // The first special case mapping is for lower. Set n to the second. 296 if n == noChange { 297 n = 0 298 } 299 n, e = e[0]&lengthMask, e[n:] 300 } 301 if n != noChange { 302 return c.writeString(e[1 : 1+n]) 303 } 304 return c.copy() 305 } 306 307 // title writes the title case version of the current rune to dst. 308 func title(c *context) bool { 309 ct := c.caseType() 310 if c.info&hasMappingMask == 0 || ct == cTitle { 311 return c.copy() 312 } 313 if c.info&exceptionBit == 0 { 314 if ct == cLower { 315 return c.copyXOR() 316 } 317 return c.copy() 318 } 319 // Get the exception data. 320 e := exceptions[c.info>>exceptionShift+1:] 321 322 nFirst := (e[0] >> lengthBits) & lengthMask 323 if nTitle := e[0] & lengthMask; nTitle != noChange { 324 if nFirst != noChange { 325 e = e[nFirst:] 326 } 327 return c.writeString(e[1 : 1+nTitle]) 328 } 329 if ct == cLower && nFirst != noChange { 330 // Use the uppercase version instead. 331 return c.writeString(e[1 : 1+nFirst]) 332 } 333 // Already in correct case. 334 return c.copy() 335 } 336 337 // finalSigma adds Greek final Sigma handing to another casing function. It 338 // determines whether a lowercased sigma should be σ or ς, by looking ahead for 339 // case-ignorables and a cased letters. 340 func finalSigma(f mapFunc) mapFunc { 341 return func(c *context) bool { 342 // ::NFD(); 343 // # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA 344 // Σ } [:case-ignorable:]* [:cased:] → σ; 345 // [:cased:] [:case-ignorable:]* { Σ → ς; 346 // ::Any-Lower; 347 // ::NFC(); 348 349 if !c.hasPrefix("Σ") { 350 return f(c) 351 } 352 353 p := c.pDst 354 c.writeString("ς") 355 // We need to do one more iteration after maxIgnorable, as a cased 356 // letter is not an ignorable and may modify the result. 357 for i := 0; i < maxIgnorable+1; i++ { 358 if !c.next() { 359 return false 360 } 361 if !c.info.isCaseIgnorable() { 362 if c.info.isCased() { 363 // p+1 is guaranteed to be in bounds: if writing ς was 364 // successful, p+1 will contain the second byte of ς. If not, 365 // this function will have returned after c.next returned false. 366 c.dst[p+1]++ // ς → σ 367 } 368 c.unreadRune() 369 return true 370 } 371 // A case ignorable may also introduce a word break, so we may need 372 // to continue searching even after detecting a break. 373 c.isMidWord = c.isMidWord && !c.info.isBreak() 374 c.copy() 375 } 376 return true 377 } 378 } 379 380 // elUpper implements Greek upper casing, which entails removing a predefined 381 // set of non-blocked modifiers. Note that these accents should not be removed 382 // for title casing! 383 // Example: "Οδός" -> "ΟΔΟΣ". 384 func elUpper(c *context) bool { 385 // From CLDR: 386 // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ; 387 // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ; 388 389 r, _ := utf8.DecodeRune(c.src[c.pSrc:]) 390 oldPDst := c.pDst 391 if !upper(c) { 392 return false 393 } 394 if !unicode.Is(unicode.Greek, r) { 395 return true 396 } 397 i := 0 398 // Take the properties of the uppercased rune that is already written to the 399 // destination. This saves us the trouble of having to uppercase the 400 // decomposed rune again. 401 if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil { 402 // Restore the destination position and process the decomposed rune. 403 r, sz := utf8.DecodeRune(b) 404 if r <= 0xFF { // See A.6.1 405 return true 406 } 407 c.pDst = oldPDst 408 // Insert the first rune and ignore the modifiers. See A.6.2. 409 c.writeBytes(b[:sz]) 410 i = len(b[sz:]) / 2 // Greek modifiers are always of length 2. 411 } 412 413 for ; i < maxIgnorable && c.next(); i++ { 414 switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r { 415 // Above and Iota Subscript 416 case 0x0300, // U+0300 COMBINING GRAVE ACCENT 417 0x0301, // U+0301 COMBINING ACUTE ACCENT 418 0x0304, // U+0304 COMBINING MACRON 419 0x0306, // U+0306 COMBINING BREVE 420 0x0308, // U+0308 COMBINING DIAERESIS 421 0x0313, // U+0313 COMBINING COMMA ABOVE 422 0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE 423 0x0342, // U+0342 COMBINING GREEK PERISPOMENI 424 0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI 425 // No-op. Gobble the modifier. 426 427 default: 428 switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() { 429 case cccZero: 430 c.unreadRune() 431 return true 432 433 // We don't need to test for IotaSubscript as the only rune that 434 // qualifies (U+0345) was already excluded in the switch statement 435 // above. See A.4. 436 437 case cccAbove: 438 return c.copy() 439 default: 440 // Some other modifier. We're still allowed to gobble Greek 441 // modifiers after this. 442 c.copy() 443 } 444 } 445 } 446 return i == maxIgnorable 447 } 448 449 func ltLower(c *context) bool { 450 // From CLDR: 451 // # Introduce an explicit dot above when lowercasing capital I's and J's 452 // # whenever there are more accents above. 453 // # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) 454 // # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I 455 // # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J 456 // # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK 457 // # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE 458 // # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE 459 // # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE 460 // ::NFD(); 461 // I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307; 462 // J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307; 463 // Į } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → į \u0307; 464 // Ì → i \u0307 \u0300; 465 // Í → i \u0307 \u0301; 466 // Ĩ → i \u0307 \u0303; 467 // ::Any-Lower(); 468 // ::NFC(); 469 470 i := 0 471 if r := c.src[c.pSrc]; r < utf8.RuneSelf { 472 lower(c) 473 if r != 'I' && r != 'J' { 474 return true 475 } 476 } else { 477 p := norm.NFD.Properties(c.src[c.pSrc:]) 478 if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') { 479 // UTF-8 optimization: the decomposition will only have an above 480 // modifier if the last rune of the decomposition is in [U+300-U+311]. 481 // In all other cases, a decomposition starting with I is always 482 // an I followed by modifiers that are not cased themselves. See A.2. 483 if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4. 484 if !c.writeBytes(d[:1]) { 485 return false 486 } 487 c.dst[c.pDst-1] += 'a' - 'A' // lower 488 489 // Assumption: modifier never changes on lowercase. See A.1. 490 // Assumption: all modifiers added have CCC = Above. See A.2.3. 491 return c.writeString("\u0307") && c.writeBytes(d[1:]) 492 } 493 // In all other cases the additional modifiers will have a CCC 494 // that is less than 230 (Above). We will insert the U+0307, if 495 // needed, after these modifiers so that a string in FCD form 496 // will remain so. See A.2.2. 497 lower(c) 498 i = 1 499 } else { 500 return lower(c) 501 } 502 } 503 504 for ; i < maxIgnorable && c.next(); i++ { 505 switch c.info.cccType() { 506 case cccZero: 507 c.unreadRune() 508 return true 509 case cccAbove: 510 return c.writeString("\u0307") && c.copy() // See A.1. 511 default: 512 c.copy() // See A.1. 513 } 514 } 515 return i == maxIgnorable 516 } 517 518 func ltUpper(f mapFunc) mapFunc { 519 return func(c *context) bool { 520 // From CLDR: 521 // ::NFD(); 522 // [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ; 523 // ::Any-Upper(); 524 // ::NFC(); 525 526 // TODO: See A.5. A soft-dotted rune never has an exception. This would 527 // allow us to overload the exception bit and encode this property in 528 // info. Need to measure performance impact of this. 529 r, _ := utf8.DecodeRune(c.src[c.pSrc:]) 530 oldPDst := c.pDst 531 if !f(c) { 532 return false 533 } 534 if !unicode.Is(unicode.Soft_Dotted, r) { 535 return true 536 } 537 538 // We don't need to do an NFD normalization, as a soft-dotted rune never 539 // contains U+0307. See A.3. 540 541 i := 0 542 for ; i < maxIgnorable && c.next(); i++ { 543 switch c.info.cccType() { 544 case cccZero: 545 c.unreadRune() 546 return true 547 case cccAbove: 548 if c.hasPrefix("\u0307") { 549 // We don't do a full NFC, but rather combine runes for 550 // some of the common cases. (Returning NFC or 551 // preserving normal form is neither a requirement nor 552 // a possibility anyway). 553 if !c.next() { 554 return false 555 } 556 if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc { 557 s := "" 558 switch c.src[c.pSrc+1] { 559 case 0x80: // U+0300 COMBINING GRAVE ACCENT 560 s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE 561 case 0x81: // U+0301 COMBINING ACUTE ACCENT 562 s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE 563 case 0x83: // U+0303 COMBINING TILDE 564 s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE 565 case 0x88: // U+0308 COMBINING DIAERESIS 566 s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS 567 default: 568 } 569 if s != "" { 570 c.pDst = oldPDst 571 return c.writeString(s) 572 } 573 } 574 } 575 return c.copy() 576 default: 577 c.copy() 578 } 579 } 580 return i == maxIgnorable 581 } 582 } 583 584 func aztrUpper(f mapFunc) mapFunc { 585 return func(c *context) bool { 586 // i→İ; 587 if c.src[c.pSrc] == 'i' { 588 return c.writeString("İ") 589 } 590 return f(c) 591 } 592 } 593 594 func aztrLower(c *context) (done bool) { 595 // From CLDR: 596 // # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 597 // # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE 598 // İ→i; 599 // # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 600 // # This matches the behavior of the canonically equivalent I-dot_above 601 // # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 602 // # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. 603 // # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I 604 // I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ; 605 // I→ı ; 606 // ::Any-Lower(); 607 if c.hasPrefix("\u0130") { // İ 608 return c.writeString("i") 609 } 610 if c.src[c.pSrc] != 'I' { 611 return lower(c) 612 } 613 614 // We ignore the lower-case I for now, but insert it later when we know 615 // which form we need. 616 start := c.pSrc + c.sz 617 618 i := 0 619 Loop: 620 // We check for up to n ignorables before \u0307. As \u0307 is an 621 // ignorable as well, n is maxIgnorable-1. 622 for ; i < maxIgnorable && c.next(); i++ { 623 switch c.info.cccType() { 624 case cccAbove: 625 if c.hasPrefix("\u0307") { 626 return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307 627 } 628 done = true 629 break Loop 630 case cccZero: 631 c.unreadRune() 632 done = true 633 break Loop 634 default: 635 // We'll write this rune after we know which starter to use. 636 } 637 } 638 if i == maxIgnorable { 639 done = true 640 } 641 return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done 642 } 643 644 func nlTitle(c *context) bool { 645 // From CLDR: 646 // # Special titlecasing for Dutch initial "ij". 647 // ::Any-Title(); 648 // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29) 649 // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ; 650 if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' { 651 return title(c) 652 } 653 654 if !c.writeString("I") || !c.next() { 655 return false 656 } 657 if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' { 658 return c.writeString("J") 659 } 660 c.unreadRune() 661 return true 662 } 663 664 // Not part of CLDR, but see http://unicode.org/cldr/trac/ticket/7078. 665 func afnlRewrite(c *context) { 666 if c.hasPrefix("'") || c.hasPrefix("’") { 667 c.isMidWord = true 668 } 669 }