github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/golang/text/cases/map.go (about) 1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package cases 6 7 // This file contains the definitions of case mappings for all supported 8 // languages. The rules for the language-specific tailorings were taken and 9 // modified from the CLDR transform definitions in common/transforms. 10 11 import ( 12 "strings" 13 "unicode" 14 "unicode/utf8" 15 16 "github.com/insionng/yougam/libraries/x/text/language" 17 "github.com/insionng/yougam/libraries/x/text/transform" 18 "github.com/insionng/yougam/libraries/x/text/unicode/norm" 19 ) 20 21 // A mapFunc takes a context set to the current rune and writes the mapped 22 // version to the same context. It may advance the context to the next rune. It 23 // returns whether a checkpoint is possible: whether the pDst bytes written to 24 // dst so far won't need changing as we see more source bytes. 25 type mapFunc func(*context) bool 26 27 // maxIgnorable defines the maximum number of ignorables to consider for 28 // lookahead operations. 29 const maxIgnorable = 30 30 31 // supported lists the language tags for which we have tailorings. 32 const supported = "und af az el lt nl tr" 33 34 func init() { 35 tags := []language.Tag{} 36 for _, s := range strings.Split(supported, " ") { 37 tags = append(tags, language.MustParse(s)) 38 } 39 matcher = language.NewMatcher(tags) 40 Supported = language.NewCoverage(tags) 41 } 42 43 var ( 44 matcher language.Matcher 45 46 Supported language.Coverage 47 48 // We keep the following lists separate, instead of having a single per- 49 // language struct, to give the compiler a chance to remove unused code. 50 51 // Some uppercase mappers are stateless, so we can precompute the 52 // Transformers and save a bit on runtime allocations. 53 upperFunc = []mapFunc{ 54 nil, // und 55 nil, // af 56 aztrUpper(upper), // az 57 elUpper, // el 58 ltUpper(upper), // lt 59 nil, // nl 60 aztrUpper(upper), // tr 61 } 62 63 undUpper transform.Transformer = &undUpperCaser{} 64 65 lowerFunc = []mapFunc{ 66 lower, // und 67 lower, // af 68 aztrLower, // az 69 lower, // el 70 ltLower, // lt 71 lower, // nl 72 aztrLower, // tr 73 } 74 75 titleInfos = []struct { 76 title, lower mapFunc 77 rewrite func(*context) 78 }{ 79 {title, lower, nil}, // und 80 {title, lower, afnlRewrite}, // af 81 {aztrUpper(title), aztrLower, nil}, // az 82 {title, lower, nil}, // el 83 {ltUpper(title), ltLower, nil}, // lt 84 {nlTitle, lower, afnlRewrite}, // nl 85 {aztrUpper(title), aztrLower, nil}, // tr 86 } 87 ) 88 89 func makeUpper(t language.Tag, o options) transform.Transformer { 90 _, i, _ := matcher.Match(t) 91 f := upperFunc[i] 92 if f == nil { 93 return undUpper 94 } 95 return &simpleCaser{f: f} 96 } 97 98 func makeLower(t language.Tag, o options) transform.Transformer { 99 _, i, _ := matcher.Match(t) 100 f := lowerFunc[i] 101 if o.noFinalSigma { 102 return &simpleCaser{f: f} 103 } 104 return &lowerCaser{ 105 first: f, 106 midWord: finalSigma(f), 107 } 108 } 109 110 func makeTitle(t language.Tag, o options) transform.Transformer { 111 _, i, _ := matcher.Match(t) 112 x := &titleInfos[i] 113 lower := x.lower 114 if o.noLower { 115 lower = (*context).copy 116 } else if !o.noFinalSigma { 117 lower = finalSigma(lower) 118 } 119 return &titleCaser{ 120 title: x.title, 121 lower: lower, 122 rewrite: x.rewrite, 123 } 124 } 125 126 // TODO: consider a similar special case for the fast majority lower case. This 127 // is a bit more involved so will require some more precise benchmarking to 128 // justify it. 129 130 type undUpperCaser struct{ transform.NopResetter } 131 132 // undUpperCaser implements the Transformer interface for doing an upper case 133 // mapping for the root locale (und). It eliminates the need for an allocation 134 // as it prevents escaping by not using function pointers. 135 func (t *undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 136 c := context{dst: dst, src: src, atEOF: atEOF} 137 for c.next() { 138 upper(&c) 139 c.checkpoint() 140 } 141 return c.ret() 142 } 143 144 type simpleCaser struct { 145 context 146 f mapFunc 147 } 148 149 // simpleCaser implements the Transformer interface for doing a case operation 150 // on a rune-by-rune basis. 151 func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 152 t.context = context{dst: dst, src: src, atEOF: atEOF} 153 c := &t.context 154 for c.next() && t.f(c) { 155 c.checkpoint() 156 } 157 return c.ret() 158 } 159 160 // lowerCaser implements the Transformer interface. The default Unicode lower 161 // casing requires different treatment for the first and subsequent characters 162 // of a word, most notably to handle the Greek final Sigma. 163 type lowerCaser struct { 164 context 165 166 first, midWord mapFunc 167 } 168 169 func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 170 t.context = context{dst: dst, src: src, atEOF: atEOF} 171 c := &t.context 172 173 for isInterWord := true; c.next(); { 174 if isInterWord { 175 if c.info.isCased() { 176 if !t.first(c) { 177 break 178 } 179 isInterWord = false 180 } else if !c.copy() { 181 break 182 } 183 } else { 184 if c.info.isNotCasedAndNotCaseIgnorable() { 185 if !c.copy() { 186 break 187 } 188 isInterWord = true 189 } else if !t.midWord(c) { 190 break 191 } 192 } 193 c.checkpoint() 194 } 195 return c.ret() 196 } 197 198 // titleCaser implements the Transformer interface. Title casing algorithms 199 // distinguish between the first letter of a word and subsequent letters of the 200 // same word. It uses state to avoid requiring a potentially infinite lookahead. 201 type titleCaser struct { 202 context 203 204 // rune mappings used by the actual casing algorithms. 205 title, lower mapFunc 206 207 rewrite func(*context) 208 } 209 210 // Transform implements the standard Unicode title case algorithm as defined in 211 // Chapter 3 of The Unicode Standard: 212 // toTitlecase(X): Find the word boundaries in X according to Unicode Standard 213 // Annex #29, "Unicode Text Segmentation." For each word boundary, find the 214 // first cased character F following the word boundary. If F exists, map F to 215 // Titlecase_Mapping(F); then map all characters C between F and the following 216 // word boundary to Lowercase_Mapping(C). 217 func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 218 t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord} 219 c := &t.context 220 221 if !c.next() { 222 return c.ret() 223 } 224 225 for { 226 p := c.info 227 if t.rewrite != nil { 228 t.rewrite(c) 229 } 230 231 wasMid := p.isCaseIgnorableAndNonBreakStarter() 232 // Break out of this loop on failure to ensure we do not modify the 233 // state incorrectly. 234 if p.isCased() && !p.isCaseIgnorableAndNotCased() { 235 if !c.isMidWord { 236 if !t.title(c) { 237 break 238 } 239 c.isMidWord = true 240 } else if !t.lower(c) { 241 break 242 } 243 } else if !c.copy() { 244 break 245 } 246 247 // TODO: make this an "else if" if we can prove that no rune that does 248 // not match the first condition of the if statement can be a break. 249 if p.isBreak() { 250 c.isMidWord = false 251 } 252 253 // As we save the state of the transformer, it is safe to call 254 // checkpoint after any successful write. 255 c.checkpoint() 256 257 if !c.next() { 258 break 259 } 260 if wasMid && c.info.isCaseIgnorableAndNonBreakStarter() { 261 c.isMidWord = false 262 } 263 } 264 return c.ret() 265 } 266 267 // finalSigma adds Greek final Sigma handing to another casing function. It 268 // determines whether a lowercased sigma should be σ or ς, by looking ahead for 269 // case-ignorables and a cased letters. 270 func finalSigma(f mapFunc) mapFunc { 271 return func(c *context) bool { 272 // ::NFD(); 273 // # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA 274 // Σ } [:case-ignorable:]* [:cased:] → σ; 275 // [:cased:] [:case-ignorable:]* { Σ → ς; 276 // ::Any-Lower; 277 // ::NFC(); 278 279 if !c.hasPrefix("Σ") { 280 return f(c) 281 } 282 283 p := c.pDst 284 c.writeString("ς") 285 // We need to do one more iteration after maxIgnorable, as a cased 286 // letter is not an ignorable and may modify the result. 287 for i := 0; i < maxIgnorable+1; i++ { 288 if !c.next() { 289 return false 290 } 291 if !c.info.isCaseIgnorable() { 292 if c.info.isCased() { 293 // p+1 is guaranteed to be in bounds: if writing ς was 294 // successful, p+1 will contain the second byte of ς. If not, 295 // this function will have returned after c.next returned false. 296 c.dst[p+1]++ // ς → σ 297 } 298 c.unreadRune() 299 return true 300 } 301 // A case ignorable may also introduce a word break, so we may need 302 // to continue searching even after detecting a break. 303 c.isMidWord = c.isMidWord && !c.info.isBreak() 304 c.copy() 305 } 306 return true 307 } 308 } 309 310 // elUpper implements Greek upper casing, which entails removing a predefined 311 // set of non-blocked modifiers. Note that these accents should not be removed 312 // for title casing! 313 // Example: "Οδός" -> "ΟΔΟΣ". 314 func elUpper(c *context) bool { 315 // From CLDR: 316 // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ; 317 // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ; 318 319 r, _ := utf8.DecodeRune(c.src[c.pSrc:]) 320 oldPDst := c.pDst 321 if !upper(c) { 322 return false 323 } 324 if !unicode.Is(unicode.Greek, r) { 325 return true 326 } 327 i := 0 328 // Take the properties of the uppercased rune that is already written to the 329 // destination. This saves us the trouble of having to uppercase the 330 // decomposed rune again. 331 if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil { 332 // Restore the destination position and process the decomposed rune. 333 r, sz := utf8.DecodeRune(b) 334 if r <= 0xFF { // See A.6.1 335 return true 336 } 337 c.pDst = oldPDst 338 // Insert the first rune and ignore the modifiers. See A.6.2. 339 c.writeBytes(b[:sz]) 340 i = len(b[sz:]) / 2 // Greek modifiers are always of length 2. 341 } 342 343 for ; i < maxIgnorable && c.next(); i++ { 344 switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r { 345 // Above and Iota Subscript 346 case 0x0300, // U+0300 COMBINING GRAVE ACCENT 347 0x0301, // U+0301 COMBINING ACUTE ACCENT 348 0x0304, // U+0304 COMBINING MACRON 349 0x0306, // U+0306 COMBINING BREVE 350 0x0308, // U+0308 COMBINING DIAERESIS 351 0x0313, // U+0313 COMBINING COMMA ABOVE 352 0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE 353 0x0342, // U+0342 COMBINING GREEK PERISPOMENI 354 0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI 355 // No-op. Gobble the modifier. 356 357 default: 358 switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() { 359 case cccZero: 360 c.unreadRune() 361 return true 362 363 // We don't need to test for IotaSubscript as the only rune that 364 // qualifies (U+0345) was already excluded in the switch statement 365 // above. See A.4. 366 367 case cccAbove: 368 return c.copy() 369 default: 370 // Some other modifier. We're still allowed to gobble Greek 371 // modifiers after this. 372 c.copy() 373 } 374 } 375 } 376 return i == maxIgnorable 377 } 378 379 func ltLower(c *context) bool { 380 // From CLDR: 381 // # Introduce an explicit dot above when lowercasing capital I's and J's 382 // # whenever there are more accents above. 383 // # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) 384 // # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I 385 // # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J 386 // # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK 387 // # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE 388 // # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE 389 // # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE 390 // ::NFD(); 391 // I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307; 392 // J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307; 393 // Į } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → į \u0307; 394 // Ì → i \u0307 \u0300; 395 // Í → i \u0307 \u0301; 396 // Ĩ → i \u0307 \u0303; 397 // ::Any-Lower(); 398 // ::NFC(); 399 400 i := 0 401 if r := c.src[c.pSrc]; r < utf8.RuneSelf { 402 lower(c) 403 if r != 'I' && r != 'J' { 404 return true 405 } 406 } else { 407 p := norm.NFD.Properties(c.src[c.pSrc:]) 408 if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') { 409 // UTF-8 optimization: the decomposition will only have an above 410 // modifier if the last rune of the decomposition is in [U+300-U+311]. 411 // In all other cases, a decomposition starting with I is always 412 // an I followed by modifiers that are not cased themselves. See A.2. 413 if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4. 414 if !c.writeBytes(d[:1]) { 415 return false 416 } 417 c.dst[c.pDst-1] += 'a' - 'A' // lower 418 419 // Assumption: modifier never changes on lowercase. See A.1. 420 // Assumption: all modifiers added have CCC = Above. See A.2.3. 421 return c.writeString("\u0307") && c.writeBytes(d[1:]) 422 } 423 // In all other cases the additional modifiers will have a CCC 424 // that is less than 230 (Above). We will insert the U+0307, if 425 // needed, after these modifiers so that a string in FCD form 426 // will remain so. See A.2.2. 427 lower(c) 428 i = 1 429 } else { 430 return lower(c) 431 } 432 } 433 434 for ; i < maxIgnorable && c.next(); i++ { 435 switch c.info.cccType() { 436 case cccZero: 437 c.unreadRune() 438 return true 439 case cccAbove: 440 return c.writeString("\u0307") && c.copy() // See A.1. 441 default: 442 c.copy() // See A.1. 443 } 444 } 445 return i == maxIgnorable 446 } 447 448 func ltUpper(f mapFunc) mapFunc { 449 return func(c *context) bool { 450 // From CLDR: 451 // ::NFD(); 452 // [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ; 453 // ::Any-Upper(); 454 // ::NFC(); 455 456 // TODO: See A.5. A soft-dotted rune never has an exception. This would 457 // allow us to overload the exception bit and encode this property in 458 // info. Need to measure performance impact of this. 459 r, _ := utf8.DecodeRune(c.src[c.pSrc:]) 460 oldPDst := c.pDst 461 if !f(c) { 462 return false 463 } 464 if !unicode.Is(unicode.Soft_Dotted, r) { 465 return true 466 } 467 468 // We don't need to do an NFD normalization, as a soft-dotted rune never 469 // contains U+0307. See A.3. 470 471 i := 0 472 for ; i < maxIgnorable && c.next(); i++ { 473 switch c.info.cccType() { 474 case cccZero: 475 c.unreadRune() 476 return true 477 case cccAbove: 478 if c.hasPrefix("\u0307") { 479 // We don't do a full NFC, but rather combine runes for 480 // some of the common cases. (Returning NFC or 481 // preserving normal form is neither a requirement nor 482 // a possibility anyway). 483 if !c.next() { 484 return false 485 } 486 if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc { 487 s := "" 488 switch c.src[c.pSrc+1] { 489 case 0x80: // U+0300 COMBINING GRAVE ACCENT 490 s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE 491 case 0x81: // U+0301 COMBINING ACUTE ACCENT 492 s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE 493 case 0x83: // U+0303 COMBINING TILDE 494 s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE 495 case 0x88: // U+0308 COMBINING DIAERESIS 496 s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS 497 default: 498 } 499 if s != "" { 500 c.pDst = oldPDst 501 return c.writeString(s) 502 } 503 } 504 } 505 return c.copy() 506 default: 507 c.copy() 508 } 509 } 510 return i == maxIgnorable 511 } 512 } 513 514 func aztrUpper(f mapFunc) mapFunc { 515 return func(c *context) bool { 516 // i→İ; 517 if c.src[c.pSrc] == 'i' { 518 return c.writeString("İ") 519 } 520 return f(c) 521 } 522 } 523 524 func aztrLower(c *context) (done bool) { 525 // From CLDR: 526 // # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 527 // # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE 528 // İ→i; 529 // # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 530 // # This matches the behavior of the canonically equivalent I-dot_above 531 // # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 532 // # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. 533 // # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I 534 // I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ; 535 // I→ı ; 536 // ::Any-Lower(); 537 if c.hasPrefix("\u0130") { // İ 538 return c.writeString("i") 539 } 540 if c.src[c.pSrc] != 'I' { 541 return lower(c) 542 } 543 544 // We ignore the lower-case I for now, but insert it later when we know 545 // which form we need. 546 start := c.pSrc + c.sz 547 548 i := 0 549 Loop: 550 // We check for up to n ignorables before \u0307. As \u0307 is an 551 // ignorable as well, n is maxIgnorable-1. 552 for ; i < maxIgnorable && c.next(); i++ { 553 switch c.info.cccType() { 554 case cccAbove: 555 if c.hasPrefix("\u0307") { 556 return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307 557 } 558 done = true 559 break Loop 560 case cccZero: 561 c.unreadRune() 562 done = true 563 break Loop 564 default: 565 // We'll write this rune after we know which starter to use. 566 } 567 } 568 if i == maxIgnorable { 569 done = true 570 } 571 return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done 572 } 573 574 func nlTitle(c *context) bool { 575 // From CLDR: 576 // # Special titlecasing for Dutch initial "ij". 577 // ::Any-Title(); 578 // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29) 579 // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ; 580 if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' { 581 return title(c) 582 } 583 584 if !c.writeString("I") || !c.next() { 585 return false 586 } 587 if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' { 588 return c.writeString("J") 589 } 590 c.unreadRune() 591 return true 592 } 593 594 // Not part of CLDR, but see http://unicode.org/cldr/trac/ticket/7078. 595 func afnlRewrite(c *context) { 596 if c.hasPrefix("'") || c.hasPrefix("’") { 597 c.isMidWord = true 598 } 599 }