github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/golang/text/cases/info.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package cases 6 7 func (c info) cccVal() info { 8 if c&exceptionBit != 0 { 9 return info(exceptions[c>>exceptionShift]) & cccMask 10 } 11 return c & cccMask 12 } 13 14 func (c info) cccType() info { 15 ccc := c.cccVal() 16 if ccc <= cccZero { 17 return cccZero 18 } 19 return ccc 20 } 21 22 // TODO: Implement full Unicode breaking algorithm: 23 // 1) Implement breaking in separate package. 24 // 2) Use the breaker here. 25 // 3) Compare table size and performance of using the more generic breaker. 26 // 27 // Note that we can extend the current algorithm to be much more accurate. This 28 // only makes sense, though, if the performance and/or space penalty of using 29 // the generic breaker is big. Extra data will only be needed for non-cased 30 // runes, which means there are sufficient bits left in the caseType. 31 // Also note that the standard breaking algorithm doesn't always make sense 32 // for title casing. For example, a4a -> A4a, but a"4a -> A"4A (where " stands 33 // for modifier \u0308). 34 // ICU prohibits breaking in such cases as well. 35 36 // For the purpose of title casing we use an approximation of the Unicode Word 37 // Breaking algorithm defined in Annex #29: 38 // http://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table. 39 // 40 // For our approximation, we group the Word Break types into the following 41 // categories, with associated rules: 42 // 43 // 1) Letter: 44 // ALetter, Hebrew_Letter, Numeric, ExtendNumLet, Extend. 45 // Rule: Never break between consecutive runes of this category. 46 // 47 // 2) Mid: 48 // Format, MidLetter, MidNumLet, Single_Quote. 49 // (Cf. case-ignorable: MidLetter, MidNumLet or cat is Mn, Me, Cf, Lm or Sk). 50 // Rule: Don't break between Letter and Mid, but break between two Mids. 51 // 52 // 3) Break: 53 // Any other category, including NewLine, CR, LF and Double_Quote. These 54 // categories should always result in a break between two cased letters. 55 // Rule: Always break. 56 // 57 // Note 1: the Katakana and MidNum categories can, in esoteric cases, result in 58 // preventing a break between two cased letters. For now we will ignore this 59 // (e.g. [ALetter] [ExtendNumLet] [Katakana] [ExtendNumLet] [ALetter] and 60 // [ALetter] [Numeric] [MidNum] [Numeric] [ALetter].) 61 // 62 // Note 2: the rule for Mid is very approximate, but works in most cases. To 63 // improve, we could store the categories in the trie value and use a FA to 64 // manage breaks. See TODO comment above. 65 // 66 // Note 3: according to the spec, it is possible for the Extend category to 67 // introduce breaks between other categories grouped in Letter. However, this 68 // is undesirable for our purposes. ICU prevents breaks in such cases as well. 69 70 // isBreak returns whether this rune should introduce a break. 71 func (c info) isBreak() bool { 72 return c.cccVal() == cccBreak 73 } 74 75 // isLetter returns whether the rune is of break type ALetter, Hebrew_Letter, 76 // Numeric, ExtendNumLet, or Extend. 77 func (c info) isLetter() bool { 78 ccc := c.cccVal() 79 if ccc == cccZero { 80 return !c.isCaseIgnorable() 81 } 82 return ccc != cccBreak 83 }