github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/golang/text/cases/info.go

github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/golang/text/cases/info.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package cases
     6  
     7  func (c info) cccVal() info {
     8  	if c&exceptionBit != 0 {
     9  		return info(exceptions[c>>exceptionShift]) & cccMask
    10  	}
    11  	return c & cccMask
    12  }
    13  
    14  func (c info) cccType() info {
    15  	ccc := c.cccVal()
    16  	if ccc <= cccZero {
    17  		return cccZero
    18  	}
    19  	return ccc
    20  }
    21  
    22  // TODO: Implement full Unicode breaking algorithm:
    23  // 1) Implement breaking in separate package.
    24  // 2) Use the breaker here.
    25  // 3) Compare table size and performance of using the more generic breaker.
    26  //
    27  // Note that we can extend the current algorithm to be much more accurate. This
    28  // only makes sense, though, if the performance and/or space penalty of using
    29  // the generic breaker is big. Extra data will only be needed for non-cased
    30  // runes, which means there are sufficient bits left in the caseType.
    31  // Also note that the standard breaking algorithm doesn't always make sense
    32  // for title casing. For example, a4a -> A4a, but a"4a -> A"4A (where " stands
    33  // for modifier \u0308).
    34  // ICU prohibits breaking in such cases as well.
    35  
    36  // For the purpose of title casing we use an approximation of the Unicode Word
    37  // Breaking algorithm defined in Annex #29:
// http://www.unicode.org/reports/tr29/#Word_Boundaries.
    39  //
    40  // For our approximation, we group the Word Break types into the following
    41  // categories, with associated rules:
    42  //
    43  // 1) Letter:
    44  //    ALetter, Hebrew_Letter, Numeric, ExtendNumLet, Extend.
    45  //    Rule: Never break between consecutive runes of this category.
    46  //
    47  // 2) Mid:
    48  //    Format, MidLetter, MidNumLet, Single_Quote.
    49  //    (Cf. case-ignorable: MidLetter, MidNumLet or cat is Mn, Me, Cf, Lm or Sk).
    50  //    Rule: Don't break between Letter and Mid, but break between two Mids.
    51  //
    52  // 3) Break:
    53  //    Any other category, including NewLine, CR, LF and Double_Quote. These
    54  //    categories should always result in a break between two cased letters.
    55  //    Rule: Always break.
    56  //
    57  // Note 1: the Katakana and MidNum categories can, in esoteric cases, result in
    58  // preventing a break between two cased letters. For now we will ignore this
    59  // (e.g. [ALetter] [ExtendNumLet] [Katakana] [ExtendNumLet] [ALetter] and
    60  // [ALetter] [Numeric] [MidNum] [Numeric] [ALetter].)
    61  //
    62  // Note 2: the rule for Mid is very approximate, but works in most cases. To
    63  // improve, we could store the categories in the trie value and use a FA to
    64  // manage breaks. See TODO comment above.
    65  //
    66  // Note 3: according to the spec, it is possible for the Extend category to
    67  // introduce breaks between other categories grouped in Letter. However, this
    68  // is undesirable for our purposes. ICU prevents breaks in such cases as well.
    69  
    70  // isBreak returns whether this rune should introduce a break.
    71  func (c info) isBreak() bool {
    72  	return c.cccVal() == cccBreak
    73  }
    74  
    75  // isLetter returns whether the rune is of break type ALetter, Hebrew_Letter,
    76  // Numeric, ExtendNumLet, or Extend.
    77  func (c info) isLetter() bool {
    78  	ccc := c.cccVal()
    79  	if ccc == cccZero {
    80  		return !c.isCaseIgnorable()
    81  	}
    82  	return ccc != cccBreak
    83  }