github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/cases/trieval.go (about) 1 // This file was generated by go generate; DO NOT EDIT 2 3 package cases 4 5 // This file contains definitions for interpreting the trie value of the case 6 // trie generated by "go run gen*.go". It is shared by both the generator 7 // program and the resultant package. Sharing is achieved by the generator 8 // copying gen_trieval.go to trieval.go and changing what's above this comment. 9 10 // info holds case information for a single rune. It is the value returned 11 // by a trie lookup. Most mapping information can be stored in a single 16-bit 12 // value. If not, for example when a rune is mapped to multiple runes, the value 13 // stores some basic case data and an index into an array with additional data. 14 // 15 // The per-rune values have the following format: 16 // 17 // if (exception) { 18 // 15..5 unsigned exception index 19 // 4 unused 20 // } else { 21 // 15..7 XOR pattern or index to XOR pattern for case mapping 22 // 6 index: interpret the XOR pattern as an index 23 // 5..4 CCC: zero (normal or break), above or other 24 // } 25 // 3 exception: interpret this value as an exception index 26 // 2..0 case mode 27 // 28 // For the non-exceptional cases, a rune must be either uncased, lowercase or 29 // uppercase. If the rune is cased, the XOR pattern maps either a lowercase 30 // rune to uppercase or an uppercase rune to lowercase (applied to the 10 31 // least-significant bits of the rune). 32 // 33 // See the definitions below for a more detailed description of the various 34 // bits. 35 type info uint16 36 37 const ( 38 casedMask = 0x0003 39 fullCasedMask = 0x0007 40 ignorableMask = 0x0006 41 ignorableValue = 0x0004 42 43 exceptionBit = 1 << 3 44 exceptionShift = 5 45 numExceptionBits = 11 46 47 xorIndexBit = 1 << 6 48 xorShift = 7 49 50 // There is no mapping if all xor bits and the exception bit are zero. 51 hasMappingMask = 0xffc0 | exceptionBit 52 ) 53 54 // The case mode bits encodes the case type of a rune. This includes uncased, 55 // title, upper and lower case and case ignorable. (For a definition of these 56 // terms see Chapter 3 of The Unicode Standard Core Specification.) In some rare 57 // cases, a rune can be both cased and case-ignorable. This is encoded by 58 // cIgnorableCased. A rune of this type is always lower case. Some runes are 59 // cased while not having a mapping. 60 // 61 // A common pattern for scripts in the Unicode standard is for upper and lower 62 // case runes to alternate for increasing rune values (e.g. the accented Latin 63 // ranges starting from U+0100 and U+1E00 among others andsome Cyrillic 64 // characters). We use this property by defining a cXORCase mode, where the case 65 // mode (always upper or lower case) is derived from the rune value. As the XOR 66 // pattern for case mappings is often identical for successive runes, using 67 // cXORCase can result in large series of identical trie values. This, in turn, 68 // allows us to better compress the trie blocks. 69 const ( 70 cUncased info = iota // 000 71 cTitle // 001 72 cLower // 010 73 cUpper // 011 74 cIgnorableUncased // 100 75 cIgnorableCased // 101 // lower case if mappings exist 76 cXORCase // 11x // case is cLower | ((rune&1) ^ x) 77 78 maxCaseMode = cUpper 79 ) 80 81 func (c info) isCased() bool { 82 return c&casedMask != 0 83 } 84 85 func (c info) isCaseIgnorable() bool { 86 return c&ignorableMask == ignorableValue 87 } 88 89 func (c info) isCaseIgnorableAndNonBreakStarter() bool { 90 return c&(fullCasedMask|cccMask) == (ignorableValue | cccZero) 91 } 92 93 func (c info) isNotCasedAndNotCaseIgnorable() bool { 94 return c&fullCasedMask == 0 95 } 96 97 func (c info) isCaseIgnorableAndNotCased() bool { 98 return c&fullCasedMask == cIgnorableUncased 99 } 100 101 // The case mapping implementation will need to know about various Canonical 102 // Combining Class (CCC) values. We encode two of these in the trie value: 103 // cccZero (0) and cccAbove (230). If the value is cccOther, it means that 104 // CCC(r) > 0, but not 230. A value of cccBreak means that CCC(r) == 0 and that 105 // the rune also has the break category Break (see below). 106 const ( 107 cccBreak info = iota << 4 108 cccZero 109 cccAbove 110 cccOther 111 112 cccMask = cccBreak | cccZero | cccAbove | cccOther 113 ) 114 115 func (c info) cccVal() info { 116 if c&exceptionBit != 0 { 117 return cccZero 118 } 119 return c & cccMask 120 } 121 122 func (c info) cccType() info { 123 ccc := c.cccVal() 124 if ccc <= cccZero { 125 return cccZero 126 } 127 return ccc 128 } 129 130 const ( 131 starter = 0 132 above = 230 133 iotaSubscript = 240 134 ) 135 136 // TODO: Implement full Unicode breaking algorithm: 137 // 1) Implement breaking in separate package. 138 // 2) Use the breaker here. 139 // 3) Compare table size and performance of using the more generic breaker. 140 // 141 // Note that we can extend the current algorithm to be much more accurate. This 142 // only makes sense, though, if the performance and/or space penalty of using 143 // the generic breaker is big. Extra data will only be needed for non-cased 144 // runes, which means there are sufficient bits left in the caseType. 145 // Also note that the standard breaking algorithm doesn't always make sense 146 // for title casing. For example, a4a -> A4a, but a"4a -> A"4A (where " stands 147 // for modifier \u0308). 148 // ICU prohibits breaking in such cases as well. 149 150 // For the purpose of title casing we use an approximation of the Unicode Word 151 // Breaking algorithm defined in Annex #29: 152 // http://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table. 153 // 154 // For our approximation, we group the Word Break types into the following 155 // categories, with associated rules: 156 // 157 // 1) Letter: 158 // ALetter, Hebrew_Letter, Numeric, ExtendNumLet, Extend. 159 // Rule: Never break between consecutive runes of this category. 160 // 161 // 2) Mid: 162 // Format, MidLetter, MidNumLet, Single_Quote. 163 // (Cf. case-ignorable: MidLetter, MidNumLet or cat is Mn, Me, Cf, Lm or Sk). 164 // Rule: Don't break between Letter and Mid, but break between two Mids. 165 // 166 // 3) Break: 167 // Any other category, including NewLine, CR, LF and Double_Quote. These 168 // categories should always result in a break between two cased letters. 169 // Rule: Always break. 170 // 171 // Note 1: the Katakana and MidNum categories can, in esoteric cases, result in 172 // preventing a break between two cased letters. For now we will ignore this 173 // (e.g. [ALetter] [ExtendNumLet] [Katakana] [ExtendNumLet] [ALetter] and 174 // [ALetter] [Numeric] [MidNum] [Numeric] [ALetter].) 175 // 176 // Note 2: the rule for Mid is very approximate, but works in most cases. To 177 // improve, we could store the categories in the trie value and use a FA to 178 // manage breaks. See TODO comment above. 179 // 180 // Note 3: according to the spec, it is possible for the Extend category to 181 // introduce breaks between other categories grouped in Letter. However, this 182 // is undesirable for our purposes. ICU prevents breaks in such cases as well. 183 184 // isBreak returns whether this rune should introduce a break. 185 func (c info) isBreak() bool { 186 return c.cccVal() == cccBreak 187 } 188 189 // isLetter returns whether the rune is of break type ALetter, Hebrew_Letter, 190 // Numeric, ExtendNumLet, or Extend. 191 func (c info) isLetter() bool { 192 ccc := c.cccVal() 193 if ccc == cccZero { 194 return !c.isCaseIgnorable() 195 } 196 return ccc != cccBreak 197 } 198 199 // The exceptions slice holds data that does not fit in a normal info entry. 200 // The entry is pointed to by the exception index in an entry. It has the 201 // following format: 202 // 203 // Header: 204 // byte 0: // TODO: case folding not implemented yet. 205 // 7 conditional case folding 206 // 6 conditional special casing 207 // 6..3 length of case folding 208 // 2..0 length of closure mapping (up to 7). 209 // 210 // byte 1: 211 // 7..6 unused 212 // 5..3 length of 1st mapping of case type 213 // 2..0 length of 2nd mapping of case type 214 // 215 // case 1st 2nd 216 // lower -> upper, title 217 // upper -> lower, title 218 // title -> lower, upper 219 // 220 // Lengths with the value 0x7 indicate no value and implies no change. 221 // A length of 0 indicates a mapping to zero-length string. 222 // 223 // Body bytes: 224 // lowercase mapping bytes 225 // uppercase mapping bytes 226 // titlecase mapping bytes 227 // case folding bytes 228 // closure mapping bytes 229 // 230 // Fallbacks: 231 // missing fold -> lower 232 // missing title -> upper 233 // all missing -> original rune 234 // 235 // exceptions starts with a dummy byte to enforce that there is no zero index 236 // value. 237 const ( 238 lengthMask = 0x07 239 lengthBits = 3 240 noChange = 0 241 ) 242 243 // References to generated trie. 244 245 var trie = newCaseTrie(0) 246 247 var sparse = sparseBlocks{ 248 values: sparseValues[:], 249 offsets: sparseOffsets[:], 250 } 251 252 // Sparse block lookup code. 253 254 // valueRange is an entry in a sparse block. 255 type valueRange struct { 256 value uint16 257 lo, hi byte 258 } 259 260 type sparseBlocks struct { 261 values []valueRange 262 offsets []uint16 263 } 264 265 // lookup returns the value from values block n for byte b using binary search. 266 func (s *sparseBlocks) lookup(n uint32, b byte) uint16 { 267 lo := s.offsets[n] 268 hi := s.offsets[n+1] 269 for lo < hi { 270 m := lo + (hi-lo)/2 271 r := s.values[m] 272 if r.lo <= b && b <= r.hi { 273 return r.value 274 } 275 if b < r.lo { 276 hi = m 277 } else { 278 lo = m + 1 279 } 280 } 281 return 0 282 } 283 284 // lastRuneForTesting is the last rune used for testing. Everything after this 285 // is boring. 286 const lastRuneForTesting = rune(0x1FFFF)