// Source: github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/cases/gen_trieval.go

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore

package main

// This file contains definitions for interpreting the trie value of the case
// trie generated by "go run gen*.go". It is shared by both the generator
// program and the resultant package. Sharing is achieved by the generator
// copying gen_trieval.go to trieval.go and changing what's above this comment.

// info holds case information for a single rune. It is the value returned
// by a trie lookup. Most mapping information can be stored in a single 16-bit
// value. If not, for example when a rune is mapped to multiple runes, the value
// stores some basic case data and an index into an array with additional data.
//
// The per-rune values have the following format:
//
//   if (exception) {
//     15..5  unsigned exception index
//         4  unused
//   } else {
//     15..7  XOR pattern or index to XOR pattern for case mapping
//         6  index: interpret the XOR pattern as an index
//      5..4  CCC: zero (normal or break), above or other
//   }
//         3  exception: interpret this value as an exception index
//      2..0  case mode
//
// For the non-exceptional cases, a rune must be either uncased, lowercase or
// uppercase. If the rune is cased, the XOR pattern maps either a lowercase
// rune to uppercase or an uppercase rune to lowercase (applied to the 10
// least-significant bits of the rune).
//
// See the definitions below for a more detailed description of the various
// bits.
type info uint16

const (
	casedMask      = 0x0003 // low two case-mode bits; zero only for modes 000 and 100
	fullCasedMask  = 0x0007 // all three case-mode bits (2..0)
	ignorableMask  = 0x0006 // case-mode bits 2..1
	ignorableValue = 0x0004 // value of the ignorableMask bits for the modes 10x

	exceptionBit     = 1 << 3 // bit 3: interpret the value as an exception index
	exceptionShift   = 5      // lowest bit of the exception index (bits 15..5)
	numExceptionBits = 11     // width of the exception index (bits 15..5)

	xorIndexBit = 1 << 6 // bit 6: interpret the XOR pattern as an index
	xorShift    = 7      // lowest bit of the XOR pattern or index (bits 15..7)

	// There is no mapping if all xor bits and the exception bit are zero.
	hasMappingMask = 0xffc0 | exceptionBit
)

// The case mode bits encode the case type of a rune. This includes uncased,
// title, upper and lower case and case ignorable. (For a definition of these
// terms see Chapter 3 of The Unicode Standard Core Specification.) In some rare
// cases, a rune can be both cased and case-ignorable. This is encoded by
// cIgnorableCased. A rune of this type is always lower case. Some runes are
// cased while not having a mapping.
//
// A common pattern for scripts in the Unicode standard is for upper and lower
// case runes to alternate for increasing rune values (e.g. the accented Latin
// ranges starting from U+0100 and U+1E00 among others and some Cyrillic
// characters). We use this property by defining a cXORCase mode, where the case
// mode (always upper or lower case) is derived from the rune value. As the XOR
// pattern for case mappings is often identical for successive runes, using
// cXORCase can result in large series of identical trie values. This, in turn,
// allows us to better compress the trie blocks.
73 const ( 74 cUncased info = iota // 000 75 cTitle // 001 76 cLower // 010 77 cUpper // 011 78 cIgnorableUncased // 100 79 cIgnorableCased // 101 // lower case if mappings exist 80 cXORCase // 11x // case is cLower | ((rune&1) ^ x) 81 82 maxCaseMode = cUpper 83 ) 84 85 func (c info) isCased() bool { 86 return c&casedMask != 0 87 } 88 89 func (c info) isCaseIgnorable() bool { 90 return c&ignorableMask == ignorableValue 91 } 92 93 func (c info) isCaseIgnorableAndNonBreakStarter() bool { 94 return c&(fullCasedMask|cccMask) == (ignorableValue | cccZero) 95 } 96 97 func (c info) isNotCasedAndNotCaseIgnorable() bool { 98 return c&fullCasedMask == 0 99 } 100 101 func (c info) isCaseIgnorableAndNotCased() bool { 102 return c&fullCasedMask == cIgnorableUncased 103 } 104 105 // The case mapping implementation will need to know about various Canonical 106 // Combining Class (CCC) values. We encode two of these in the trie value: 107 // cccZero (0) and cccAbove (230). If the value is cccOther, it means that 108 // CCC(r) > 0, but not 230. A value of cccBreak means that CCC(r) == 0 and that 109 // the rune also has the break category Break (see below). 110 const ( 111 cccBreak info = iota << 4 112 cccZero 113 cccAbove 114 cccOther 115 116 cccMask = cccBreak | cccZero | cccAbove | cccOther 117 ) 118 119 func (c info) cccVal() info { 120 if c&exceptionBit != 0 { 121 return cccZero 122 } 123 return c & cccMask 124 } 125 126 func (c info) cccType() info { 127 ccc := c.cccVal() 128 if ccc <= cccZero { 129 return cccZero 130 } 131 return ccc 132 } 133 134 const ( 135 starter = 0 136 above = 230 137 iotaSubscript = 240 138 ) 139 140 // TODO: Implement full Unicode breaking algorithm: 141 // 1) Implement breaking in separate package. 142 // 2) Use the breaker here. 143 // 3) Compare table size and performance of using the more generic breaker. 144 // 145 // Note that we can extend the current algorithm to be much more accurate. 
// This only makes sense, though, if the performance and/or space penalty of
// using the generic breaker is big. Extra data will only be needed for
// non-cased runes, which means there are sufficient bits left in the caseType.
// Also note that the standard breaking algorithm doesn't always make sense
// for title casing. For example, a4a -> A4a, but a"4a -> A"4A (where " stands
// for modifier \u0308).
// ICU prohibits breaking in such cases as well.

// For the purpose of title casing we use an approximation of the Unicode Word
// Breaking algorithm defined in Annex #29:
// http://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table.
//
// For our approximation, we group the Word Break types into the following
// categories, with associated rules:
//
// 1) Letter:
//    ALetter, Hebrew_Letter, Numeric, ExtendNumLet, Extend.
//    Rule: Never break between consecutive runes of this category.
//
// 2) Mid:
//    Format, MidLetter, MidNumLet, Single_Quote.
//    (Cf. case-ignorable: MidLetter, MidNumLet or cat is Mn, Me, Cf, Lm or Sk).
//    Rule: Don't break between Letter and Mid, but break between two Mids.
//
// 3) Break:
//    Any other category, including NewLine, CR, LF and Double_Quote. These
//    categories should always result in a break between two cased letters.
//    Rule: Always break.
//
// Note 1: the Katakana and MidNum categories can, in esoteric cases, result in
// preventing a break between two cased letters. For now we will ignore this
// (e.g. [ALetter] [ExtendNumLet] [Katakana] [ExtendNumLet] [ALetter] and
// [ALetter] [Numeric] [MidNum] [Numeric] [ALetter].)
//
// Note 2: the rule for Mid is very approximate, but works in most cases. To
// improve, we could store the categories in the trie value and use a FA to
// manage breaks. See TODO comment above.
183 // 184 // Note 3: according to the spec, it is possible for the Extend category to 185 // introduce breaks between other categories grouped in Letter. However, this 186 // is undesirable for our purposes. ICU prevents breaks in such cases as well. 187 188 // isBreak returns whether this rune should introduce a break. 189 func (c info) isBreak() bool { 190 return c.cccVal() == cccBreak 191 } 192 193 // isLetter returns whether the rune is of break type ALetter, Hebrew_Letter, 194 // Numeric, ExtendNumLet, or Extend. 195 func (c info) isLetter() bool { 196 ccc := c.cccVal() 197 if ccc == cccZero { 198 return !c.isCaseIgnorable() 199 } 200 return ccc != cccBreak 201 } 202 203 // The exceptions slice holds data that does not fit in a normal info entry. 204 // The entry is pointed to by the exception index in an entry. It has the 205 // following format: 206 // 207 // Header: 208 // byte 0: // TODO: case folding not implemented yet. 209 // 7 conditional case folding 210 // 6 conditional special casing 211 // 6..3 length of case folding 212 // 2..0 length of closure mapping (up to 7). 213 // 214 // byte 1: 215 // 7..6 unused 216 // 5..3 length of 1st mapping of case type 217 // 2..0 length of 2nd mapping of case type 218 // 219 // case 1st 2nd 220 // lower -> upper, title 221 // upper -> lower, title 222 // title -> lower, upper 223 // 224 // Lengths with the value 0x7 indicate no value and implies no change. 225 // A length of 0 indicates a mapping to zero-length string. 226 // 227 // Body bytes: 228 // lowercase mapping bytes 229 // uppercase mapping bytes 230 // titlecase mapping bytes 231 // case folding bytes 232 // closure mapping bytes 233 // 234 // Fallbacks: 235 // missing fold -> lower 236 // missing title -> upper 237 // all missing -> original rune 238 // 239 // exceptions starts with a dummy byte to enforce that there is no zero index 240 // value. 
241 const ( 242 lengthMask = 0x07 243 lengthBits = 3 244 noChange = 0 245 ) 246 247 // References to generated trie. 248 249 var trie = newCaseTrie(0) 250 251 var sparse = sparseBlocks{ 252 values: sparseValues[:], 253 offsets: sparseOffsets[:], 254 } 255 256 // Sparse block lookup code. 257 258 // valueRange is an entry in a sparse block. 259 type valueRange struct { 260 value uint16 261 lo, hi byte 262 } 263 264 type sparseBlocks struct { 265 values []valueRange 266 offsets []uint16 267 } 268 269 // lookup returns the value from values block n for byte b using binary search. 270 func (s *sparseBlocks) lookup(n uint32, b byte) uint16 { 271 lo := s.offsets[n] 272 hi := s.offsets[n+1] 273 for lo < hi { 274 m := lo + (hi-lo)/2 275 r := s.values[m] 276 if r.lo <= b && b <= r.hi { 277 return r.value 278 } 279 if b < r.lo { 280 hi = m 281 } else { 282 lo = m + 1 283 } 284 } 285 return 0 286 } 287 288 // lastRuneForTesting is the last rune used for testing. Everything after this 289 // is boring. 290 const lastRuneForTesting = rune(0x1FFFF)