github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/golang/text/cases/trieval.go (about) 1 // This file was generated by go generate; DO NOT EDIT 2 3 package cases 4 5 // This file contains definitions for interpreting the trie value of the case 6 // trie generated by "go run gen*.go". It is shared by both the generator 7 // program and the resultant package. Sharing is achieved by the generator 8 // copying gen_trieval.go to trieval.go and changing what's above this comment. 9 10 // info holds case information for a single rune. It is the value returned 11 // by a trie lookup. Most mapping information can be stored in a single 16-bit 12 // value. If not, for example when a rune is mapped to multiple runes, the value 13 // stores some basic case data and an index into an array with additional data. 14 // 15 // The per-rune values have the following format: 16 // 17 // if (exception) { 18 // 15..5 unsigned exception index 19 // 4 unused 20 // } else { 21 // 15..8 XOR pattern or index to XOR pattern for case mapping 22 // Only 13..8 are used for XOR patterns. 23 // 7 inverseFold (fold to upper, not to lower) 24 // 6 index: interpret the XOR pattern as an index 25 // 5..4 CCC: zero (normal or break), above or other 26 // } 27 // 3 exception: interpret this value as an exception index 28 // (TODO: is this bit necessary? Probably implied from case mode.) 29 // 2..0 case mode 30 // 31 // For the non-exceptional cases, a rune must be either uncased, lowercase or 32 // uppercase. If the rune is cased, the XOR pattern maps either a lowercase 33 // rune to uppercase or an uppercase rune to lowercase (applied to the 10 34 // least-significant bits of the rune). 35 // 36 // See the definitions below for a more detailed description of the various 37 // bits. 
type info uint16

// Masks over the three case mode bits (2..0) of an info value.
const (
	// casedMask selects the two low case mode bits; isCased treats a
	// non-zero result as "cased".
	casedMask = 0x0003
	// fullCasedMask selects all three case mode bits.
	fullCasedMask = 0x0007
	// A value is case-ignorable when its bits under ignorableMask equal
	// ignorableValue.
	ignorableMask  = 0x0006
	ignorableValue = 0x0004
)

// Flag bits and field positions for the remainder of an info value.
const (
	// Bit 7: fold to upper, not to lower.
	inverseFoldBit = 1 << 7

	// Bit 3 marks the value as an exception; the exception index then
	// occupies the numExceptionBits bits starting at exceptionShift.
	exceptionBit     = 1 << 3
	exceptionShift   = 5
	numExceptionBits = 11

	// Bit 6: interpret the XOR pattern as an index. The XOR pattern itself
	// starts at bit xorShift.
	xorIndexBit = 1 << 6
	xorShift    = 8

	// There is no mapping if all xor bits and the exception bit are zero.
	hasMappingMask = 0xffc0 | exceptionBit
)

// The case mode bits encode the case type of a rune. This includes uncased,
// title, upper and lower case and case ignorable. (For a definition of these
// terms see Chapter 3 of The Unicode Standard Core Specification.) In some rare
// cases, a rune can be both cased and case-ignorable. This is encoded by
// cIgnorableCased. A rune of this type is always lower case. Some runes are
// cased while not having a mapping.
//
// A common pattern for scripts in the Unicode standard is for upper and lower
// case runes to alternate for increasing rune values (e.g. the accented Latin
// ranges starting from U+0100 and U+1E00 among others and some Cyrillic
// characters). We use this property by defining a cXORCase mode, where the case
// mode (always upper or lower case) is derived from the rune value. As the XOR
// pattern for case mappings is often identical for successive runes, using
// cXORCase can result in large series of identical trie values. This, in turn,
// allows us to better compress the trie blocks.
74 const ( 75 cUncased info = iota // 000 76 cTitle // 001 77 cLower // 010 78 cUpper // 011 79 cIgnorableUncased // 100 80 cIgnorableCased // 101 // lower case if mappings exist 81 cXORCase // 11x // case is cLower | ((rune&1) ^ x) 82 83 maxCaseMode = cUpper 84 ) 85 86 func (c info) isCased() bool { 87 return c&casedMask != 0 88 } 89 90 func (c info) isCaseIgnorable() bool { 91 return c&ignorableMask == ignorableValue 92 } 93 94 func (c info) isCaseIgnorableAndNonBreakStarter() bool { 95 return c&(fullCasedMask|cccMask) == (ignorableValue | cccZero) 96 } 97 98 func (c info) isNotCasedAndNotCaseIgnorable() bool { 99 return c&fullCasedMask == 0 100 } 101 102 func (c info) isCaseIgnorableAndNotCased() bool { 103 return c&fullCasedMask == cIgnorableUncased 104 } 105 106 // The case mapping implementation will need to know about various Canonical 107 // Combining Class (CCC) values. We encode two of these in the trie value: 108 // cccZero (0) and cccAbove (230). If the value is cccOther, it means that 109 // CCC(r) > 0, but not 230. A value of cccBreak means that CCC(r) == 0 and that 110 // the rune also has the break category Break (see below). 111 const ( 112 cccBreak info = iota << 4 113 cccZero 114 cccAbove 115 cccOther 116 117 cccMask = cccBreak | cccZero | cccAbove | cccOther 118 ) 119 120 const ( 121 starter = 0 122 above = 230 123 iotaSubscript = 240 124 ) 125 126 // The exceptions slice holds data that does not fit in a normal info entry. 127 // The entry is pointed to by the exception index in an entry. 
It has the 128 // following format: 129 // 130 // Header 131 // byte 0: 132 // 7..6 unused 133 // 5..4 CCC type (same bits as entry) 134 // 3 unused 135 // 2..0 length of fold 136 // 137 // byte 1: 138 // 7..6 unused 139 // 5..3 length of 1st mapping of case type 140 // 2..0 length of 2nd mapping of case type 141 // 142 // case 1st 2nd 143 // lower -> upper, title 144 // upper -> lower, title 145 // title -> lower, upper 146 // 147 // Lengths with the value 0x7 indicate no value and implies no change. 148 // A length of 0 indicates a mapping to zero-length string. 149 // 150 // Body bytes: 151 // case folding bytes 152 // lowercase mapping bytes 153 // uppercase mapping bytes 154 // titlecase mapping bytes 155 // closure mapping bytes (for NFKC_Casefold). (TODO) 156 // 157 // Fallbacks: 158 // missing fold -> lower 159 // missing title -> upper 160 // all missing -> original rune 161 // 162 // exceptions starts with a dummy byte to enforce that there is no zero index 163 // value. 164 const ( 165 lengthMask = 0x07 166 lengthBits = 3 167 noChange = 0 168 ) 169 170 // References to generated trie. 171 172 var trie = newCaseTrie(0) 173 174 var sparse = sparseBlocks{ 175 values: sparseValues[:], 176 offsets: sparseOffsets[:], 177 } 178 179 // Sparse block lookup code. 180 181 // valueRange is an entry in a sparse block. 182 type valueRange struct { 183 value uint16 184 lo, hi byte 185 } 186 187 type sparseBlocks struct { 188 values []valueRange 189 offsets []uint16 190 } 191 192 // lookup returns the value from values block n for byte b using binary search. 193 func (s *sparseBlocks) lookup(n uint32, b byte) uint16 { 194 lo := s.offsets[n] 195 hi := s.offsets[n+1] 196 for lo < hi { 197 m := lo + (hi-lo)/2 198 r := s.values[m] 199 if r.lo <= b && b <= r.hi { 200 return r.value 201 } 202 if b < r.lo { 203 hi = m 204 } else { 205 lo = m + 1 206 } 207 } 208 return 0 209 } 210 211 // lastRuneForTesting is the last rune used for testing. 
Everything after this 212 // is boring. 213 const lastRuneForTesting = rune(0x1FFFF)