gitee.com/quant1x/gox@v1.21.2/text/uniseg/grapheme.go (about) 1 package uniseg 2 3 import "unicode/utf8" 4 5 // The states of the grapheme cluster parser. 6 const ( 7 grAny = iota 8 grCR 9 grControlLF 10 grL 11 grLVV 12 grLVTT 13 grPrepend 14 grExtendedPictographic 15 grExtendedPictographicZWJ 16 grRIOdd 17 grRIEven 18 ) 19 20 // The grapheme cluster parser's breaking instructions. 21 const ( 22 grNoBoundary = iota 23 grBoundary 24 ) 25 26 // The grapheme cluster parser's state transitions. Maps (state, property) to 27 // (new state, breaking instruction, rule number). The breaking instruction 28 // always refers to the boundary between the last and next code point. 29 // 30 // This map is queried as follows: 31 // 32 // 1. Find specific state + specific property. Stop if found. 33 // 2. Find specific state + any property. 34 // 3. Find any state + specific property. 35 // 4. If only (2) or (3) (but not both) was found, stop. 36 // 5. If both (2) and (3) were found, use state and breaking instruction from 37 // the transition with the lower rule number, prefer (3) if rule numbers 38 // are equal. Stop. 39 // 6. Assume grAny and grBoundary. 40 var grTransitions = map[[2]int][3]int{ 41 // GB5 42 {grAny, prCR}: {grCR, grBoundary, 50}, 43 {grAny, prLF}: {grControlLF, grBoundary, 50}, 44 {grAny, prControl}: {grControlLF, grBoundary, 50}, 45 46 // GB4 47 {grCR, prAny}: {grAny, grBoundary, 40}, 48 {grControlLF, prAny}: {grAny, grBoundary, 40}, 49 50 // GB3. 51 {grCR, prLF}: {grAny, grNoBoundary, 30}, 52 53 // GB6. 54 {grAny, prL}: {grL, grBoundary, 9990}, 55 {grL, prL}: {grL, grNoBoundary, 60}, 56 {grL, prV}: {grLVV, grNoBoundary, 60}, 57 {grL, prLV}: {grLVV, grNoBoundary, 60}, 58 {grL, prLVT}: {grLVTT, grNoBoundary, 60}, 59 60 // GB7. 61 {grAny, prLV}: {grLVV, grBoundary, 9990}, 62 {grAny, prV}: {grLVV, grBoundary, 9990}, 63 {grLVV, prV}: {grLVV, grNoBoundary, 70}, 64 {grLVV, prT}: {grLVTT, grNoBoundary, 70}, 65 66 // GB8. 67 {grAny, prLVT}: {grLVTT, grBoundary, 9990}, 68 {grAny, prT}: {grLVTT, grBoundary, 9990}, 69 {grLVTT, prT}: {grLVTT, grNoBoundary, 80}, 70 71 // GB9. 72 {grAny, prExtend}: {grAny, grNoBoundary, 90}, 73 {grAny, prZWJ}: {grAny, grNoBoundary, 90}, 74 75 // GB9a. 76 {grAny, prSpacingMark}: {grAny, grNoBoundary, 91}, 77 78 // GB9b. 79 {grAny, prPreprend}: {grPrepend, grBoundary, 9990}, 80 {grPrepend, prAny}: {grAny, grNoBoundary, 92}, 81 82 // GB11. 83 {grAny, prExtendedPictographic}: {grExtendedPictographic, grBoundary, 9990}, 84 {grExtendedPictographic, prExtend}: {grExtendedPictographic, grNoBoundary, 110}, 85 {grExtendedPictographic, prZWJ}: {grExtendedPictographicZWJ, grNoBoundary, 110}, 86 {grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110}, 87 88 // GB12 / GB13. 89 {grAny, prRegionalIndicator}: {grRIOdd, grBoundary, 9990}, 90 {grRIOdd, prRegionalIndicator}: {grRIEven, grNoBoundary, 120}, 91 {grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120}, 92 } 93 94 // Graphemes implements an iterator over Unicode extended grapheme clusters, 95 // specified in the Unicode Standard Annex #29. Grapheme clusters correspond to 96 // "user-perceived characters". These characters often consist of multiple 97 // code points (e.g. the "woman kissing woman" emoji consists of 8 code points: 98 // woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ + 99 // woman) and the rules described in Annex #29 must be applied to group those 100 // code points into clusters perceived by the user as one character. 101 type Graphemes struct { 102 // The code points over which this class iterates. 103 codePoints []rune 104 105 // The (byte-based) indices of the code points into the original string plus 106 // len(original string). Thus, len(indices) = len(codePoints) + 1. 107 indices []int 108 109 // The current grapheme cluster to be returned. These are indices into 110 // codePoints/indices. If start == end, we either haven't started iterating 111 // yet (0) or the iteration has already completed (1). 112 start, end int 113 114 // The index of the next code point to be parsed. 115 pos int 116 117 // The current state of the code point parser. 118 state int 119 } 120 121 // NewGraphemes returns a new grapheme cluster iterator. 122 func NewGraphemes(s string) *Graphemes { 123 l := utf8.RuneCountInString(s) 124 codePoints := make([]rune, l) 125 indices := make([]int, l+1) 126 i := 0 127 for pos, r := range s { 128 codePoints[i] = r 129 indices[i] = pos 130 i++ 131 } 132 indices[l] = len(s) 133 g := &Graphemes{ 134 codePoints: codePoints, 135 indices: indices, 136 } 137 g.Next() // Parse ahead. 138 return g 139 } 140 141 // Next advances the iterator by one grapheme cluster and returns false if no 142 // clusters are left. This function must be called before the first cluster is 143 // accessed. 144 func (g *Graphemes) Next() bool { 145 g.start = g.end 146 147 // The state transition gives us a boundary instruction BEFORE the next code 148 // point so we always need to stay ahead by one code point. 149 150 // Parse the next code point. 151 for g.pos <= len(g.codePoints) { 152 // GB2. 153 if g.pos == len(g.codePoints) { 154 g.end = g.pos 155 g.pos++ 156 break 157 } 158 159 // Determine the property of the next character. 160 nextProperty := property(g.codePoints[g.pos]) 161 g.pos++ 162 163 // Find the applicable transition. 164 var boundary bool 165 transition, ok := grTransitions[[2]int{g.state, nextProperty}] 166 if ok { 167 // We have a specific transition. We'll use it. 168 g.state = transition[0] 169 boundary = transition[1] == grBoundary 170 } else { 171 // No specific transition found. Try the less specific ones. 172 transAnyProp, okAnyProp := grTransitions[[2]int{g.state, prAny}] 173 transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}] 174 if okAnyProp && okAnyState { 175 // Both apply. We'll use a mix (see comments for grTransitions). 176 g.state = transAnyState[0] 177 boundary = transAnyState[1] == grBoundary 178 if transAnyProp[2] < transAnyState[2] { 179 g.state = transAnyProp[0] 180 boundary = transAnyProp[1] == grBoundary 181 } 182 } else if okAnyProp { 183 // We only have a specific state. 184 g.state = transAnyProp[0] 185 boundary = transAnyProp[1] == grBoundary 186 // This branch will probably never be reached because okAnyState will 187 // always be true given the current transition map. But we keep it here 188 // for future modifications to the transition map where this may not be 189 // true anymore. 190 } else if okAnyState { 191 // We only have a specific property. 192 g.state = transAnyState[0] 193 boundary = transAnyState[1] == grBoundary 194 } else { 195 // No known transition. GB999: Any x Any. 196 g.state = grAny 197 boundary = true 198 } 199 } 200 201 // If we found a cluster boundary, let's stop here. The current cluster will 202 // be the one that just ended. 203 if g.pos-1 == 0 /* GB1 */ || boundary { 204 g.end = g.pos - 1 205 break 206 } 207 } 208 209 return g.start != g.end 210 } 211 212 // Runes returns a slice of runes (code points) which corresponds to the current 213 // grapheme cluster. If the iterator is already past the end or Next() has not 214 // yet been called, nil is returned. 215 func (g *Graphemes) Runes() []rune { 216 if g.start == g.end { 217 return nil 218 } 219 return g.codePoints[g.start:g.end] 220 } 221 222 // Str returns a substring of the original string which corresponds to the 223 // current grapheme cluster. If the iterator is already past the end or Next() 224 // has not yet been called, an empty string is returned. 225 func (g *Graphemes) Str() string { 226 if g.start == g.end { 227 return "" 228 } 229 return string(g.codePoints[g.start:g.end]) 230 } 231 232 // Bytes returns a byte slice which corresponds to the current grapheme cluster. 233 // If the iterator is already past the end or Next() has not yet been called, 234 // nil is returned. 235 func (g *Graphemes) Bytes() []byte { 236 if g.start == g.end { 237 return nil 238 } 239 return []byte(string(g.codePoints[g.start:g.end])) 240 } 241 242 // Positions returns the interval of the current grapheme cluster as byte 243 // positions into the original string. The first returned value "from" indexes 244 // the first byte and the second returned value "to" indexes the first byte that 245 // is not included anymore, i.e. str[from:to] is the current grapheme cluster of 246 // the original string "str". If Next() has not yet been called, both values are 247 // 0. If the iterator is already past the end, both values are 1. 248 func (g *Graphemes) Positions() (int, int) { 249 return g.indices[g.start], g.indices[g.end] 250 } 251 252 // Reset puts the iterator into its initial state such that the next call to 253 // Next() sets it to the first grapheme cluster again. 254 func (g *Graphemes) Reset() { 255 g.start, g.end, g.pos, g.state = 0, 0, 0, grAny 256 g.Next() // Parse ahead again. 257 } 258 259 // GraphemeClusterCount returns the number of user-perceived characters 260 // (grapheme clusters) for the given string. To calculate this number, it 261 // iterates through the string using the Graphemes iterator. 262 func GraphemeClusterCount(s string) (n int) { 263 g := NewGraphemes(s) 264 for g.Next() { 265 n++ 266 } 267 return 268 }