gitee.com/quant1x/gox@v1.21.2/text/uniseg/grapheme.go (about)

     1  package uniseg
     2  
     3  import "unicode/utf8"
     4  
     5  // The states of the grapheme cluster parser.
     6  const (
     7  	grAny = iota
     8  	grCR
     9  	grControlLF
    10  	grL
    11  	grLVV
    12  	grLVTT
    13  	grPrepend
    14  	grExtendedPictographic
    15  	grExtendedPictographicZWJ
    16  	grRIOdd
    17  	grRIEven
    18  )
    19  
    20  // The grapheme cluster parser's breaking instructions.
    21  const (
    22  	grNoBoundary = iota
    23  	grBoundary
    24  )
    25  
    26  // The grapheme cluster parser's state transitions. Maps (state, property) to
    27  // (new state, breaking instruction, rule number). The breaking instruction
    28  // always refers to the boundary between the last and next code point.
    29  //
    30  // This map is queried as follows:
    31  //
    32  //  1. Find specific state + specific property. Stop if found.
    33  //  2. Find specific state + any property.
    34  //  3. Find any state + specific property.
    35  //  4. If only (2) or (3) (but not both) was found, stop.
    36  //  5. If both (2) and (3) were found, use state and breaking instruction from
    37  //     the transition with the lower rule number, prefer (3) if rule numbers
    38  //     are equal. Stop.
    39  //  6. Assume grAny and grBoundary.
    40  var grTransitions = map[[2]int][3]int{
    41  	// GB5
    42  	{grAny, prCR}:      {grCR, grBoundary, 50},
    43  	{grAny, prLF}:      {grControlLF, grBoundary, 50},
    44  	{grAny, prControl}: {grControlLF, grBoundary, 50},
    45  
    46  	// GB4
    47  	{grCR, prAny}:        {grAny, grBoundary, 40},
    48  	{grControlLF, prAny}: {grAny, grBoundary, 40},
    49  
    50  	// GB3.
    51  	{grCR, prLF}: {grAny, grNoBoundary, 30},
    52  
    53  	// GB6.
    54  	{grAny, prL}: {grL, grBoundary, 9990},
    55  	{grL, prL}:   {grL, grNoBoundary, 60},
    56  	{grL, prV}:   {grLVV, grNoBoundary, 60},
    57  	{grL, prLV}:  {grLVV, grNoBoundary, 60},
    58  	{grL, prLVT}: {grLVTT, grNoBoundary, 60},
    59  
    60  	// GB7.
    61  	{grAny, prLV}: {grLVV, grBoundary, 9990},
    62  	{grAny, prV}:  {grLVV, grBoundary, 9990},
    63  	{grLVV, prV}:  {grLVV, grNoBoundary, 70},
    64  	{grLVV, prT}:  {grLVTT, grNoBoundary, 70},
    65  
    66  	// GB8.
    67  	{grAny, prLVT}: {grLVTT, grBoundary, 9990},
    68  	{grAny, prT}:   {grLVTT, grBoundary, 9990},
    69  	{grLVTT, prT}:  {grLVTT, grNoBoundary, 80},
    70  
    71  	// GB9.
    72  	{grAny, prExtend}: {grAny, grNoBoundary, 90},
    73  	{grAny, prZWJ}:    {grAny, grNoBoundary, 90},
    74  
    75  	// GB9a.
    76  	{grAny, prSpacingMark}: {grAny, grNoBoundary, 91},
    77  
    78  	// GB9b.
    79  	{grAny, prPreprend}: {grPrepend, grBoundary, 9990},
    80  	{grPrepend, prAny}:  {grAny, grNoBoundary, 92},
    81  
    82  	// GB11.
    83  	{grAny, prExtendedPictographic}:                     {grExtendedPictographic, grBoundary, 9990},
    84  	{grExtendedPictographic, prExtend}:                  {grExtendedPictographic, grNoBoundary, 110},
    85  	{grExtendedPictographic, prZWJ}:                     {grExtendedPictographicZWJ, grNoBoundary, 110},
    86  	{grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110},
    87  
    88  	// GB12 / GB13.
    89  	{grAny, prRegionalIndicator}:    {grRIOdd, grBoundary, 9990},
    90  	{grRIOdd, prRegionalIndicator}:  {grRIEven, grNoBoundary, 120},
    91  	{grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120},
    92  }
    93  
    94  // Graphemes implements an iterator over Unicode extended grapheme clusters,
    95  // specified in the Unicode Standard Annex #29. Grapheme clusters correspond to
    96  // "user-perceived characters". These characters often consist of multiple
    97  // code points (e.g. the "woman kissing woman" emoji consists of 8 code points:
    98  // woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ +
    99  // woman) and the rules described in Annex #29 must be applied to group those
   100  // code points into clusters perceived by the user as one character.
   101  type Graphemes struct {
   102  	// The code points over which this class iterates.
   103  	codePoints []rune
   104  
   105  	// The (byte-based) indices of the code points into the original string plus
   106  	// len(original string). Thus, len(indices) = len(codePoints) + 1.
   107  	indices []int
   108  
   109  	// The current grapheme cluster to be returned. These are indices into
   110  	// codePoints/indices. If start == end, we either haven't started iterating
   111  	// yet (0) or the iteration has already completed (1).
   112  	start, end int
   113  
   114  	// The index of the next code point to be parsed.
   115  	pos int
   116  
   117  	// The current state of the code point parser.
   118  	state int
   119  }
   120  
   121  // NewGraphemes returns a new grapheme cluster iterator.
   122  func NewGraphemes(s string) *Graphemes {
   123  	l := utf8.RuneCountInString(s)
   124  	codePoints := make([]rune, l)
   125  	indices := make([]int, l+1)
   126  	i := 0
   127  	for pos, r := range s {
   128  		codePoints[i] = r
   129  		indices[i] = pos
   130  		i++
   131  	}
   132  	indices[l] = len(s)
   133  	g := &Graphemes{
   134  		codePoints: codePoints,
   135  		indices:    indices,
   136  	}
   137  	g.Next() // Parse ahead.
   138  	return g
   139  }
   140  
   141  // Next advances the iterator by one grapheme cluster and returns false if no
   142  // clusters are left. This function must be called before the first cluster is
   143  // accessed.
   144  func (g *Graphemes) Next() bool {
   145  	g.start = g.end
   146  
   147  	// The state transition gives us a boundary instruction BEFORE the next code
   148  	// point so we always need to stay ahead by one code point.
   149  
   150  	// Parse the next code point.
   151  	for g.pos <= len(g.codePoints) {
   152  		// GB2.
   153  		if g.pos == len(g.codePoints) {
   154  			g.end = g.pos
   155  			g.pos++
   156  			break
   157  		}
   158  
   159  		// Determine the property of the next character.
   160  		nextProperty := property(g.codePoints[g.pos])
   161  		g.pos++
   162  
   163  		// Find the applicable transition.
   164  		var boundary bool
   165  		transition, ok := grTransitions[[2]int{g.state, nextProperty}]
   166  		if ok {
   167  			// We have a specific transition. We'll use it.
   168  			g.state = transition[0]
   169  			boundary = transition[1] == grBoundary
   170  		} else {
   171  			// No specific transition found. Try the less specific ones.
   172  			transAnyProp, okAnyProp := grTransitions[[2]int{g.state, prAny}]
   173  			transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}]
   174  			if okAnyProp && okAnyState {
   175  				// Both apply. We'll use a mix (see comments for grTransitions).
   176  				g.state = transAnyState[0]
   177  				boundary = transAnyState[1] == grBoundary
   178  				if transAnyProp[2] < transAnyState[2] {
   179  					g.state = transAnyProp[0]
   180  					boundary = transAnyProp[1] == grBoundary
   181  				}
   182  			} else if okAnyProp {
   183  				// We only have a specific state.
   184  				g.state = transAnyProp[0]
   185  				boundary = transAnyProp[1] == grBoundary
   186  				// This branch will probably never be reached because okAnyState will
   187  				// always be true given the current transition map. But we keep it here
   188  				// for future modifications to the transition map where this may not be
   189  				// true anymore.
   190  			} else if okAnyState {
   191  				// We only have a specific property.
   192  				g.state = transAnyState[0]
   193  				boundary = transAnyState[1] == grBoundary
   194  			} else {
   195  				// No known transition. GB999: Any x Any.
   196  				g.state = grAny
   197  				boundary = true
   198  			}
   199  		}
   200  
   201  		// If we found a cluster boundary, let's stop here. The current cluster will
   202  		// be the one that just ended.
   203  		if g.pos-1 == 0 /* GB1 */ || boundary {
   204  			g.end = g.pos - 1
   205  			break
   206  		}
   207  	}
   208  
   209  	return g.start != g.end
   210  }
   211  
   212  // Runes returns a slice of runes (code points) which corresponds to the current
   213  // grapheme cluster. If the iterator is already past the end or Next() has not
   214  // yet been called, nil is returned.
   215  func (g *Graphemes) Runes() []rune {
   216  	if g.start == g.end {
   217  		return nil
   218  	}
   219  	return g.codePoints[g.start:g.end]
   220  }
   221  
   222  // Str returns a substring of the original string which corresponds to the
   223  // current grapheme cluster. If the iterator is already past the end or Next()
   224  // has not yet been called, an empty string is returned.
   225  func (g *Graphemes) Str() string {
   226  	if g.start == g.end {
   227  		return ""
   228  	}
   229  	return string(g.codePoints[g.start:g.end])
   230  }
   231  
   232  // Bytes returns a byte slice which corresponds to the current grapheme cluster.
   233  // If the iterator is already past the end or Next() has not yet been called,
   234  // nil is returned.
   235  func (g *Graphemes) Bytes() []byte {
   236  	if g.start == g.end {
   237  		return nil
   238  	}
   239  	return []byte(string(g.codePoints[g.start:g.end]))
   240  }
   241  
   242  // Positions returns the interval of the current grapheme cluster as byte
   243  // positions into the original string. The first returned value "from" indexes
   244  // the first byte and the second returned value "to" indexes the first byte that
   245  // is not included anymore, i.e. str[from:to] is the current grapheme cluster of
   246  // the original string "str". If Next() has not yet been called, both values are
   247  // 0. If the iterator is already past the end, both values are 1.
   248  func (g *Graphemes) Positions() (int, int) {
   249  	return g.indices[g.start], g.indices[g.end]
   250  }
   251  
   252  // Reset puts the iterator into its initial state such that the next call to
   253  // Next() sets it to the first grapheme cluster again.
   254  func (g *Graphemes) Reset() {
   255  	g.start, g.end, g.pos, g.state = 0, 0, 0, grAny
   256  	g.Next() // Parse ahead again.
   257  }
   258  
   259  // GraphemeClusterCount returns the number of user-perceived characters
   260  // (grapheme clusters) for the given string. To calculate this number, it
   261  // iterates through the string using the Graphemes iterator.
   262  func GraphemeClusterCount(s string) (n int) {
   263  	g := NewGraphemes(s)
   264  	for g.Next() {
   265  		n++
   266  	}
   267  	return
   268  }