github.com/go-enjin/golang-org-x-text@v0.12.1-enjin.2/internal/colltab/iter.go

github.com/go-enjin/golang-org-x-text@v0.12.1-enjin.2/internal/colltab/iter.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package colltab
     6  
     7  // An Iter incrementally converts chunks of the input text to collation
     8  // elements, while ensuring that the collation elements are in normalized order
     9  // (that is, they are in the order as if the input text were normalized first).
    10  type Iter struct {
    11  	Weighter Weighter
    12  	Elems    []Elem
    13  	// N is the number of elements in Elems that will not be reordered on
    14  	// subsequent iterations, N <= len(Elems).
    15  	N int
    16  
    17  	bytes []byte
    18  	str   string
    19  	// Because the Elems buffer may contain collation elements that are needed
    20  	// for look-ahead, we need two positions in the text (bytes or str): one for
    21  	// the end position in the text for the current iteration and one for the
    22  	// start of the next call to appendNext.
    23  	pEnd  int // end position in text corresponding to N.
    24  	pNext int // pEnd <= pNext.
    25  }
    26  
    27  // Reset sets the position in the current input text to p and discards any
    28  // results obtained so far.
    29  func (i *Iter) Reset(p int) {
    30  	i.Elems = i.Elems[:0]
    31  	i.N = 0
    32  	i.pEnd = p
    33  	i.pNext = p
    34  }
    35  
    36  // Len returns the length of the input text.
    37  func (i *Iter) Len() int {
    38  	if i.bytes != nil {
    39  		return len(i.bytes)
    40  	}
    41  	return len(i.str)
    42  }
    43  
    44  // Discard removes the collation elements up to N.
    45  func (i *Iter) Discard() {
    46  	// TODO: change this such that only modifiers following starters will have
    47  	// to be copied.
    48  	i.Elems = i.Elems[:copy(i.Elems, i.Elems[i.N:])]
    49  	i.N = 0
    50  }
    51  
    52  // End returns the end position of the input text for which Next has returned
    53  // results.
    54  func (i *Iter) End() int {
    55  	return i.pEnd
    56  }
    57  
    58  // SetInput resets i to input s.
    59  func (i *Iter) SetInput(s []byte) {
    60  	i.bytes = s
    61  	i.str = ""
    62  	i.Reset(0)
    63  }
    64  
    65  // SetInputString resets i to input s.
    66  func (i *Iter) SetInputString(s string) {
    67  	i.str = s
    68  	i.bytes = nil
    69  	i.Reset(0)
    70  }
    71  
    72  func (i *Iter) done() bool {
    73  	return i.pNext >= len(i.str) && i.pNext >= len(i.bytes)
    74  }
    75  
    76  func (i *Iter) appendNext() bool {
    77  	if i.done() {
    78  		return false
    79  	}
    80  	var sz int
    81  	if i.bytes == nil {
    82  		i.Elems, sz = i.Weighter.AppendNextString(i.Elems, i.str[i.pNext:])
    83  	} else {
    84  		i.Elems, sz = i.Weighter.AppendNext(i.Elems, i.bytes[i.pNext:])
    85  	}
    86  	if sz == 0 {
    87  		sz = 1
    88  	}
    89  	i.pNext += sz
    90  	return true
    91  }
    92  
    93  // Next appends Elems to the internal array. On each iteration, it will either
    94  // add starters or modifiers. In the majority of cases, an Elem with a primary
    95  // value > 0 will have a CCC of 0. The CCC values of collation elements are also
    96  // used to detect if the input string was not normalized and to adjust the
    97  // result accordingly.
    98  func (i *Iter) Next() bool {
    99  	if i.N == len(i.Elems) && !i.appendNext() {
   100  		return false
   101  	}
   102  
   103  	// Check if the current segment starts with a starter.
   104  	prevCCC := i.Elems[len(i.Elems)-1].CCC()
   105  	if prevCCC == 0 {
   106  		i.N = len(i.Elems)
   107  		i.pEnd = i.pNext
   108  		return true
   109  	} else if i.Elems[i.N].CCC() == 0 {
   110  		// set i.N to only cover part of i.Elems for which prevCCC == 0 and
   111  		// use rest for the next call to next.
   112  		for i.N++; i.N < len(i.Elems) && i.Elems[i.N].CCC() == 0; i.N++ {
   113  		}
   114  		i.pEnd = i.pNext
   115  		return true
   116  	}
   117  
   118  	// The current (partial) segment starts with modifiers. We need to collect
   119  	// all successive modifiers to ensure that they are normalized.
   120  	for {
   121  		p := len(i.Elems)
   122  		i.pEnd = i.pNext
   123  		if !i.appendNext() {
   124  			break
   125  		}
   126  
   127  		if ccc := i.Elems[p].CCC(); ccc == 0 || len(i.Elems)-i.N > maxCombiningCharacters {
   128  			// Leave the starter for the next iteration. This ensures that we
   129  			// do not return sequences of collation elements that cross two
   130  			// segments.
   131  			//
   132  			// TODO: handle large number of combining characters by fully
   133  			// normalizing the input segment before iteration. This ensures
   134  			// results are consistent across the text repo.
   135  			i.N = p
   136  			return true
   137  		} else if ccc < prevCCC {
   138  			i.doNorm(p, ccc) // should be rare, never occurs for NFD and FCC.
   139  		} else {
   140  			prevCCC = ccc
   141  		}
   142  	}
   143  
   144  	done := len(i.Elems) != i.N
   145  	i.N = len(i.Elems)
   146  	return done
   147  }
   148  
   149  // nextNoNorm is the same as next, but does not "normalize" the collation
   150  // elements.
   151  func (i *Iter) nextNoNorm() bool {
   152  	// TODO: remove this function. Using this instead of next does not seem
   153  	// to improve performance in any significant way. We retain this until
   154  	// later for evaluation purposes.
   155  	if i.done() {
   156  		return false
   157  	}
   158  	i.appendNext()
   159  	i.N = len(i.Elems)
   160  	return true
   161  }
   162  
   163  const maxCombiningCharacters = 30
   164  
   165  // doNorm reorders the collation elements in i.Elems.
   166  // It assumes that blocks of collation elements added with appendNext
   167  // either start and end with the same CCC or start with CCC == 0.
   168  // This allows for a single insertion point for the entire block.
   169  // The correctness of this assumption is verified in builder.go.
   170  func (i *Iter) doNorm(p int, ccc uint8) {
   171  	n := len(i.Elems)
   172  	k := p
   173  	for p--; p > i.N && ccc < i.Elems[p-1].CCC(); p-- {
   174  	}
   175  	i.Elems = append(i.Elems, i.Elems[p:k]...)
   176  	copy(i.Elems[p:], i.Elems[k:])
   177  	i.Elems = i.Elems[:n]
   178  }