github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/internal/colltab/iter.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package colltab
     6  
     7  import (
     8  	"golang.org/x/text/collate/colltab"
     9  )
    10  
    11  // An Iter incrementally converts chunks of the input text to collation
    12  // elements, while ensuring that the collation elements are in normalized order
    13  // (that is, they are in the order as if the input text were normalized first).
    14  type Iter struct {
    15  	Weighter colltab.Weighter
    16  	Elems    []colltab.Elem
    17  	// N is the number of elements in Elems that will not be reordered on
    18  	// subsequent iterations, N <= len(Elems).
    19  	N int
    20  
    21  	bytes []byte
    22  	str   string
    23  	// Because the Elems buffer may contain collation elements that are needed
    24  	// for look-ahead, we need two positions in the text (bytes or str): one for
    25  	// the end position in the text for the current iteration and one for the
    26  	// start of the next call to appendNext.
    27  	pEnd  int // end position in text corresponding to N.
    28  	pNext int // pEnd <= pNext.
    29  }
    30  
    31  // Reset sets the position in the current input text to p and discards any
    32  // results obtained so far.
    33  func (i *Iter) Reset(p int) {
    34  	i.Elems = i.Elems[:0]
    35  	i.N = 0
    36  	i.pEnd = p
    37  	i.pNext = p
    38  }
    39  
    40  // Len returns the length of the input text.
    41  func (i *Iter) Len() int {
    42  	if i.bytes != nil {
    43  		return len(i.bytes)
    44  	}
    45  	return len(i.str)
    46  }
    47  
    48  // Discard removes the collation elements up to N.
    49  func (i *Iter) Discard() {
    50  	// TODO: change this such that only modifiers following starters will have
    51  	// to be copied.
    52  	i.Elems = i.Elems[:copy(i.Elems, i.Elems[i.N:])]
    53  	i.N = 0
    54  }
    55  
// End returns the end position of the input text for which Next has returned
// results. This is the value of pEnd: Reset sets it to the reset position and
// Next sets it to the read position (pNext) as segments are completed.
func (i *Iter) End() int {
	return i.pEnd
}
    61  
    62  // SetInput resets i to input s.
    63  func (i *Iter) SetInput(s []byte) {
    64  	i.bytes = s
    65  	i.str = ""
    66  	i.Reset(0)
    67  }
    68  
    69  // SetInputString resets i to input s.
    70  func (i *Iter) SetInputString(s string) {
    71  	i.str = s
    72  	i.bytes = nil
    73  	i.Reset(0)
    74  }
    75  
    76  func (i *Iter) done() bool {
    77  	return i.pNext >= len(i.str) && i.pNext >= len(i.bytes)
    78  }
    79  
    80  func (i *Iter) appendNext() bool {
    81  	if i.done() {
    82  		return false
    83  	}
    84  	var sz int
    85  	if i.bytes == nil {
    86  		i.Elems, sz = i.Weighter.AppendNextString(i.Elems, i.str[i.pNext:])
    87  	} else {
    88  		i.Elems, sz = i.Weighter.AppendNext(i.Elems, i.bytes[i.pNext:])
    89  	}
    90  	i.pNext += sz
    91  	return true
    92  }
    93  
    94  // Next appends Elems to the internal array. On each iteration, it will either
    95  // add starters or modifiers. In the majority of cases, an Elem with a primary
    96  // value > 0 will have a CCC of 0. The CCC values of collation elements are also
    97  // used to detect if the input string was not normalized and to adjust the
    98  // result accordingly.
    99  func (i *Iter) Next() bool {
   100  	if i.N == len(i.Elems) && !i.appendNext() {
   101  		return false
   102  	}
   103  
   104  	// Check if the current segment starts with a starter.
   105  	prevCCC := i.Elems[len(i.Elems)-1].CCC()
   106  	if prevCCC == 0 {
   107  		i.N = len(i.Elems)
   108  		i.pEnd = i.pNext
   109  		return true
   110  	} else if i.Elems[i.N].CCC() == 0 {
   111  		// set i.N to only cover part of i.Elems for which prevCCC == 0 and
   112  		// use rest for the next call to next.
   113  		for i.N++; i.N < len(i.Elems) && i.Elems[i.N].CCC() == 0; i.N++ {
   114  		}
   115  		i.pEnd = i.pNext
   116  		return true
   117  	}
   118  
   119  	// The current (partial) segment starts with modifiers. We need to collect
   120  	// all successive modifiers to ensure that they are normalized.
   121  	for {
   122  		p := len(i.Elems)
   123  		i.pEnd = i.pNext
   124  		if !i.appendNext() {
   125  			break
   126  		}
   127  
   128  		if ccc := i.Elems[p].CCC(); ccc == 0 || len(i.Elems)-i.N > maxCombiningCharacters {
   129  			// Leave the starter for the next iteration. This ensures that we
   130  			// do not return sequences of collation elements that cross two
   131  			// segments.
   132  			//
   133  			// TODO: handle large number of combining characters by fully
   134  			// normalizing the input segment before iteration. This ensures
   135  			// results are consistent across the text repo.
   136  			i.N = p
   137  			return true
   138  		} else if ccc < prevCCC {
   139  			i.doNorm(p, ccc) // should be rare, never occurs for NFD and FCC.
   140  		} else {
   141  			prevCCC = ccc
   142  		}
   143  	}
   144  
   145  	done := len(i.Elems) != i.N
   146  	i.N = len(i.Elems)
   147  	return done
   148  }
   149  
   150  // nextNoNorm is the same as next, but does not "normalize" the collation
   151  // elements.
   152  func (i *Iter) nextNoNorm() bool {
   153  	// TODO: remove this function. Using this instead of next does not seem
   154  	// to improve performance in any significant way. We retain this until
   155  	// later for evaluation purposes.
   156  	if i.done() {
   157  		return false
   158  	}
   159  	i.appendNext()
   160  	i.N = len(i.Elems)
   161  	return true
   162  }
   163  
// maxCombiningCharacters bounds how many collation elements Next accumulates
// beyond N while collecting a run of modifiers; once len(Elems)-N exceeds it,
// Next ends the segment instead of continuing to look for a starter.
const maxCombiningCharacters = 30
   165  
   166  // doNorm reorders the collation elements in i.Elems.
   167  // It assumes that blocks of collation elements added with appendNext
   168  // either start and end with the same CCC or start with CCC == 0.
   169  // This allows for a single insertion point for the entire block.
   170  // The correctness of this assumption is verified in builder.go.
   171  func (i *Iter) doNorm(p int, ccc uint8) {
   172  	n := len(i.Elems)
   173  	k := p
   174  	for p--; p > i.N && ccc < i.Elems[p-1].CCC(); p-- {
   175  	}
   176  	i.Elems = append(i.Elems, i.Elems[p:k]...)
   177  	copy(i.Elems[p:], i.Elems[k:])
   178  	i.Elems = i.Elems[:n]
   179  }