github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/internal/colltab/iter.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package colltab 6 7 import ( 8 "golang.org/x/text/collate/colltab" 9 ) 10 11 // An Iter incrementally converts chunks of the input text to collation 12 // elements, while ensuring that the collation elements are in normalized order 13 // (that is, they are in the order as if the input text were normalized first). 14 type Iter struct { 15 Weighter colltab.Weighter 16 Elems []colltab.Elem 17 // N is the number of elements in Elems that will not be reordered on 18 // subsequent iterations, N <= len(Elems). 19 N int 20 21 bytes []byte 22 str string 23 // Because the Elems buffer may contain collation elements that are needed 24 // for look-ahead, we need two positions in the text (bytes or str): one for 25 // the end position in the text for the current iteration and one for the 26 // start of the next call to appendNext. 27 pEnd int // end position in text corresponding to N. 28 pNext int // pEnd <= pNext. 29 } 30 31 // Reset sets the position in the current input text to p and discards any 32 // results obtained so far. 33 func (i *Iter) Reset(p int) { 34 i.Elems = i.Elems[:0] 35 i.N = 0 36 i.pEnd = p 37 i.pNext = p 38 } 39 40 // Len returns the length of the input text. 41 func (i *Iter) Len() int { 42 if i.bytes != nil { 43 return len(i.bytes) 44 } 45 return len(i.str) 46 } 47 48 // Discard removes the collation elements up to N. 49 func (i *Iter) Discard() { 50 // TODO: change this such that only modifiers following starters will have 51 // to be copied. 52 i.Elems = i.Elems[:copy(i.Elems, i.Elems[i.N:])] 53 i.N = 0 54 } 55 56 // End returns the end position of the input text for which Next has returned 57 // results. 58 func (i *Iter) End() int { 59 return i.pEnd 60 } 61 62 // SetInput resets i to input s. 63 func (i *Iter) SetInput(s []byte) { 64 i.bytes = s 65 i.str = "" 66 i.Reset(0) 67 } 68 69 // SetInputString resets i to input s. 70 func (i *Iter) SetInputString(s string) { 71 i.str = s 72 i.bytes = nil 73 i.Reset(0) 74 } 75 76 func (i *Iter) done() bool { 77 return i.pNext >= len(i.str) && i.pNext >= len(i.bytes) 78 } 79 80 func (i *Iter) appendNext() bool { 81 if i.done() { 82 return false 83 } 84 var sz int 85 if i.bytes == nil { 86 i.Elems, sz = i.Weighter.AppendNextString(i.Elems, i.str[i.pNext:]) 87 } else { 88 i.Elems, sz = i.Weighter.AppendNext(i.Elems, i.bytes[i.pNext:]) 89 } 90 i.pNext += sz 91 return true 92 } 93 94 // Next appends Elems to the internal array. On each iteration, it will either 95 // add starters or modifiers. In the majority of cases, an Elem with a primary 96 // value > 0 will have a CCC of 0. The CCC values of collation elements are also 97 // used to detect if the input string was not normalized and to adjust the 98 // result accordingly. 99 func (i *Iter) Next() bool { 100 if i.N == len(i.Elems) && !i.appendNext() { 101 return false 102 } 103 104 // Check if the current segment starts with a starter. 105 prevCCC := i.Elems[len(i.Elems)-1].CCC() 106 if prevCCC == 0 { 107 i.N = len(i.Elems) 108 i.pEnd = i.pNext 109 return true 110 } else if i.Elems[i.N].CCC() == 0 { 111 // set i.N to only cover part of i.Elems for which prevCCC == 0 and 112 // use rest for the next call to next. 113 for i.N++; i.N < len(i.Elems) && i.Elems[i.N].CCC() == 0; i.N++ { 114 } 115 i.pEnd = i.pNext 116 return true 117 } 118 119 // The current (partial) segment starts with modifiers. We need to collect 120 // all successive modifiers to ensure that they are normalized. 121 for { 122 p := len(i.Elems) 123 i.pEnd = i.pNext 124 if !i.appendNext() { 125 break 126 } 127 128 if ccc := i.Elems[p].CCC(); ccc == 0 || len(i.Elems)-i.N > maxCombiningCharacters { 129 // Leave the starter for the next iteration. This ensures that we 130 // do not return sequences of collation elements that cross two 131 // segments. 132 // 133 // TODO: handle large number of combining characters by fully 134 // normalizing the input segment before iteration. This ensures 135 // results are consistent across the text repo. 136 i.N = p 137 return true 138 } else if ccc < prevCCC { 139 i.doNorm(p, ccc) // should be rare, never occurs for NFD and FCC. 140 } else { 141 prevCCC = ccc 142 } 143 } 144 145 done := len(i.Elems) != i.N 146 i.N = len(i.Elems) 147 return done 148 } 149 150 // nextNoNorm is the same as next, but does not "normalize" the collation 151 // elements. 152 func (i *Iter) nextNoNorm() bool { 153 // TODO: remove this function. Using this instead of next does not seem 154 // to improve performance in any significant way. We retain this until 155 // later for evaluation purposes. 156 if i.done() { 157 return false 158 } 159 i.appendNext() 160 i.N = len(i.Elems) 161 return true 162 } 163 164 const maxCombiningCharacters = 30 165 166 // doNorm reorders the collation elements in i.Elems. 167 // It assumes that blocks of collation elements added with appendNext 168 // either start and end with the same CCC or start with CCC == 0. 169 // This allows for a single insertion point for the entire block. 170 // The correctness of this assumption is verified in builder.go. 171 func (i *Iter) doNorm(p int, ccc uint8) { 172 n := len(i.Elems) 173 k := p 174 for p--; p > i.N && ccc < i.Elems[p-1].CCC(); p-- { 175 } 176 i.Elems = append(i.Elems, i.Elems[p:k]...) 177 copy(i.Elems[p:], i.Elems[k:]) 178 i.Elems = i.Elems[:n] 179 }