github.com/mdaxf/iac@v0.0.0-20240519030858-58a061660378/vendor_skip/golang.org/x/text/unicode/norm/iter.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package norm 6 7 import ( 8 "fmt" 9 "unicode/utf8" 10 ) 11 12 // MaxSegmentSize is the maximum size of a byte buffer needed to consider any 13 // sequence of starter and non-starter runes for the purpose of normalization. 14 const MaxSegmentSize = maxByteBufferSize 15 16 // An Iter iterates over a string or byte slice, while normalizing it 17 // to a given Form. 18 type Iter struct { 19 rb reorderBuffer 20 buf [maxByteBufferSize]byte 21 info Properties // first character saved from previous iteration 22 next iterFunc // implementation of next depends on form 23 asciiF iterFunc 24 25 p int // current position in input source 26 multiSeg []byte // remainder of multi-segment decomposition 27 } 28 29 type iterFunc func(*Iter) []byte 30 31 // Init initializes i to iterate over src after normalizing it to Form f. 32 func (i *Iter) Init(f Form, src []byte) { 33 i.p = 0 34 if len(src) == 0 { 35 i.setDone() 36 i.rb.nsrc = 0 37 return 38 } 39 i.multiSeg = nil 40 i.rb.init(f, src) 41 i.next = i.rb.f.nextMain 42 i.asciiF = nextASCIIBytes 43 i.info = i.rb.f.info(i.rb.src, i.p) 44 i.rb.ss.first(i.info) 45 } 46 47 // InitString initializes i to iterate over src after normalizing it to Form f. 48 func (i *Iter) InitString(f Form, src string) { 49 i.p = 0 50 if len(src) == 0 { 51 i.setDone() 52 i.rb.nsrc = 0 53 return 54 } 55 i.multiSeg = nil 56 i.rb.initString(f, src) 57 i.next = i.rb.f.nextMain 58 i.asciiF = nextASCIIString 59 i.info = i.rb.f.info(i.rb.src, i.p) 60 i.rb.ss.first(i.info) 61 } 62 63 // Seek sets the segment to be returned by the next call to Next to start 64 // at position p. It is the responsibility of the caller to set p to the 65 // start of a segment. 
66 func (i *Iter) Seek(offset int64, whence int) (int64, error) { 67 var abs int64 68 switch whence { 69 case 0: 70 abs = offset 71 case 1: 72 abs = int64(i.p) + offset 73 case 2: 74 abs = int64(i.rb.nsrc) + offset 75 default: 76 return 0, fmt.Errorf("norm: invalid whence") 77 } 78 if abs < 0 { 79 return 0, fmt.Errorf("norm: negative position") 80 } 81 if int(abs) >= i.rb.nsrc { 82 i.setDone() 83 return int64(i.p), nil 84 } 85 i.p = int(abs) 86 i.multiSeg = nil 87 i.next = i.rb.f.nextMain 88 i.info = i.rb.f.info(i.rb.src, i.p) 89 i.rb.ss.first(i.info) 90 return abs, nil 91 } 92 93 // returnSlice returns a slice of the underlying input type as a byte slice. 94 // If the underlying is of type []byte, it will simply return a slice. 95 // If the underlying is of type string, it will copy the slice to the buffer 96 // and return that. 97 func (i *Iter) returnSlice(a, b int) []byte { 98 if i.rb.src.bytes == nil { 99 return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] 100 } 101 return i.rb.src.bytes[a:b] 102 } 103 104 // Pos returns the byte position at which the next call to Next will commence processing. 105 func (i *Iter) Pos() int { 106 return i.p 107 } 108 109 func (i *Iter) setDone() { 110 i.next = nextDone 111 i.p = i.rb.nsrc 112 } 113 114 // Done returns true if there is no more input to process. 115 func (i *Iter) Done() bool { 116 return i.p >= i.rb.nsrc 117 } 118 119 // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input. 120 // For any input a and b for which f(a) == f(b), subsequent calls 121 // to Next will return the same segments. 122 // Modifying runes are grouped together with the preceding starter, if such a starter exists. 123 // Although not guaranteed, n will typically be the smallest possible n. 
124 func (i *Iter) Next() []byte { 125 return i.next(i) 126 } 127 128 func nextASCIIBytes(i *Iter) []byte { 129 p := i.p + 1 130 if p >= i.rb.nsrc { 131 p0 := i.p 132 i.setDone() 133 return i.rb.src.bytes[p0:p] 134 } 135 if i.rb.src.bytes[p] < utf8.RuneSelf { 136 p0 := i.p 137 i.p = p 138 return i.rb.src.bytes[p0:p] 139 } 140 i.info = i.rb.f.info(i.rb.src, i.p) 141 i.next = i.rb.f.nextMain 142 return i.next(i) 143 } 144 145 func nextASCIIString(i *Iter) []byte { 146 p := i.p + 1 147 if p >= i.rb.nsrc { 148 i.buf[0] = i.rb.src.str[i.p] 149 i.setDone() 150 return i.buf[:1] 151 } 152 if i.rb.src.str[p] < utf8.RuneSelf { 153 i.buf[0] = i.rb.src.str[i.p] 154 i.p = p 155 return i.buf[:1] 156 } 157 i.info = i.rb.f.info(i.rb.src, i.p) 158 i.next = i.rb.f.nextMain 159 return i.next(i) 160 } 161 162 func nextHangul(i *Iter) []byte { 163 p := i.p 164 next := p + hangulUTF8Size 165 if next >= i.rb.nsrc { 166 i.setDone() 167 } else if i.rb.src.hangul(next) == 0 { 168 i.rb.ss.next(i.info) 169 i.info = i.rb.f.info(i.rb.src, i.p) 170 i.next = i.rb.f.nextMain 171 return i.next(i) 172 } 173 i.p = next 174 return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] 175 } 176 177 func nextDone(i *Iter) []byte { 178 return nil 179 } 180 181 // nextMulti is used for iterating over multi-segment decompositions 182 // for decomposing normal forms. 183 func nextMulti(i *Iter) []byte { 184 j := 0 185 d := i.multiSeg 186 // skip first rune 187 for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { 188 } 189 for j < len(d) { 190 info := i.rb.f.info(input{bytes: d}, j) 191 if info.BoundaryBefore() { 192 i.multiSeg = d[j:] 193 return d[:j] 194 } 195 j += int(info.size) 196 } 197 // treat last segment as normal decomposition 198 i.next = i.rb.f.nextMain 199 return i.next(i) 200 } 201 202 // nextMultiNorm is used for iterating over multi-segment decompositions 203 // for composing normal forms. 
func nextMultiNorm(i *Iter) []byte {
	j := 0
	d := i.multiSeg
	for j < len(d) {
		info := i.rb.f.info(input{bytes: d}, j)
		if info.BoundaryBefore() {
			// A new segment starts at j: compose and flush what has been
			// buffered so far, then seed the buffer with the rune at j so
			// the next call continues from there.
			i.rb.compose()
			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
			i.rb.insertUnsafe(input{bytes: d}, j, info)
			i.multiSeg = d[j+int(info.size):]
			return seg
		}
		i.rb.insertUnsafe(input{bytes: d}, j, info)
		j += int(info.size)
	}
	// The decomposition is exhausted: continue with the regular input.
	i.multiSeg = nil
	i.next = nextComposed
	return doNormComposed(i)
}

// nextDecomposed is the implementation of Next for forms NFD and NFKD.
//
// It accumulates output in i.buf (outp is the number of bytes produced so
// far). Runs of input that need no change are copied lazily: inCopyStart and
// outCopyStart record where the pending uncopied run begins in the input and
// in i.buf respectively. When a rune with a lower ccc than its predecessor's
// tccc is seen, control jumps to doNorm and the segment is finished through
// the reorder buffer.
func nextDecomposed(i *Iter) (next []byte) {
	outp := 0
	inCopyStart, outCopyStart := i.p, 0
	for {
		if sz := int(i.info.size); sz <= 1 {
			i.rb.ss = 0
			p := i.p
			i.p++ // ASCII or illegal byte.  Either way, advance by 1.
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				// Next byte is single-byte too: switch to the ASCII stepping function.
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
			// Case 1: there is a leftover to copy.  In this case the decomposition
			// must begin with a modifier and should always be appended.
			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
			p := outp + len(d)
			if outp > 0 {
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				// TODO: this condition should not be possible, but we leave it
				// in for defensive purposes.
				if p > len(i.buf) {
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// outp must be 0 as multi-segment decompositions always
				// start a new segment.
				if i.multiSeg == nil {
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// We are in the last segment.  Treat as normal decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				i.info = Properties{} // Force BoundaryBefore to succeed.
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			switch i.rb.ss.next(i.info) {
			case ssOverflow:
				i.next = nextCGJDecompose
				fallthrough
			case ssStarter:
				// The segment ends here. Return d directly when nothing
				// precedes it in i.buf; otherwise append it first.
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				return d
			}
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			outp = decomposeHangul(i.buf[:], r)
			i.p += hangulUTF8Size
			inCopyStart, outCopyStart = i.p, outp
			if i.p >= i.rb.nsrc {
				i.setDone()
				break
			} else if i.rb.src.hangul(i.p) != 0 {
				// Another Hangul syllable follows: use the dedicated stepping function.
				i.next = nextHangul
				return i.buf[:outp]
			}
		} else {
			// No decomposition: the rune is passed through unchanged.
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	if outCopyStart == 0 {
		// Nothing was written to i.buf ahead of the pending run, so the
		// input range can be returned as is.
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	i.rb.insertDecomposed(i.buf[0:outp])
	return doNormDecomposed(i)
}

// doNormDecomposed inserts runes from the input, starting at i.p, into the
// reorder buffer until the input ends, a rune with ccc == 0 is reached, or
// the stream-safe state reports overflow. It then flushes the reorder
// buffer into i.buf and returns the flushed bytes.
func doNormDecomposed(i *Iter) []byte {
	for {
		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if i.info.ccc == 0 {
			break
		}
		if s := i.rb.ss.next(i.info); s == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
	}
	// new segment or too many combining characters: exit normalization
	return i.buf[:i.rb.flushCopy(i.buf[:])]
}

// nextCGJDecompose is the stepping function installed after a stream-safe
// overflow in a decomposing form. It resets the stream-safe state, calls
// i.rb.insertCGJ, reinstalls nextDecomposed, and continues normalizing from
// the current rune.
func nextCGJDecompose(i *Iter) []byte {
	i.rb.ss = 0
	i.rb.insertCGJ()
	i.next = nextDecomposed
	i.rb.ss.first(i.info)
	buf := doNormDecomposed(i)
	return buf
}

// nextComposed is the implementation of Next for forms NFC and NFKC.
//
// It first scans forward for as long as runes satisfy isYesC and appear in
// non-decreasing combining-class order, returning the input range unchanged
// in that case. Otherwise it jumps to doNorm, rewinds to the start of the
// segment, and builds the segment through the reorder buffer.
func nextComposed(i *Iter) []byte {
	outp, startp := 0, i.p
	var prevCC uint8
	for {
		if !i.info.isYesC() {
			goto doNorm
		}
		prevCC = i.info.tccc
		sz := int(i.info.size)
		if sz == 0 {
			sz = 1 // illegal rune: copy byte-by-byte
		}
		p := outp + sz
		if p > len(i.buf) {
			break
		}
		outp = p
		i.p += sz
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
			// Next byte is single-byte: switch to the ASCII stepping function.
			i.rb.ss = 0
			i.next = i.asciiF
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJCompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	return i.returnSlice(startp, i.p)
doNorm:
	// reset to start position
	i.p = startp
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
	if i.info.multiSegment() {
		// Seed the reorder buffer with the first rune of the decomposition
		// and let nextMultiNorm walk the remainder.
		d := i.info.Decomposition()
		info := i.rb.f.info(input{bytes: d}, 0)
		i.rb.insertUnsafe(input{bytes: d}, 0, info)
		i.multiSeg = d[int(info.size):]
		i.next = nextMultiNorm
		return nextMultiNorm(i)
	}
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}

// doNormComposed inserts the runes following the current one into the
// reorder buffer until the input ends, the stream-safe state reports a
// starter, or it reports overflow. It then composes the buffer, flushes it
// into i.buf, and returns the flushed bytes.
func doNormComposed(i *Iter) []byte {
	// First rune should already be inserted.
	for {
		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if s := i.rb.ss.next(i.info); s == ssStarter {
			break
		} else if s == ssOverflow {
			i.next = nextCGJCompose
			break
		}
		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	}
	i.rb.compose()
	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
	return seg
}

// nextCGJCompose is the stepping function installed after a stream-safe
// overflow in a composing form. It resets the stream-safe state, calls
// i.rb.insertCGJ, reinstalls nextComposed, inserts the current rune, and
// continues through doNormComposed.
func nextCGJCompose(i *Iter) []byte {
	i.rb.ss = 0 // instead of first
	i.rb.insertCGJ()
	i.next = nextComposed
	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
	// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
	// If we ever change that, insert a check here.
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}