github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/unicode/norm/iter.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package norm 6 7 import ( 8 "fmt" 9 "unicode/utf8" 10 ) 11 12 // MaxSegmentSize is the maximum size of a byte buffer needed to consider any 13 // sequence of starter and non-starter runes for the purpose of normalization. 14 const MaxSegmentSize = maxByteBufferSize 15 16 // An Iter iterates over a string or byte slice, while normalizing it 17 // to a given Form. 18 type Iter struct { 19 rb reorderBuffer 20 buf [maxByteBufferSize]byte 21 info Properties // first character saved from previous iteration 22 next iterFunc // implementation of next depends on form 23 asciiF iterFunc 24 25 p int // current position in input source 26 multiSeg []byte // remainder of multi-segment decomposition 27 } 28 29 type iterFunc func(*Iter) []byte 30 31 // Init initializes i to iterate over src after normalizing it to Form f. 32 func (i *Iter) Init(f Form, src []byte) { 33 i.p = 0 34 if len(src) == 0 { 35 i.setDone() 36 i.rb.nsrc = 0 37 return 38 } 39 i.multiSeg = nil 40 i.rb.init(f, src) 41 i.next = i.rb.f.nextMain 42 i.asciiF = nextASCIIBytes 43 i.info = i.rb.f.info(i.rb.src, i.p) 44 } 45 46 // InitString initializes i to iterate over src after normalizing it to Form f. 47 func (i *Iter) InitString(f Form, src string) { 48 i.p = 0 49 if len(src) == 0 { 50 i.setDone() 51 i.rb.nsrc = 0 52 return 53 } 54 i.multiSeg = nil 55 i.rb.initString(f, src) 56 i.next = i.rb.f.nextMain 57 i.asciiF = nextASCIIString 58 i.info = i.rb.f.info(i.rb.src, i.p) 59 } 60 61 // Seek sets the segment to be returned by the next call to Next to start 62 // at position p. It is the responsibility of the caller to set p to the 63 // start of a UTF8 rune. 64 func (i *Iter) Seek(offset int64, whence int) (int64, error) { 65 var abs int64 66 switch whence { 67 case 0: 68 abs = offset 69 case 1: 70 abs = int64(i.p) + offset 71 case 2: 72 abs = int64(i.rb.nsrc) + offset 73 default: 74 return 0, fmt.Errorf("norm: invalid whence") 75 } 76 if abs < 0 { 77 return 0, fmt.Errorf("norm: negative position") 78 } 79 if int(abs) >= i.rb.nsrc { 80 i.setDone() 81 return int64(i.p), nil 82 } 83 i.p = int(abs) 84 i.multiSeg = nil 85 i.next = i.rb.f.nextMain 86 i.info = i.rb.f.info(i.rb.src, i.p) 87 return abs, nil 88 } 89 90 // returnSlice returns a slice of the underlying input type as a byte slice. 91 // If the underlying is of type []byte, it will simply return a slice. 92 // If the underlying is of type string, it will copy the slice to the buffer 93 // and return that. 94 func (i *Iter) returnSlice(a, b int) []byte { 95 if i.rb.src.bytes == nil { 96 return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] 97 } 98 return i.rb.src.bytes[a:b] 99 } 100 101 // Pos returns the byte position at which the next call to Next will commence processing. 102 func (i *Iter) Pos() int { 103 return i.p 104 } 105 106 func (i *Iter) setDone() { 107 i.next = nextDone 108 i.p = i.rb.nsrc 109 } 110 111 // Done returns true if there is no more input to process. 112 func (i *Iter) Done() bool { 113 return i.p >= i.rb.nsrc 114 } 115 116 // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input. 117 // For any input a and b for which f(a) == f(b), subsequent calls 118 // to Next will return the same segments. 119 // Modifying runes are grouped together with the preceding starter, if such a starter exists. 120 // Although not guaranteed, n will typically be the smallest possible n. 121 func (i *Iter) Next() []byte { 122 return i.next(i) 123 } 124 125 func nextASCIIBytes(i *Iter) []byte { 126 p := i.p + 1 127 if p >= i.rb.nsrc { 128 i.setDone() 129 return i.rb.src.bytes[i.p:p] 130 } 131 if i.rb.src.bytes[p] < utf8.RuneSelf { 132 p0 := i.p 133 i.p = p 134 return i.rb.src.bytes[p0:p] 135 } 136 i.info = i.rb.f.info(i.rb.src, i.p) 137 i.next = i.rb.f.nextMain 138 return i.next(i) 139 } 140 141 func nextASCIIString(i *Iter) []byte { 142 p := i.p + 1 143 if p >= i.rb.nsrc { 144 i.buf[0] = i.rb.src.str[i.p] 145 i.setDone() 146 return i.buf[:1] 147 } 148 if i.rb.src.str[p] < utf8.RuneSelf { 149 i.buf[0] = i.rb.src.str[i.p] 150 i.p = p 151 return i.buf[:1] 152 } 153 i.info = i.rb.f.info(i.rb.src, i.p) 154 i.next = i.rb.f.nextMain 155 return i.next(i) 156 } 157 158 func nextHangul(i *Iter) []byte { 159 p := i.p 160 next := p + hangulUTF8Size 161 if next >= i.rb.nsrc { 162 i.setDone() 163 } else if i.rb.src.hangul(next) == 0 { 164 i.info = i.rb.f.info(i.rb.src, i.p) 165 i.next = i.rb.f.nextMain 166 return i.next(i) 167 } 168 i.p = next 169 return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] 170 } 171 172 func nextDone(i *Iter) []byte { 173 return nil 174 } 175 176 // nextMulti is used for iterating over multi-segment decompositions 177 // for decomposing normal forms. 178 func nextMulti(i *Iter) []byte { 179 j := 0 180 d := i.multiSeg 181 // skip first rune 182 for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { 183 } 184 for j < len(d) { 185 info := i.rb.f.info(input{bytes: d}, j) 186 if info.BoundaryBefore() { 187 i.multiSeg = d[j:] 188 return d[:j] 189 } 190 j += int(info.size) 191 } 192 // treat last segment as normal decomposition 193 i.next = i.rb.f.nextMain 194 return i.next(i) 195 } 196 197 // nextMultiNorm is used for iterating over multi-segment decompositions 198 // for composing normal forms. 199 func nextMultiNorm(i *Iter) []byte { 200 j := 0 201 d := i.multiSeg 202 for j < len(d) { 203 info := i.rb.f.info(input{bytes: d}, j) 204 if info.BoundaryBefore() { 205 i.rb.compose() 206 seg := i.buf[:i.rb.flushCopy(i.buf[:])] 207 i.rb.ss.first(info) 208 i.rb.insertUnsafe(input{bytes: d}, j, info) 209 i.multiSeg = d[j+int(info.size):] 210 return seg 211 } 212 i.rb.ss.next(info) 213 i.rb.insertUnsafe(input{bytes: d}, j, info) 214 j += int(info.size) 215 } 216 i.multiSeg = nil 217 i.next = nextComposed 218 return doNormComposed(i) 219 } 220 221 // nextDecomposed is the implementation of Next for forms NFD and NFKD. 222 func nextDecomposed(i *Iter) (next []byte) { 223 outp := 0 224 inCopyStart, outCopyStart := i.p, 0 225 ss := mkStreamSafe(i.info) 226 for { 227 if sz := int(i.info.size); sz <= 1 { 228 p := i.p 229 i.p++ // ASCII or illegal byte. Either way, advance by 1. 230 if i.p >= i.rb.nsrc { 231 i.setDone() 232 return i.returnSlice(p, i.p) 233 } else if i.rb.src._byte(i.p) < utf8.RuneSelf { 234 i.next = i.asciiF 235 return i.returnSlice(p, i.p) 236 } 237 outp++ 238 } else if d := i.info.Decomposition(); d != nil { 239 // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero. 240 // Case 1: there is a leftover to copy. In this case the decomposition 241 // must begin with a modifier and should always be appended. 242 // Case 2: no leftover. Simply return d if followed by a ccc == 0 value. 243 p := outp + len(d) 244 if outp > 0 { 245 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) 246 if p > len(i.buf) { 247 return i.buf[:outp] 248 } 249 } else if i.info.multiSegment() { 250 // outp must be 0 as multi-segment decompositions always 251 // start a new segment. 252 if i.multiSeg == nil { 253 i.multiSeg = d 254 i.next = nextMulti 255 return nextMulti(i) 256 } 257 // We are in the last segment. Treat as normal decomposition. 258 d = i.multiSeg 259 i.multiSeg = nil 260 p = len(d) 261 } 262 prevCC := i.info.tccc 263 if i.p += sz; i.p >= i.rb.nsrc { 264 i.setDone() 265 i.info = Properties{} // Force BoundaryBefore to succeed. 266 } else { 267 i.info = i.rb.f.info(i.rb.src, i.p) 268 } 269 switch ss.next(i.info) { 270 case ssOverflow: 271 i.next = nextCGJDecompose 272 fallthrough 273 case ssStarter: 274 if outp > 0 { 275 copy(i.buf[outp:], d) 276 return i.buf[:p] 277 } 278 return d 279 } 280 copy(i.buf[outp:], d) 281 outp = p 282 inCopyStart, outCopyStart = i.p, outp 283 if i.info.ccc < prevCC { 284 goto doNorm 285 } 286 continue 287 } else if r := i.rb.src.hangul(i.p); r != 0 { 288 outp = decomposeHangul(i.buf[:], r) 289 i.p += hangulUTF8Size 290 inCopyStart, outCopyStart = i.p, outp 291 if i.p >= i.rb.nsrc { 292 i.setDone() 293 break 294 } else if i.rb.src.hangul(i.p) != 0 { 295 i.next = nextHangul 296 return i.buf[:outp] 297 } 298 } else { 299 p := outp + sz 300 if p > len(i.buf) { 301 break 302 } 303 outp = p 304 i.p += sz 305 } 306 if i.p >= i.rb.nsrc { 307 i.setDone() 308 break 309 } 310 prevCC := i.info.tccc 311 i.info = i.rb.f.info(i.rb.src, i.p) 312 if v := ss.next(i.info); v == ssStarter { 313 break 314 } else if v == ssOverflow { 315 i.next = nextCGJDecompose 316 break 317 } 318 if i.info.ccc < prevCC { 319 goto doNorm 320 } 321 } 322 if outCopyStart == 0 { 323 return i.returnSlice(inCopyStart, i.p) 324 } else if inCopyStart < i.p { 325 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) 326 } 327 return i.buf[:outp] 328 doNorm: 329 // Insert what we have decomposed so far in the reorderBuffer. 330 // As we will only reorder, there will always be enough room. 331 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) 332 i.rb.insertDecomposed(i.buf[0:outp]) 333 return doNormDecomposed(i) 334 } 335 336 func doNormDecomposed(i *Iter) []byte { 337 for { 338 if s := i.rb.ss.next(i.info); s == ssOverflow { 339 i.next = nextCGJDecompose 340 break 341 } 342 i.rb.insertUnsafe(i.rb.src, i.p, i.info) 343 if i.p += int(i.info.size); i.p >= i.rb.nsrc { 344 i.setDone() 345 break 346 } 347 i.info = i.rb.f.info(i.rb.src, i.p) 348 if i.info.ccc == 0 { 349 break 350 } 351 } 352 // new segment or too many combining characters: exit normalization 353 return i.buf[:i.rb.flushCopy(i.buf[:])] 354 } 355 356 func nextCGJDecompose(i *Iter) []byte { 357 i.rb.ss = 0 358 i.rb.insertCGJ() 359 i.next = nextDecomposed 360 buf := doNormDecomposed(i) 361 return buf 362 } 363 364 // nextComposed is the implementation of Next for forms NFC and NFKC. 365 func nextComposed(i *Iter) []byte { 366 outp, startp := 0, i.p 367 var prevCC uint8 368 ss := mkStreamSafe(i.info) 369 for { 370 if !i.info.isYesC() { 371 goto doNorm 372 } 373 prevCC = i.info.tccc 374 sz := int(i.info.size) 375 if sz == 0 { 376 sz = 1 // illegal rune: copy byte-by-byte 377 } 378 p := outp + sz 379 if p > len(i.buf) { 380 break 381 } 382 outp = p 383 i.p += sz 384 if i.p >= i.rb.nsrc { 385 i.setDone() 386 break 387 } else if i.rb.src._byte(i.p) < utf8.RuneSelf { 388 i.next = i.asciiF 389 break 390 } 391 i.info = i.rb.f.info(i.rb.src, i.p) 392 if v := ss.next(i.info); v == ssStarter { 393 break 394 } else if v == ssOverflow { 395 i.next = nextCGJCompose 396 break 397 } 398 if i.info.ccc < prevCC { 399 goto doNorm 400 } 401 } 402 return i.returnSlice(startp, i.p) 403 doNorm: 404 i.p = startp 405 i.info = i.rb.f.info(i.rb.src, i.p) 406 if i.info.multiSegment() { 407 d := i.info.Decomposition() 408 info := i.rb.f.info(input{bytes: d}, 0) 409 i.rb.insertUnsafe(input{bytes: d}, 0, info) 410 i.multiSeg = d[int(info.size):] 411 i.next = nextMultiNorm 412 return nextMultiNorm(i) 413 } 414 i.rb.ss.first(i.info) 415 i.rb.insertUnsafe(i.rb.src, i.p, i.info) 416 return doNormComposed(i) 417 } 418 419 func doNormComposed(i *Iter) []byte { 420 // First rune should already be inserted. 421 for { 422 if i.p += int(i.info.size); i.p >= i.rb.nsrc { 423 i.setDone() 424 break 425 } 426 i.info = i.rb.f.info(i.rb.src, i.p) 427 if s := i.rb.ss.next(i.info); s == ssStarter { 428 break 429 } else if s == ssOverflow { 430 i.next = nextCGJCompose 431 break 432 } 433 i.rb.insertUnsafe(i.rb.src, i.p, i.info) 434 } 435 i.rb.compose() 436 seg := i.buf[:i.rb.flushCopy(i.buf[:])] 437 return seg 438 } 439 440 func nextCGJCompose(i *Iter) []byte { 441 i.rb.ss = 0 // instead of first 442 i.rb.insertCGJ() 443 i.next = nextComposed 444 // Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter, 445 // even if they are not. This is particularly dubious for U+FF9E and UFF9A. 446 // If we ever change that, insert a check here. 447 i.rb.ss.first(i.info) 448 i.rb.insertUnsafe(i.rb.src, i.p, i.info) 449 return doNormComposed(i) 450 }