github.com/icodeface/tls@v0.0.0-20230910023335-34df9250cd12/internal/x/text/unicode/norm/normalize.go (about) 1 // Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT. 2 3 // Copyright 2011 The Go Authors. All rights reserved. 4 // Use of this source code is governed by a BSD-style 5 // license that can be found in the LICENSE file. 6 7 // Note: the file data_test.go that is generated should not be checked in. 8 9 // Package norm contains types and functions for normalizing Unicode strings. 10 package norm 11 12 import ( 13 "unicode/utf8" 14 15 "github.com/icodeface/tls/internal/x/text/transform" 16 ) 17 18 // A Form denotes a canonical representation of Unicode code points. 19 // The Unicode-defined normalization and equivalence forms are: 20 // 21 // NFC Unicode Normalization Form C 22 // NFD Unicode Normalization Form D 23 // NFKC Unicode Normalization Form KC 24 // NFKD Unicode Normalization Form KD 25 // 26 // For a Form f, this documentation uses the notation f(x) to mean 27 // the bytes or string x converted to the given form. 28 // A position n in x is called a boundary if conversion to the form can 29 // proceed independently on both sides: 30 // f(x) == append(f(x[0:n]), f(x[n:])...) 31 // 32 // References: http://unicode.org/reports/tr15/ and 33 // http://unicode.org/notes/tn5/. 34 type Form int 35 36 const ( 37 NFC Form = iota 38 NFD 39 NFKC 40 NFKD 41 ) 42 43 // Bytes returns f(b). May return b if f(b) = b. 44 func (f Form) Bytes(b []byte) []byte { 45 src := inputBytes(b) 46 ft := formTable[f] 47 n, ok := ft.quickSpan(src, 0, len(b), true) 48 if ok { 49 return b 50 } 51 out := make([]byte, n, len(b)) 52 copy(out, b[0:n]) 53 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush} 54 return doAppendInner(&rb, n) 55 } 56 57 // String returns f(s). 58 func (f Form) String(s string) string { 59 src := inputString(s) 60 ft := formTable[f] 61 n, ok := ft.quickSpan(src, 0, len(s), true) 62 if ok { 63 return s 64 } 65 out := make([]byte, n, len(s)) 66 copy(out, s[0:n]) 67 rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush} 68 return string(doAppendInner(&rb, n)) 69 } 70 71 // IsNormal returns true if b == f(b). 72 func (f Form) IsNormal(b []byte) bool { 73 src := inputBytes(b) 74 ft := formTable[f] 75 bp, ok := ft.quickSpan(src, 0, len(b), true) 76 if ok { 77 return true 78 } 79 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)} 80 rb.setFlusher(nil, cmpNormalBytes) 81 for bp < len(b) { 82 rb.out = b[bp:] 83 if bp = decomposeSegment(&rb, bp, true); bp < 0 { 84 return false 85 } 86 bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true) 87 } 88 return true 89 } 90 91 func cmpNormalBytes(rb *reorderBuffer) bool { 92 b := rb.out 93 for i := 0; i < rb.nrune; i++ { 94 info := rb.rune[i] 95 if int(info.size) > len(b) { 96 return false 97 } 98 p := info.pos 99 pe := p + info.size 100 for ; p < pe; p++ { 101 if b[0] != rb.byte[p] { 102 return false 103 } 104 b = b[1:] 105 } 106 } 107 return true 108 } 109 110 // IsNormalString returns true if s == f(s). 111 func (f Form) IsNormalString(s string) bool { 112 src := inputString(s) 113 ft := formTable[f] 114 bp, ok := ft.quickSpan(src, 0, len(s), true) 115 if ok { 116 return true 117 } 118 rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)} 119 rb.setFlusher(nil, func(rb *reorderBuffer) bool { 120 for i := 0; i < rb.nrune; i++ { 121 info := rb.rune[i] 122 if bp+int(info.size) > len(s) { 123 return false 124 } 125 p := info.pos 126 pe := p + info.size 127 for ; p < pe; p++ { 128 if s[bp] != rb.byte[p] { 129 return false 130 } 131 bp++ 132 } 133 } 134 return true 135 }) 136 for bp < len(s) { 137 if bp = decomposeSegment(&rb, bp, true); bp < 0 { 138 return false 139 } 140 bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true) 141 } 142 return true 143 } 144 145 // patchTail fixes a case where a rune may be incorrectly normalized 146 // if it is followed by illegal continuation bytes. It returns the 147 // patched buffer and whether the decomposition is still in progress. 148 func patchTail(rb *reorderBuffer) bool { 149 info, p := lastRuneStart(&rb.f, rb.out) 150 if p == -1 || info.size == 0 { 151 return true 152 } 153 end := p + int(info.size) 154 extra := len(rb.out) - end 155 if extra > 0 { 156 // Potentially allocating memory. However, this only 157 // happens with ill-formed UTF-8. 158 x := make([]byte, 0) 159 x = append(x, rb.out[len(rb.out)-extra:]...) 160 rb.out = rb.out[:end] 161 decomposeToLastBoundary(rb) 162 rb.doFlush() 163 rb.out = append(rb.out, x...) 164 return false 165 } 166 buf := rb.out[p:] 167 rb.out = rb.out[:p] 168 decomposeToLastBoundary(rb) 169 if s := rb.ss.next(info); s == ssStarter { 170 rb.doFlush() 171 rb.ss.first(info) 172 } else if s == ssOverflow { 173 rb.doFlush() 174 rb.insertCGJ() 175 rb.ss = 0 176 } 177 rb.insertUnsafe(inputBytes(buf), 0, info) 178 return true 179 } 180 181 func appendQuick(rb *reorderBuffer, i int) int { 182 if rb.nsrc == i { 183 return i 184 } 185 end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true) 186 rb.out = rb.src.appendSlice(rb.out, i, end) 187 return end 188 } 189 190 // Append returns f(append(out, b...)). 191 // The buffer out must be nil, empty, or equal to f(out). 192 func (f Form) Append(out []byte, src ...byte) []byte { 193 return f.doAppend(out, inputBytes(src), len(src)) 194 } 195 196 func (f Form) doAppend(out []byte, src input, n int) []byte { 197 if n == 0 { 198 return out 199 } 200 ft := formTable[f] 201 // Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer. 202 if len(out) == 0 { 203 p, _ := ft.quickSpan(src, 0, n, true) 204 out = src.appendSlice(out, 0, p) 205 if p == n { 206 return out 207 } 208 rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush} 209 return doAppendInner(&rb, p) 210 } 211 rb := reorderBuffer{f: *ft, src: src, nsrc: n} 212 return doAppend(&rb, out, 0) 213 } 214 215 func doAppend(rb *reorderBuffer, out []byte, p int) []byte { 216 rb.setFlusher(out, appendFlush) 217 src, n := rb.src, rb.nsrc 218 doMerge := len(out) > 0 219 if q := src.skipContinuationBytes(p); q > p { 220 // Move leading non-starters to destination. 221 rb.out = src.appendSlice(rb.out, p, q) 222 p = q 223 doMerge = patchTail(rb) 224 } 225 fd := &rb.f 226 if doMerge { 227 var info Properties 228 if p < n { 229 info = fd.info(src, p) 230 if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 { 231 if p == 0 { 232 decomposeToLastBoundary(rb) 233 } 234 p = decomposeSegment(rb, p, true) 235 } 236 } 237 if info.size == 0 { 238 rb.doFlush() 239 // Append incomplete UTF-8 encoding. 240 return src.appendSlice(rb.out, p, n) 241 } 242 if rb.nrune > 0 { 243 return doAppendInner(rb, p) 244 } 245 } 246 p = appendQuick(rb, p) 247 return doAppendInner(rb, p) 248 } 249 250 func doAppendInner(rb *reorderBuffer, p int) []byte { 251 for n := rb.nsrc; p < n; { 252 p = decomposeSegment(rb, p, true) 253 p = appendQuick(rb, p) 254 } 255 return rb.out 256 } 257 258 // AppendString returns f(append(out, []byte(s))). 259 // The buffer out must be nil, empty, or equal to f(out). 260 func (f Form) AppendString(out []byte, src string) []byte { 261 return f.doAppend(out, inputString(src), len(src)) 262 } 263 264 // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]). 265 // It is not guaranteed to return the largest such n. 266 func (f Form) QuickSpan(b []byte) int { 267 n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true) 268 return n 269 } 270 271 // Span implements transform.SpanningTransformer. It returns a boundary n such 272 // that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n. 273 func (f Form) Span(b []byte, atEOF bool) (n int, err error) { 274 n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF) 275 if n < len(b) { 276 if !ok { 277 err = transform.ErrEndOfSpan 278 } else { 279 err = transform.ErrShortSrc 280 } 281 } 282 return n, err 283 } 284 285 // SpanString returns a boundary n such that s[0:n] == f(s[0:n]). 286 // It is not guaranteed to return the largest such n. 287 func (f Form) SpanString(s string, atEOF bool) (n int, err error) { 288 n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF) 289 if n < len(s) { 290 if !ok { 291 err = transform.ErrEndOfSpan 292 } else { 293 err = transform.ErrShortSrc 294 } 295 } 296 return n, err 297 } 298 299 // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and 300 // whether any non-normalized parts were found. If atEOF is false, n will 301 // not point past the last segment if this segment might be become 302 // non-normalized by appending other runes. 303 func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) { 304 var lastCC uint8 305 ss := streamSafe(0) 306 lastSegStart := i 307 for n = end; i < n; { 308 if j := src.skipASCII(i, n); i != j { 309 i = j 310 lastSegStart = i - 1 311 lastCC = 0 312 ss = 0 313 continue 314 } 315 info := f.info(src, i) 316 if info.size == 0 { 317 if atEOF { 318 // include incomplete runes 319 return n, true 320 } 321 return lastSegStart, true 322 } 323 // This block needs to be before the next, because it is possible to 324 // have an overflow for runes that are starters (e.g. with U+FF9E). 325 switch ss.next(info) { 326 case ssStarter: 327 lastSegStart = i 328 case ssOverflow: 329 return lastSegStart, false 330 case ssSuccess: 331 if lastCC > info.ccc { 332 return lastSegStart, false 333 } 334 } 335 if f.composing { 336 if !info.isYesC() { 337 break 338 } 339 } else { 340 if !info.isYesD() { 341 break 342 } 343 } 344 lastCC = info.ccc 345 i += int(info.size) 346 } 347 if i == n { 348 if !atEOF { 349 n = lastSegStart 350 } 351 return n, true 352 } 353 return lastSegStart, false 354 } 355 356 // QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]). 357 // It is not guaranteed to return the largest such n. 358 func (f Form) QuickSpanString(s string) int { 359 n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true) 360 return n 361 } 362 363 // FirstBoundary returns the position i of the first boundary in b 364 // or -1 if b contains no boundary. 365 func (f Form) FirstBoundary(b []byte) int { 366 return f.firstBoundary(inputBytes(b), len(b)) 367 } 368 369 func (f Form) firstBoundary(src input, nsrc int) int { 370 i := src.skipContinuationBytes(0) 371 if i >= nsrc { 372 return -1 373 } 374 fd := formTable[f] 375 ss := streamSafe(0) 376 // We should call ss.first here, but we can't as the first rune is 377 // skipped already. This means FirstBoundary can't really determine 378 // CGJ insertion points correctly. Luckily it doesn't have to. 379 for { 380 info := fd.info(src, i) 381 if info.size == 0 { 382 return -1 383 } 384 if s := ss.next(info); s != ssSuccess { 385 return i 386 } 387 i += int(info.size) 388 if i >= nsrc { 389 if !info.BoundaryAfter() && !ss.isMax() { 390 return -1 391 } 392 return nsrc 393 } 394 } 395 } 396 397 // FirstBoundaryInString returns the position i of the first boundary in s 398 // or -1 if s contains no boundary. 399 func (f Form) FirstBoundaryInString(s string) int { 400 return f.firstBoundary(inputString(s), len(s)) 401 } 402 403 // NextBoundary reports the index of the boundary between the first and next 404 // segment in b or -1 if atEOF is false and there are not enough bytes to 405 // determine this boundary. 406 func (f Form) NextBoundary(b []byte, atEOF bool) int { 407 return f.nextBoundary(inputBytes(b), len(b), atEOF) 408 } 409 410 // NextBoundaryInString reports the index of the boundary between the first and 411 // next segment in b or -1 if atEOF is false and there are not enough bytes to 412 // determine this boundary. 413 func (f Form) NextBoundaryInString(s string, atEOF bool) int { 414 return f.nextBoundary(inputString(s), len(s), atEOF) 415 } 416 417 func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int { 418 if nsrc == 0 { 419 if atEOF { 420 return 0 421 } 422 return -1 423 } 424 fd := formTable[f] 425 info := fd.info(src, 0) 426 if info.size == 0 { 427 if atEOF { 428 return 1 429 } 430 return -1 431 } 432 ss := streamSafe(0) 433 ss.first(info) 434 435 for i := int(info.size); i < nsrc; i += int(info.size) { 436 info = fd.info(src, i) 437 if info.size == 0 { 438 if atEOF { 439 return i 440 } 441 return -1 442 } 443 // TODO: Using streamSafe to determine the boundary isn't the same as 444 // using BoundaryBefore. Determine which should be used. 445 if s := ss.next(info); s != ssSuccess { 446 return i 447 } 448 } 449 if !atEOF && !info.BoundaryAfter() && !ss.isMax() { 450 return -1 451 } 452 return nsrc 453 } 454 455 // LastBoundary returns the position i of the last boundary in b 456 // or -1 if b contains no boundary. 457 func (f Form) LastBoundary(b []byte) int { 458 return lastBoundary(formTable[f], b) 459 } 460 461 func lastBoundary(fd *formInfo, b []byte) int { 462 i := len(b) 463 info, p := lastRuneStart(fd, b) 464 if p == -1 { 465 return -1 466 } 467 if info.size == 0 { // ends with incomplete rune 468 if p == 0 { // starts with incomplete rune 469 return -1 470 } 471 i = p 472 info, p = lastRuneStart(fd, b[:i]) 473 if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter 474 return i 475 } 476 } 477 if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8 478 return i 479 } 480 if info.BoundaryAfter() { 481 return i 482 } 483 ss := streamSafe(0) 484 v := ss.backwards(info) 485 for i = p; i >= 0 && v != ssStarter; i = p { 486 info, p = lastRuneStart(fd, b[:i]) 487 if v = ss.backwards(info); v == ssOverflow { 488 break 489 } 490 if p+int(info.size) != i { 491 if p == -1 { // no boundary found 492 return -1 493 } 494 return i // boundary after an illegal UTF-8 encoding 495 } 496 } 497 return i 498 } 499 500 // decomposeSegment scans the first segment in src into rb. It inserts 0x034f 501 // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters 502 // and returns the number of bytes consumed from src or iShortDst or iShortSrc. 503 func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int { 504 // Force one character to be consumed. 505 info := rb.f.info(rb.src, sp) 506 if info.size == 0 { 507 return 0 508 } 509 if s := rb.ss.next(info); s == ssStarter { 510 // TODO: this could be removed if we don't support merging. 511 if rb.nrune > 0 { 512 goto end 513 } 514 } else if s == ssOverflow { 515 rb.insertCGJ() 516 goto end 517 } 518 if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { 519 return int(err) 520 } 521 for { 522 sp += int(info.size) 523 if sp >= rb.nsrc { 524 if !atEOF && !info.BoundaryAfter() { 525 return int(iShortSrc) 526 } 527 break 528 } 529 info = rb.f.info(rb.src, sp) 530 if info.size == 0 { 531 if !atEOF { 532 return int(iShortSrc) 533 } 534 break 535 } 536 if s := rb.ss.next(info); s == ssStarter { 537 break 538 } else if s == ssOverflow { 539 rb.insertCGJ() 540 break 541 } 542 if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { 543 return int(err) 544 } 545 } 546 end: 547 if !rb.doFlush() { 548 return int(iShortDst) 549 } 550 return sp 551 } 552 553 // lastRuneStart returns the runeInfo and position of the last 554 // rune in buf or the zero runeInfo and -1 if no rune was found. 555 func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) { 556 p := len(buf) - 1 557 for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- { 558 } 559 if p < 0 { 560 return Properties{}, -1 561 } 562 return fd.info(inputBytes(buf), p), p 563 } 564 565 // decomposeToLastBoundary finds an open segment at the end of the buffer 566 // and scans it into rb. Returns the buffer minus the last segment. 567 func decomposeToLastBoundary(rb *reorderBuffer) { 568 fd := &rb.f 569 info, i := lastRuneStart(fd, rb.out) 570 if int(info.size) != len(rb.out)-i { 571 // illegal trailing continuation bytes 572 return 573 } 574 if info.BoundaryAfter() { 575 return 576 } 577 var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order 578 padd := 0 579 ss := streamSafe(0) 580 p := len(rb.out) 581 for { 582 add[padd] = info 583 v := ss.backwards(info) 584 if v == ssOverflow { 585 // Note that if we have an overflow, it the string we are appending to 586 // is not correctly normalized. In this case the behavior is undefined. 587 break 588 } 589 padd++ 590 p -= int(info.size) 591 if v == ssStarter || p < 0 { 592 break 593 } 594 info, i = lastRuneStart(fd, rb.out[:p]) 595 if int(info.size) != p-i { 596 break 597 } 598 } 599 rb.ss = ss 600 // Copy bytes for insertion as we may need to overwrite rb.out. 601 var buf [maxBufferSize * utf8.UTFMax]byte 602 cp := buf[:copy(buf[:], rb.out[p:])] 603 rb.out = rb.out[:p] 604 for padd--; padd >= 0; padd-- { 605 info = add[padd] 606 rb.insertUnsafe(inputBytes(cp), 0, info) 607 cp = cp[info.size:] 608 } 609 }