golang.org/x/text@v0.14.0/unicode/norm/normalize.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Note: the file data_test.go that is generated should not be checked in. 6 //go:generate go run maketables.go triegen.go 7 //go:generate go test -tags test 8 9 // Package norm contains types and functions for normalizing Unicode strings. 10 package norm // import "golang.org/x/text/unicode/norm" 11 12 import ( 13 "unicode/utf8" 14 15 "golang.org/x/text/transform" 16 ) 17 18 // A Form denotes a canonical representation of Unicode code points. 19 // The Unicode-defined normalization and equivalence forms are: 20 // 21 // NFC Unicode Normalization Form C 22 // NFD Unicode Normalization Form D 23 // NFKC Unicode Normalization Form KC 24 // NFKD Unicode Normalization Form KD 25 // 26 // For a Form f, this documentation uses the notation f(x) to mean 27 // the bytes or string x converted to the given form. 28 // A position n in x is called a boundary if conversion to the form can 29 // proceed independently on both sides: 30 // 31 // f(x) == append(f(x[0:n]), f(x[n:])...) 32 // 33 // References: https://unicode.org/reports/tr15/ and 34 // https://unicode.org/notes/tn5/. 35 type Form int 36 37 const ( 38 NFC Form = iota 39 NFD 40 NFKC 41 NFKD 42 ) 43 44 // Bytes returns f(b). May return b if f(b) = b. 45 func (f Form) Bytes(b []byte) []byte { 46 src := inputBytes(b) 47 ft := formTable[f] 48 n, ok := ft.quickSpan(src, 0, len(b), true) 49 if ok { 50 return b 51 } 52 out := make([]byte, n, len(b)) 53 copy(out, b[0:n]) 54 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush} 55 return doAppendInner(&rb, n) 56 } 57 58 // String returns f(s). 59 func (f Form) String(s string) string { 60 src := inputString(s) 61 ft := formTable[f] 62 n, ok := ft.quickSpan(src, 0, len(s), true) 63 if ok { 64 return s 65 } 66 out := make([]byte, n, len(s)) 67 copy(out, s[0:n]) 68 rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush} 69 return string(doAppendInner(&rb, n)) 70 } 71 72 // IsNormal returns true if b == f(b). 73 func (f Form) IsNormal(b []byte) bool { 74 src := inputBytes(b) 75 ft := formTable[f] 76 bp, ok := ft.quickSpan(src, 0, len(b), true) 77 if ok { 78 return true 79 } 80 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)} 81 rb.setFlusher(nil, cmpNormalBytes) 82 for bp < len(b) { 83 rb.out = b[bp:] 84 if bp = decomposeSegment(&rb, bp, true); bp < 0 { 85 return false 86 } 87 bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true) 88 } 89 return true 90 } 91 92 func cmpNormalBytes(rb *reorderBuffer) bool { 93 b := rb.out 94 for i := 0; i < rb.nrune; i++ { 95 info := rb.rune[i] 96 if int(info.size) > len(b) { 97 return false 98 } 99 p := info.pos 100 pe := p + info.size 101 for ; p < pe; p++ { 102 if b[0] != rb.byte[p] { 103 return false 104 } 105 b = b[1:] 106 } 107 } 108 return true 109 } 110 111 // IsNormalString returns true if s == f(s). 112 func (f Form) IsNormalString(s string) bool { 113 src := inputString(s) 114 ft := formTable[f] 115 bp, ok := ft.quickSpan(src, 0, len(s), true) 116 if ok { 117 return true 118 } 119 rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)} 120 rb.setFlusher(nil, func(rb *reorderBuffer) bool { 121 for i := 0; i < rb.nrune; i++ { 122 info := rb.rune[i] 123 if bp+int(info.size) > len(s) { 124 return false 125 } 126 p := info.pos 127 pe := p + info.size 128 for ; p < pe; p++ { 129 if s[bp] != rb.byte[p] { 130 return false 131 } 132 bp++ 133 } 134 } 135 return true 136 }) 137 for bp < len(s) { 138 if bp = decomposeSegment(&rb, bp, true); bp < 0 { 139 return false 140 } 141 bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true) 142 } 143 return true 144 } 145 146 // patchTail fixes a case where a rune may be incorrectly normalized 147 // if it is followed by illegal continuation bytes. It returns the 148 // patched buffer and whether the decomposition is still in progress. 149 func patchTail(rb *reorderBuffer) bool { 150 info, p := lastRuneStart(&rb.f, rb.out) 151 if p == -1 || info.size == 0 { 152 return true 153 } 154 end := p + int(info.size) 155 extra := len(rb.out) - end 156 if extra > 0 { 157 // Potentially allocating memory. However, this only 158 // happens with ill-formed UTF-8. 159 x := make([]byte, 0) 160 x = append(x, rb.out[len(rb.out)-extra:]...) 161 rb.out = rb.out[:end] 162 decomposeToLastBoundary(rb) 163 rb.doFlush() 164 rb.out = append(rb.out, x...) 165 return false 166 } 167 buf := rb.out[p:] 168 rb.out = rb.out[:p] 169 decomposeToLastBoundary(rb) 170 if s := rb.ss.next(info); s == ssStarter { 171 rb.doFlush() 172 rb.ss.first(info) 173 } else if s == ssOverflow { 174 rb.doFlush() 175 rb.insertCGJ() 176 rb.ss = 0 177 } 178 rb.insertUnsafe(inputBytes(buf), 0, info) 179 return true 180 } 181 182 func appendQuick(rb *reorderBuffer, i int) int { 183 if rb.nsrc == i { 184 return i 185 } 186 end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true) 187 rb.out = rb.src.appendSlice(rb.out, i, end) 188 return end 189 } 190 191 // Append returns f(append(out, b...)). 192 // The buffer out must be nil, empty, or equal to f(out). 193 func (f Form) Append(out []byte, src ...byte) []byte { 194 return f.doAppend(out, inputBytes(src), len(src)) 195 } 196 197 func (f Form) doAppend(out []byte, src input, n int) []byte { 198 if n == 0 { 199 return out 200 } 201 ft := formTable[f] 202 // Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer. 203 if len(out) == 0 { 204 p, _ := ft.quickSpan(src, 0, n, true) 205 out = src.appendSlice(out, 0, p) 206 if p == n { 207 return out 208 } 209 rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush} 210 return doAppendInner(&rb, p) 211 } 212 rb := reorderBuffer{f: *ft, src: src, nsrc: n} 213 return doAppend(&rb, out, 0) 214 } 215 216 func doAppend(rb *reorderBuffer, out []byte, p int) []byte { 217 rb.setFlusher(out, appendFlush) 218 src, n := rb.src, rb.nsrc 219 doMerge := len(out) > 0 220 if q := src.skipContinuationBytes(p); q > p { 221 // Move leading non-starters to destination. 222 rb.out = src.appendSlice(rb.out, p, q) 223 p = q 224 doMerge = patchTail(rb) 225 } 226 fd := &rb.f 227 if doMerge { 228 var info Properties 229 if p < n { 230 info = fd.info(src, p) 231 if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 { 232 if p == 0 { 233 decomposeToLastBoundary(rb) 234 } 235 p = decomposeSegment(rb, p, true) 236 } 237 } 238 if info.size == 0 { 239 rb.doFlush() 240 // Append incomplete UTF-8 encoding. 241 return src.appendSlice(rb.out, p, n) 242 } 243 if rb.nrune > 0 { 244 return doAppendInner(rb, p) 245 } 246 } 247 p = appendQuick(rb, p) 248 return doAppendInner(rb, p) 249 } 250 251 func doAppendInner(rb *reorderBuffer, p int) []byte { 252 for n := rb.nsrc; p < n; { 253 p = decomposeSegment(rb, p, true) 254 p = appendQuick(rb, p) 255 } 256 return rb.out 257 } 258 259 // AppendString returns f(append(out, []byte(s))). 260 // The buffer out must be nil, empty, or equal to f(out). 261 func (f Form) AppendString(out []byte, src string) []byte { 262 return f.doAppend(out, inputString(src), len(src)) 263 } 264 265 // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]). 266 // It is not guaranteed to return the largest such n. 267 func (f Form) QuickSpan(b []byte) int { 268 n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true) 269 return n 270 } 271 272 // Span implements transform.SpanningTransformer. It returns a boundary n such 273 // that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n. 274 func (f Form) Span(b []byte, atEOF bool) (n int, err error) { 275 n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF) 276 if n < len(b) { 277 if !ok { 278 err = transform.ErrEndOfSpan 279 } else { 280 err = transform.ErrShortSrc 281 } 282 } 283 return n, err 284 } 285 286 // SpanString returns a boundary n such that s[0:n] == f(s[0:n]). 287 // It is not guaranteed to return the largest such n. 288 func (f Form) SpanString(s string, atEOF bool) (n int, err error) { 289 n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF) 290 if n < len(s) { 291 if !ok { 292 err = transform.ErrEndOfSpan 293 } else { 294 err = transform.ErrShortSrc 295 } 296 } 297 return n, err 298 } 299 300 // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and 301 // whether any non-normalized parts were found. If atEOF is false, n will 302 // not point past the last segment if this segment might be become 303 // non-normalized by appending other runes. 304 func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) { 305 var lastCC uint8 306 ss := streamSafe(0) 307 lastSegStart := i 308 for n = end; i < n; { 309 if j := src.skipASCII(i, n); i != j { 310 i = j 311 lastSegStart = i - 1 312 lastCC = 0 313 ss = 0 314 continue 315 } 316 info := f.info(src, i) 317 if info.size == 0 { 318 if atEOF { 319 // include incomplete runes 320 return n, true 321 } 322 return lastSegStart, true 323 } 324 // This block needs to be before the next, because it is possible to 325 // have an overflow for runes that are starters (e.g. with U+FF9E). 326 switch ss.next(info) { 327 case ssStarter: 328 lastSegStart = i 329 case ssOverflow: 330 return lastSegStart, false 331 case ssSuccess: 332 if lastCC > info.ccc { 333 return lastSegStart, false 334 } 335 } 336 if f.composing { 337 if !info.isYesC() { 338 break 339 } 340 } else { 341 if !info.isYesD() { 342 break 343 } 344 } 345 lastCC = info.ccc 346 i += int(info.size) 347 } 348 if i == n { 349 if !atEOF { 350 n = lastSegStart 351 } 352 return n, true 353 } 354 return lastSegStart, false 355 } 356 357 // QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]). 358 // It is not guaranteed to return the largest such n. 359 func (f Form) QuickSpanString(s string) int { 360 n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true) 361 return n 362 } 363 364 // FirstBoundary returns the position i of the first boundary in b 365 // or -1 if b contains no boundary. 366 func (f Form) FirstBoundary(b []byte) int { 367 return f.firstBoundary(inputBytes(b), len(b)) 368 } 369 370 func (f Form) firstBoundary(src input, nsrc int) int { 371 i := src.skipContinuationBytes(0) 372 if i >= nsrc { 373 return -1 374 } 375 fd := formTable[f] 376 ss := streamSafe(0) 377 // We should call ss.first here, but we can't as the first rune is 378 // skipped already. This means FirstBoundary can't really determine 379 // CGJ insertion points correctly. Luckily it doesn't have to. 380 for { 381 info := fd.info(src, i) 382 if info.size == 0 { 383 return -1 384 } 385 if s := ss.next(info); s != ssSuccess { 386 return i 387 } 388 i += int(info.size) 389 if i >= nsrc { 390 if !info.BoundaryAfter() && !ss.isMax() { 391 return -1 392 } 393 return nsrc 394 } 395 } 396 } 397 398 // FirstBoundaryInString returns the position i of the first boundary in s 399 // or -1 if s contains no boundary. 400 func (f Form) FirstBoundaryInString(s string) int { 401 return f.firstBoundary(inputString(s), len(s)) 402 } 403 404 // NextBoundary reports the index of the boundary between the first and next 405 // segment in b or -1 if atEOF is false and there are not enough bytes to 406 // determine this boundary. 407 func (f Form) NextBoundary(b []byte, atEOF bool) int { 408 return f.nextBoundary(inputBytes(b), len(b), atEOF) 409 } 410 411 // NextBoundaryInString reports the index of the boundary between the first and 412 // next segment in b or -1 if atEOF is false and there are not enough bytes to 413 // determine this boundary. 414 func (f Form) NextBoundaryInString(s string, atEOF bool) int { 415 return f.nextBoundary(inputString(s), len(s), atEOF) 416 } 417 418 func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int { 419 if nsrc == 0 { 420 if atEOF { 421 return 0 422 } 423 return -1 424 } 425 fd := formTable[f] 426 info := fd.info(src, 0) 427 if info.size == 0 { 428 if atEOF { 429 return 1 430 } 431 return -1 432 } 433 ss := streamSafe(0) 434 ss.first(info) 435 436 for i := int(info.size); i < nsrc; i += int(info.size) { 437 info = fd.info(src, i) 438 if info.size == 0 { 439 if atEOF { 440 return i 441 } 442 return -1 443 } 444 // TODO: Using streamSafe to determine the boundary isn't the same as 445 // using BoundaryBefore. Determine which should be used. 446 if s := ss.next(info); s != ssSuccess { 447 return i 448 } 449 } 450 if !atEOF && !info.BoundaryAfter() && !ss.isMax() { 451 return -1 452 } 453 return nsrc 454 } 455 456 // LastBoundary returns the position i of the last boundary in b 457 // or -1 if b contains no boundary. 458 func (f Form) LastBoundary(b []byte) int { 459 return lastBoundary(formTable[f], b) 460 } 461 462 func lastBoundary(fd *formInfo, b []byte) int { 463 i := len(b) 464 info, p := lastRuneStart(fd, b) 465 if p == -1 { 466 return -1 467 } 468 if info.size == 0 { // ends with incomplete rune 469 if p == 0 { // starts with incomplete rune 470 return -1 471 } 472 i = p 473 info, p = lastRuneStart(fd, b[:i]) 474 if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter 475 return i 476 } 477 } 478 if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8 479 return i 480 } 481 if info.BoundaryAfter() { 482 return i 483 } 484 ss := streamSafe(0) 485 v := ss.backwards(info) 486 for i = p; i >= 0 && v != ssStarter; i = p { 487 info, p = lastRuneStart(fd, b[:i]) 488 if v = ss.backwards(info); v == ssOverflow { 489 break 490 } 491 if p+int(info.size) != i { 492 if p == -1 { // no boundary found 493 return -1 494 } 495 return i // boundary after an illegal UTF-8 encoding 496 } 497 } 498 return i 499 } 500 501 // decomposeSegment scans the first segment in src into rb. It inserts 0x034f 502 // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters 503 // and returns the number of bytes consumed from src or iShortDst or iShortSrc. 504 func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int { 505 // Force one character to be consumed. 506 info := rb.f.info(rb.src, sp) 507 if info.size == 0 { 508 return 0 509 } 510 if s := rb.ss.next(info); s == ssStarter { 511 // TODO: this could be removed if we don't support merging. 512 if rb.nrune > 0 { 513 goto end 514 } 515 } else if s == ssOverflow { 516 rb.insertCGJ() 517 goto end 518 } 519 if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { 520 return int(err) 521 } 522 for { 523 sp += int(info.size) 524 if sp >= rb.nsrc { 525 if !atEOF && !info.BoundaryAfter() { 526 return int(iShortSrc) 527 } 528 break 529 } 530 info = rb.f.info(rb.src, sp) 531 if info.size == 0 { 532 if !atEOF { 533 return int(iShortSrc) 534 } 535 break 536 } 537 if s := rb.ss.next(info); s == ssStarter { 538 break 539 } else if s == ssOverflow { 540 rb.insertCGJ() 541 break 542 } 543 if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { 544 return int(err) 545 } 546 } 547 end: 548 if !rb.doFlush() { 549 return int(iShortDst) 550 } 551 return sp 552 } 553 554 // lastRuneStart returns the runeInfo and position of the last 555 // rune in buf or the zero runeInfo and -1 if no rune was found. 556 func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) { 557 p := len(buf) - 1 558 for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- { 559 } 560 if p < 0 { 561 return Properties{}, -1 562 } 563 return fd.info(inputBytes(buf), p), p 564 } 565 566 // decomposeToLastBoundary finds an open segment at the end of the buffer 567 // and scans it into rb. Returns the buffer minus the last segment. 568 func decomposeToLastBoundary(rb *reorderBuffer) { 569 fd := &rb.f 570 info, i := lastRuneStart(fd, rb.out) 571 if int(info.size) != len(rb.out)-i { 572 // illegal trailing continuation bytes 573 return 574 } 575 if info.BoundaryAfter() { 576 return 577 } 578 var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order 579 padd := 0 580 ss := streamSafe(0) 581 p := len(rb.out) 582 for { 583 add[padd] = info 584 v := ss.backwards(info) 585 if v == ssOverflow { 586 // Note that if we have an overflow, it the string we are appending to 587 // is not correctly normalized. In this case the behavior is undefined. 588 break 589 } 590 padd++ 591 p -= int(info.size) 592 if v == ssStarter || p < 0 { 593 break 594 } 595 info, i = lastRuneStart(fd, rb.out[:p]) 596 if int(info.size) != p-i { 597 break 598 } 599 } 600 rb.ss = ss 601 // Copy bytes for insertion as we may need to overwrite rb.out. 602 var buf [maxBufferSize * utf8.UTFMax]byte 603 cp := buf[:copy(buf[:], rb.out[p:])] 604 rb.out = rb.out[:p] 605 for padd--; padd >= 0; padd-- { 606 info = add[padd] 607 rb.insertUnsafe(inputBytes(cp), 0, info) 608 cp = cp[info.size:] 609 } 610 }