github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/unicode/norm/normalize.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:generate go run maketables.go triegen.go 6 //go:generate go run maketables.go triegen.go -test 7 8 // Package norm contains types and functions for normalizing Unicode strings. 9 package norm // import "golang.org/x/text/unicode/norm" 10 11 import "unicode/utf8" 12 13 // A Form denotes a canonical representation of Unicode code points. 14 // The Unicode-defined normalization and equivalence forms are: 15 // 16 // NFC Unicode Normalization Form C 17 // NFD Unicode Normalization Form D 18 // NFKC Unicode Normalization Form KC 19 // NFKD Unicode Normalization Form KD 20 // 21 // For a Form f, this documentation uses the notation f(x) to mean 22 // the bytes or string x converted to the given form. 23 // A position n in x is called a boundary if conversion to the form can 24 // proceed independently on both sides: 25 // f(x) == append(f(x[0:n]), f(x[n:])...) 26 // 27 // References: http://unicode.org/reports/tr15/ and 28 // http://unicode.org/notes/tn5/. 29 type Form int 30 31 const ( 32 NFC Form = iota 33 NFD 34 NFKC 35 NFKD 36 ) 37 38 // Bytes returns f(b). May return b if f(b) = b. 39 func (f Form) Bytes(b []byte) []byte { 40 src := inputBytes(b) 41 ft := formTable[f] 42 n, ok := ft.quickSpan(src, 0, len(b), true) 43 if ok { 44 return b 45 } 46 out := make([]byte, n, len(b)) 47 copy(out, b[0:n]) 48 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush} 49 return doAppendInner(&rb, n) 50 } 51 52 // String returns f(s). 
53 func (f Form) String(s string) string { 54 src := inputString(s) 55 ft := formTable[f] 56 n, ok := ft.quickSpan(src, 0, len(s), true) 57 if ok { 58 return s 59 } 60 out := make([]byte, n, len(s)) 61 copy(out, s[0:n]) 62 rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush} 63 return string(doAppendInner(&rb, n)) 64 } 65 66 // IsNormal returns true if b == f(b). 67 func (f Form) IsNormal(b []byte) bool { 68 src := inputBytes(b) 69 ft := formTable[f] 70 bp, ok := ft.quickSpan(src, 0, len(b), true) 71 if ok { 72 return true 73 } 74 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)} 75 rb.setFlusher(nil, cmpNormalBytes) 76 for bp < len(b) { 77 rb.out = b[bp:] 78 if bp = decomposeSegment(&rb, bp, true); bp < 0 { 79 return false 80 } 81 bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true) 82 } 83 return true 84 } 85 86 func cmpNormalBytes(rb *reorderBuffer) bool { 87 b := rb.out 88 for i := 0; i < rb.nrune; i++ { 89 info := rb.rune[i] 90 if int(info.size) > len(b) { 91 return false 92 } 93 p := info.pos 94 pe := p + info.size 95 for ; p < pe; p++ { 96 if b[0] != rb.byte[p] { 97 return false 98 } 99 b = b[1:] 100 } 101 } 102 return true 103 } 104 105 // IsNormalString returns true if s == f(s). 
func (f Form) IsNormalString(s string) bool {
	src := inputString(s)
	ft := formTable[f]
	// bp is the end of the prefix accepted by quickSpan; ok is true when
	// quickSpan accepted all of s.
	bp, ok := ft.quickSpan(src, 0, len(s), true)
	if ok {
		return true
	}
	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
	// The flush callback compares the runes held in rb with s starting at bp.
	// bp is captured from the enclosing function and advanced past every
	// byte that matches.
	rb.setFlusher(nil, func(rb *reorderBuffer) bool {
		for i := 0; i < rb.nrune; i++ {
			info := rb.rune[i]
			if bp+int(info.size) > len(s) {
				return false
			}
			p := info.pos
			pe := p + info.size
			for ; p < pe; p++ {
				if s[bp] != rb.byte[p] {
					return false
				}
				bp++
			}
		}
		return true
	})
	for bp < len(s) {
		// A negative result is treated as "not normalized".
		// NOTE(review): presumably this is iShortDst, produced when the
		// flush callback above returns false; confirm against doFlush.
		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
			return false
		}
		bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
	}
	return true
}

// patchTail fixes a case where a rune may be incorrectly normalized
// if it is followed by illegal continuation bytes. It modifies rb in place
// and reports whether the decomposition is still in progress: it returns
// false after finding bytes behind the last rune of rb.out (those bytes are
// set aside, the text before them is flushed, and they are appended again),
// and true in all other cases.
func patchTail(rb *reorderBuffer) bool {
	info, p := lastRuneStart(&rb.f, rb.out)
	if p == -1 || info.size == 0 {
		// No rune start in rb.out, or no size for the rune found there:
		// leave rb untouched.
		return true
	}
	end := p + int(info.size)
	// extra counts the bytes of rb.out that follow the last rune.
	extra := len(rb.out) - end
	if extra > 0 {
		// Potentially allocating memory. However, this only
		// happens with ill-formed UTF-8.
		x := make([]byte, 0)
		x = append(x, rb.out[len(rb.out)-extra:]...)
		rb.out = rb.out[:end]
		decomposeToLastBoundary(rb)
		rb.doFlush()
		rb.out = append(rb.out, x...)
		return false
	}
	// The last rune reaches exactly to the end of rb.out: cut it off,
	// decompose what precedes it, and insert the rune into rb again.
	buf := rb.out[p:]
	rb.out = rb.out[:p]
	decomposeToLastBoundary(rb)
	if s := rb.ss.next(info); s == ssStarter {
		rb.doFlush()
		rb.ss.first(info)
	} else if s == ssOverflow {
		rb.doFlush()
		rb.insertCGJ()
		rb.ss = 0
	}
	rb.insertUnsafe(inputBytes(buf), 0, info)
	return true
}

// appendQuick appends to rb.out the bytes of rb.src from i up to the position
// returned by quickSpan and returns that position. It returns i without
// doing anything when i equals rb.nsrc.
func appendQuick(rb *reorderBuffer, i int) int {
	if rb.nsrc == i {
		return i
	}
	end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
	rb.out = rb.src.appendSlice(rb.out, i, end)
	return end
}

// Append returns f(append(out, src...)).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) Append(out []byte, src ...byte) []byte {
	return f.doAppend(out, inputBytes(src), len(src))
}

// doAppend is the shared implementation of Append and AppendString for the
// n bytes of src. It returns out unchanged when n is 0. When out is empty,
// the prefix accepted by quickSpan is appended directly and a reorderBuffer
// is only created if part of src remains. Otherwise the work is passed to
// the package-level doAppend.
func (f Form) doAppend(out []byte, src input, n int) []byte {
	if n == 0 {
		return out
	}
	ft := formTable[f]
	// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
	if len(out) == 0 {
		p, _ := ft.quickSpan(src, 0, n, true)
		out = src.appendSlice(out, 0, p)
		if p == n {
			return out
		}
		rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
		return doAppendInner(&rb, p)
	}
	rb := reorderBuffer{f: *ft, src: src, nsrc: n}
	return doAppend(&rb, out, 0)
}

// doAppend appends the normalization of rb.src, starting at position p, to
// out and returns the resulting buffer. It calls rb.setFlusher(out,
// appendFlush) and from then on works on rb.out. Continuation bytes at p are
// copied first and patchTail decides whether a merge is still needed. When a
// merge is needed, the end of the output is combined with the first segment
// of the source. The rest is handled by appendQuick and doAppendInner.
func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
	rb.setFlusher(out, appendFlush)
	src, n := rb.src, rb.nsrc
	doMerge := len(out) > 0
	if q := src.skipContinuationBytes(p); q > p {
		// Move leading non-starters to destination.
		rb.out = src.appendSlice(rb.out, p, q)
		p = q
		doMerge = patchTail(rb)
	}
	fd := &rb.f
	if doMerge {
		// info stays the zero Properties (size 0) when p >= n.
		var info Properties
		if p < n {
			info = fd.info(src, p)
			if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
				if p == 0 {
					decomposeToLastBoundary(rb)
				}
				p = decomposeSegment(rb, p, true)
			}
		}
		if info.size == 0 {
			rb.doFlush()
			// Append incomplete UTF-8 encoding.
			return src.appendSlice(rb.out, p, n)
		}
		if rb.nrune > 0 {
			// Runes are still buffered in rb; skip the appendQuick call
			// below and continue with segment decomposition.
			return doAppendInner(rb, p)
		}
	}
	p = appendQuick(rb, p)
	return doAppendInner(rb, p)
}

// doAppendInner alternates decomposeSegment and appendQuick, starting at
// position p, until p reaches rb.nsrc. It returns rb.out.
func doAppendInner(rb *reorderBuffer, p int) []byte {
	for n := rb.nsrc; p < n; {
		p = decomposeSegment(rb, p, true)
		p = appendQuick(rb, p)
	}
	return rb.out
}

// AppendString returns f(append(out, src...)).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) AppendString(out []byte, src string) []byte {
	return f.doAppend(out, inputString(src), len(src))
}

// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpan(b []byte) int {
	n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
	return n
}

// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
// whether no non-normalized parts were found. If atEOF is false, n will
// not point past the last segment if this segment might become
// non-normalized by appending other runes.
// The scan covers src from position i up to end.
func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
	// lastCC is the ccc value of the previously accepted rune.
	var lastCC uint8
	ss := streamSafe(0)
	// lastSegStart is the position returned whenever the scan has to stop
	// before reaching n.
	lastSegStart := i
	for n = end; i < n; {
		if j := src.skipASCII(i, n); i != j {
			// skipASCII advanced past a run of bytes: the last skipped byte
			// starts the current segment, and the per-segment state is reset.
			i = j
			lastSegStart = i - 1
			lastCC = 0
			ss = 0
			continue
		}
		info := f.info(src, i)
		if info.size == 0 {
			if atEOF {
				// include incomplete runes
				return n, true
			}
			return lastSegStart, true
		}
		// This block needs to be before the next, because it is possible to
		// have an overflow for runes that are starters (e.g. with U+FF9E).
		switch ss.next(info) {
		case ssStarter:
			ss.first(info)
			lastSegStart = i
		case ssOverflow:
			return lastSegStart, false
		case ssSuccess:
			// A rune whose ccc is lower than that of its predecessor stops
			// the span.
			if lastCC > info.ccc {
				return lastSegStart, false
			}
		}
		// The breaks below leave the for loop with i < n, so the function
		// ends with "return lastSegStart, false".
		if f.composing {
			if !info.isYesC() {
				break
			}
		} else {
			if !info.isYesD() {
				break
			}
		}
		lastCC = info.ccc
		i += int(info.size)
	}
	if i == n {
		// The whole range was accepted. Without atEOF the last segment is
		// held back.
		if !atEOF {
			n = lastSegStart
		}
		return n, true
	}
	return lastSegStart, false
}

// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpanString(s string) int {
	n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
	return n
}

// FirstBoundary returns the position i of the first boundary in b
// or -1 if b contains no boundary.
func (f Form) FirstBoundary(b []byte) int {
	return f.firstBoundary(inputBytes(b), len(b))
}

// firstBoundary is the shared implementation of FirstBoundary and
// FirstBoundaryInString for the nsrc bytes of src. It returns -1 when
// skipping continuation bytes from position 0 reaches nsrc, or when a rune
// of size 0 is encountered.
func (f Form) firstBoundary(src input, nsrc int) int {
	i := src.skipContinuationBytes(0)
	if i >= nsrc {
		return -1
	}
	fd := formTable[f]
	ss := streamSafe(0)
	// We should call ss.first here, but we can't as the first rune is
	// skipped already. This means FirstBoundary can't really determine
	// CGJ insertion points correctly. Luckily it doesn't have to.
	// TODO: consider adding NextBoundary
	for {
		info := fd.info(src, i)
		if info.size == 0 {
			return -1
		}
		// Any result other than ssSuccess marks a boundary in front of
		// this rune.
		if s := ss.next(info); s != ssSuccess {
			return i
		}
		i += int(info.size)
		if i >= nsrc {
			// End of input: nsrc is only a boundary if the last rune has a
			// boundary after it or ss.isMax() reports true.
			if !info.BoundaryAfter() && !ss.isMax() {
				return -1
			}
			return nsrc
		}
	}
}

// FirstBoundaryInString returns the position i of the first boundary in s
// or -1 if s contains no boundary.
func (f Form) FirstBoundaryInString(s string) int {
	return f.firstBoundary(inputString(s), len(s))
}

// LastBoundary returns the position i of the last boundary in b
// or -1 if b contains no boundary.
func (f Form) LastBoundary(b []byte) int {
	return lastBoundary(formTable[f], b)
}

// lastBoundary implements LastBoundary for the form described by fd. It
// walks backwards from the end of b, one rune start at a time, and returns
// the position of the last boundary or -1 if none is found.
func lastBoundary(fd *formInfo, b []byte) int {
	i := len(b)
	info, p := lastRuneStart(fd, b)
	if p == -1 {
		return -1
	}
	if info.size == 0 { // ends with incomplete rune
		if p == 0 { // starts with incomplete rune
			return -1
		}
		i = p
		info, p = lastRuneStart(fd, b[:i])
		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
			return i
		}
	}
	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
		return i
	}
	if info.BoundaryAfter() {
		return i
	}
	ss := streamSafe(0)
	v := ss.backwards(info)
	// Step back one rune per iteration until ss.backwards reports a starter
	// or an overflow. i always holds the start of the rune examined last.
	for i = p; i >= 0 && v != ssStarter; i = p {
		info, p = lastRuneStart(fd, b[:i])
		if v = ss.backwards(info); v == ssOverflow {
			break
		}
		if p+int(info.size) != i {
			if p == -1 { // no boundary found
				return -1
			}
			return i // boundary after an illegal UTF-8 encoding
		}
	}
	return i
}

// decomposeSegment scans the first segment in src into rb.
// It inserts 0x034f
// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
// and returns the position in src behind the consumed bytes, or iShortDst or
// iShortSrc converted to int. It returns 0 when the rune at sp has size 0.
// The scan starts at position sp of rb.src. When atEOF is false, running out
// of input before the segment is known to be complete yields iShortSrc.
func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
	// Force one character to be consumed.
	info := rb.f.info(rb.src, sp)
	if info.size == 0 {
		return 0
	}
	if rb.nrune > 0 {
		// rb already holds runes. If the rune at sp cannot join them, flush
		// what is buffered and return sp unchanged.
		if s := rb.ss.next(info); s == ssStarter {
			goto end
		} else if s == ssOverflow {
			rb.insertCGJ()
			goto end
		}
	} else {
		rb.ss.first(info)
	}
	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
		return int(err)
	}
	for {
		sp += int(info.size)
		if sp >= rb.nsrc {
			// Input exhausted: more input is only required when not at EOF
			// and the last rune has no boundary after it.
			if !atEOF && !info.BoundaryAfter() {
				return int(iShortSrc)
			}
			break
		}
		info = rb.f.info(rb.src, sp)
		if info.size == 0 {
			if !atEOF {
				return int(iShortSrc)
			}
			break
		}
		// A starter, or an overflow (after inserting a CGJ), ends the
		// segment in front of the rune at sp.
		if s := rb.ss.next(info); s == ssStarter {
			break
		} else if s == ssOverflow {
			rb.insertCGJ()
			break
		}
		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
			return int(err)
		}
	}
end:
	if !rb.doFlush() {
		return int(iShortDst)
	}
	return sp
}

// lastRuneStart returns the runeInfo and position of the last
// rune in buf or the zero runeInfo and -1 if no rune was found.
// The position is that of the last byte in buf for which utf8.RuneStart
// reports true.
func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
	p := len(buf) - 1
	// Walk backwards until a byte that can start a rune is found.
	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
	}
	if p < 0 {
		return Properties{}, -1
	}
	return fd.info(inputBytes(buf), p), p
}

// decomposeToLastBoundary finds an open segment at the end of the buffer
// and scans it into rb. The scanned segment is cut off from rb.out.
485 func decomposeToLastBoundary(rb *reorderBuffer) { 486 fd := &rb.f 487 info, i := lastRuneStart(fd, rb.out) 488 if int(info.size) != len(rb.out)-i { 489 // illegal trailing continuation bytes 490 return 491 } 492 if info.BoundaryAfter() { 493 return 494 } 495 var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order 496 padd := 0 497 ss := streamSafe(0) 498 p := len(rb.out) 499 for { 500 add[padd] = info 501 v := ss.backwards(info) 502 if v == ssOverflow { 503 // Note that if we have an overflow, it the string we are appending to 504 // is not correctly normalized. In this case the behavior is undefined. 505 break 506 } 507 padd++ 508 p -= int(info.size) 509 if v == ssStarter || p < 0 { 510 break 511 } 512 info, i = lastRuneStart(fd, rb.out[:p]) 513 if int(info.size) != p-i { 514 break 515 } 516 } 517 rb.ss = ss 518 // Copy bytes for insertion as we may need to overwrite rb.out. 519 var buf [maxBufferSize * utf8.UTFMax]byte 520 cp := buf[:copy(buf[:], rb.out[p:])] 521 rb.out = rb.out[:p] 522 for padd--; padd >= 0; padd-- { 523 info = add[padd] 524 rb.insertUnsafe(inputBytes(cp), 0, info) 525 cp = cp[info.size:] 526 } 527 }