github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/unicode/norm/composition.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package norm 6 7 import "unicode/utf8" 8 9 const ( 10 maxNonStarters = 30 11 // The maximum number of characters needed for a buffer is 12 // maxNonStarters + 1 for the starter + 1 for the GCJ 13 maxBufferSize = maxNonStarters + 2 14 maxNFCExpansion = 3 // NFC(0x1D160) 15 maxNFKCExpansion = 18 // NFKC(0xFDFA) 16 17 maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128 18 ) 19 20 // ssState is used for reporting the segment state after inserting a rune. 21 // It is returned by streamSafe.next. 22 type ssState int 23 24 const ( 25 // Indicates a rune was successfully added to the segment. 26 ssSuccess ssState = iota 27 // Indicates a rune starts a new segment and should not be added. 28 ssStarter 29 // Indicates a rune caused a segment overflow and a CGJ should be inserted. 30 ssOverflow 31 ) 32 33 // streamSafe implements the policy of when a CGJ should be inserted. 34 type streamSafe uint8 35 36 // mkStreamSafe is a shorthand for declaring a streamSafe var and calling 37 // first on it. 38 func mkStreamSafe(p Properties) streamSafe { 39 return streamSafe(p.nTrailingNonStarters()) 40 } 41 42 // first inserts the first rune of a segment. 43 func (ss *streamSafe) first(p Properties) { 44 if *ss != 0 { 45 panic("!= 0") 46 } 47 *ss = streamSafe(p.nTrailingNonStarters()) 48 } 49 50 // insert returns a ssState value to indicate whether a rune represented by p 51 // can be inserted. 52 func (ss *streamSafe) next(p Properties) ssState { 53 if *ss > maxNonStarters { 54 panic("streamSafe was not reset") 55 } 56 n := p.nLeadingNonStarters() 57 if *ss += streamSafe(n); *ss > maxNonStarters { 58 *ss = 0 59 return ssOverflow 60 } 61 // The Stream-Safe Text Processing prescribes that the counting can stop 62 // as soon as a starter is encountered. However, there are some starters, 63 // like Jamo V and T, that can combine with other runes, leaving their 64 // successive non-starters appended to the previous, possibly causing an 65 // overflow. We will therefore consider any rune with a non-zero nLead to 66 // be a non-starter. Note that it always hold that if nLead > 0 then 67 // nLead == nTrail. 68 if n == 0 { 69 *ss = 0 70 return ssStarter 71 } 72 return ssSuccess 73 } 74 75 // backwards is used for checking for overflow and segment starts 76 // when traversing a string backwards. Users do not need to call first 77 // for the first rune. The state of the streamSafe retains the count of 78 // the non-starters loaded. 79 func (ss *streamSafe) backwards(p Properties) ssState { 80 if *ss > maxNonStarters { 81 panic("streamSafe was not reset") 82 } 83 c := *ss + streamSafe(p.nTrailingNonStarters()) 84 if c > maxNonStarters { 85 return ssOverflow 86 } 87 *ss = c 88 if p.nLeadingNonStarters() == 0 { 89 return ssStarter 90 } 91 return ssSuccess 92 } 93 94 func (ss streamSafe) isMax() bool { 95 return ss == maxNonStarters 96 } 97 98 // GraphemeJoiner is inserted after maxNonStarters non-starter runes. 99 const GraphemeJoiner = "\u034F" 100 101 // reorderBuffer is used to normalize a single segment. Characters inserted with 102 // insert are decomposed and reordered based on CCC. The compose method can 103 // be used to recombine characters. Note that the byte buffer does not hold 104 // the UTF-8 characters in order. Only the rune array is maintained in sorted 105 // order. flush writes the resulting segment to a byte array. 106 type reorderBuffer struct { 107 rune [maxBufferSize]Properties // Per character info. 108 byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by runeInfo.pos. 109 nbyte uint8 // Number or bytes. 110 ss streamSafe // For limiting length of non-starter sequence. 111 nrune int // Number of runeInfos. 112 f formInfo 113 114 src input 115 nsrc int 116 tmpBytes input 117 118 out []byte 119 flushF func(*reorderBuffer) bool 120 } 121 122 func (rb *reorderBuffer) init(f Form, src []byte) { 123 rb.f = *formTable[f] 124 rb.src.setBytes(src) 125 rb.nsrc = len(src) 126 rb.ss = 0 127 } 128 129 func (rb *reorderBuffer) initString(f Form, src string) { 130 rb.f = *formTable[f] 131 rb.src.setString(src) 132 rb.nsrc = len(src) 133 rb.ss = 0 134 } 135 136 func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) { 137 rb.out = out 138 rb.flushF = f 139 } 140 141 // reset discards all characters from the buffer. 142 func (rb *reorderBuffer) reset() { 143 rb.nrune = 0 144 rb.nbyte = 0 145 rb.ss = 0 146 } 147 148 func (rb *reorderBuffer) doFlush() bool { 149 if rb.f.composing { 150 rb.compose() 151 } 152 res := rb.flushF(rb) 153 rb.reset() 154 return res 155 } 156 157 // appendFlush appends the normalized segment to rb.out. 158 func appendFlush(rb *reorderBuffer) bool { 159 for i := 0; i < rb.nrune; i++ { 160 start := rb.rune[i].pos 161 end := start + rb.rune[i].size 162 rb.out = append(rb.out, rb.byte[start:end]...) 163 } 164 return true 165 } 166 167 // flush appends the normalized segment to out and resets rb. 168 func (rb *reorderBuffer) flush(out []byte) []byte { 169 for i := 0; i < rb.nrune; i++ { 170 start := rb.rune[i].pos 171 end := start + rb.rune[i].size 172 out = append(out, rb.byte[start:end]...) 173 } 174 rb.reset() 175 return out 176 } 177 178 // flushCopy copies the normalized segment to buf and resets rb. 179 // It returns the number of bytes written to buf. 180 func (rb *reorderBuffer) flushCopy(buf []byte) int { 181 p := 0 182 for i := 0; i < rb.nrune; i++ { 183 runep := rb.rune[i] 184 p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size]) 185 } 186 rb.reset() 187 return p 188 } 189 190 // insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class. 191 // It returns false if the buffer is not large enough to hold the rune. 192 // It is used internally by insert and insertString only. 193 func (rb *reorderBuffer) insertOrdered(info Properties) { 194 n := rb.nrune 195 b := rb.rune[:] 196 cc := info.ccc 197 if cc > 0 { 198 // Find insertion position + move elements to make room. 199 for ; n > 0; n-- { 200 if b[n-1].ccc <= cc { 201 break 202 } 203 b[n] = b[n-1] 204 } 205 } 206 rb.nrune += 1 207 pos := uint8(rb.nbyte) 208 rb.nbyte += utf8.UTFMax 209 info.pos = pos 210 b[n] = info 211 } 212 213 // insertErr is an error code returned by insert. Using this type instead 214 // of error improves performance up to 20% for many of the benchmarks. 215 type insertErr int 216 217 const ( 218 iSuccess insertErr = -iota 219 iShortDst 220 iShortSrc 221 ) 222 223 // insertFlush inserts the given rune in the buffer ordered by CCC. 224 // If a decomposition with multiple segments are encountered, they leading 225 // ones are flushed. 226 // It returns a non-zero error code if the rune was not inserted. 227 func (rb *reorderBuffer) insertFlush(src input, i int, info Properties) insertErr { 228 if rune := src.hangul(i); rune != 0 { 229 rb.decomposeHangul(rune) 230 return iSuccess 231 } 232 if info.hasDecomposition() { 233 return rb.insertDecomposed(info.Decomposition()) 234 } 235 rb.insertSingle(src, i, info) 236 return iSuccess 237 } 238 239 // insertUnsafe inserts the given rune in the buffer ordered by CCC. 240 // It is assumed there is sufficient space to hold the runes. It is the 241 // responsibility of the caller to ensure this. This can be done by checking 242 // the state returned by the streamSafe type. 243 func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) { 244 if rune := src.hangul(i); rune != 0 { 245 rb.decomposeHangul(rune) 246 } 247 if info.hasDecomposition() { 248 // TODO: inline. 249 rb.insertDecomposed(info.Decomposition()) 250 } else { 251 rb.insertSingle(src, i, info) 252 } 253 } 254 255 // insertDecomposed inserts an entry in to the reorderBuffer for each rune 256 // in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes. 257 // It flushes the buffer on each new segment start. 258 func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr { 259 rb.tmpBytes.setBytes(dcomp) 260 for i := 0; i < len(dcomp); { 261 info := rb.f.info(rb.tmpBytes, i) 262 if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() { 263 return iShortDst 264 } 265 i += copy(rb.byte[rb.nbyte:], dcomp[i:i+int(info.size)]) 266 rb.insertOrdered(info) 267 } 268 return iSuccess 269 } 270 271 // insertSingle inserts an entry in the reorderBuffer for the rune at 272 // position i. info is the runeInfo for the rune at position i. 273 func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) { 274 src.copySlice(rb.byte[rb.nbyte:], i, i+int(info.size)) 275 rb.insertOrdered(info) 276 } 277 278 // insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb. 279 func (rb *reorderBuffer) insertCGJ() { 280 rb.insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))}) 281 } 282 283 // appendRune inserts a rune at the end of the buffer. It is used for Hangul. 284 func (rb *reorderBuffer) appendRune(r rune) { 285 bn := rb.nbyte 286 sz := utf8.EncodeRune(rb.byte[bn:], rune(r)) 287 rb.nbyte += utf8.UTFMax 288 rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)} 289 rb.nrune++ 290 } 291 292 // assignRune sets a rune at position pos. It is used for Hangul and recomposition. 293 func (rb *reorderBuffer) assignRune(pos int, r rune) { 294 bn := rb.rune[pos].pos 295 sz := utf8.EncodeRune(rb.byte[bn:], rune(r)) 296 rb.rune[pos] = Properties{pos: bn, size: uint8(sz)} 297 } 298 299 // runeAt returns the rune at position n. It is used for Hangul and recomposition. 300 func (rb *reorderBuffer) runeAt(n int) rune { 301 inf := rb.rune[n] 302 r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size]) 303 return r 304 } 305 306 // bytesAt returns the UTF-8 encoding of the rune at position n. 307 // It is used for Hangul and recomposition. 308 func (rb *reorderBuffer) bytesAt(n int) []byte { 309 inf := rb.rune[n] 310 return rb.byte[inf.pos : int(inf.pos)+int(inf.size)] 311 } 312 313 // For Hangul we combine algorithmically, instead of using tables. 314 const ( 315 hangulBase = 0xAC00 // UTF-8(hangulBase) -> EA B0 80 316 hangulBase0 = 0xEA 317 hangulBase1 = 0xB0 318 hangulBase2 = 0x80 319 320 hangulEnd = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4 321 hangulEnd0 = 0xED 322 hangulEnd1 = 0x9E 323 hangulEnd2 = 0xA4 324 325 jamoLBase = 0x1100 // UTF-8(jamoLBase) -> E1 84 00 326 jamoLBase0 = 0xE1 327 jamoLBase1 = 0x84 328 jamoLEnd = 0x1113 329 jamoVBase = 0x1161 330 jamoVEnd = 0x1176 331 jamoTBase = 0x11A7 332 jamoTEnd = 0x11C3 333 334 jamoTCount = 28 335 jamoVCount = 21 336 jamoVTCount = 21 * 28 337 jamoLVTCount = 19 * 21 * 28 338 ) 339 340 const hangulUTF8Size = 3 341 342 func isHangul(b []byte) bool { 343 if len(b) < hangulUTF8Size { 344 return false 345 } 346 b0 := b[0] 347 if b0 < hangulBase0 { 348 return false 349 } 350 b1 := b[1] 351 switch { 352 case b0 == hangulBase0: 353 return b1 >= hangulBase1 354 case b0 < hangulEnd0: 355 return true 356 case b0 > hangulEnd0: 357 return false 358 case b1 < hangulEnd1: 359 return true 360 } 361 return b1 == hangulEnd1 && b[2] < hangulEnd2 362 } 363 364 func isHangulString(b string) bool { 365 if len(b) < hangulUTF8Size { 366 return false 367 } 368 b0 := b[0] 369 if b0 < hangulBase0 { 370 return false 371 } 372 b1 := b[1] 373 switch { 374 case b0 == hangulBase0: 375 return b1 >= hangulBase1 376 case b0 < hangulEnd0: 377 return true 378 case b0 > hangulEnd0: 379 return false 380 case b1 < hangulEnd1: 381 return true 382 } 383 return b1 == hangulEnd1 && b[2] < hangulEnd2 384 } 385 386 // Caller must ensure len(b) >= 2. 387 func isJamoVT(b []byte) bool { 388 // True if (rune & 0xff00) == jamoLBase 389 return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1 390 } 391 392 func isHangulWithoutJamoT(b []byte) bool { 393 c, _ := utf8.DecodeRune(b) 394 c -= hangulBase 395 return c < jamoLVTCount && c%jamoTCount == 0 396 } 397 398 // decomposeHangul writes the decomposed Hangul to buf and returns the number 399 // of bytes written. len(buf) should be at least 9. 400 func decomposeHangul(buf []byte, r rune) int { 401 const JamoUTF8Len = 3 402 r -= hangulBase 403 x := r % jamoTCount 404 r /= jamoTCount 405 utf8.EncodeRune(buf, jamoLBase+r/jamoVCount) 406 utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount) 407 if x != 0 { 408 utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x) 409 return 3 * JamoUTF8Len 410 } 411 return 2 * JamoUTF8Len 412 } 413 414 // decomposeHangul algorithmically decomposes a Hangul rune into 415 // its Jamo components. 416 // See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul. 417 func (rb *reorderBuffer) decomposeHangul(r rune) { 418 r -= hangulBase 419 x := r % jamoTCount 420 r /= jamoTCount 421 rb.appendRune(jamoLBase + r/jamoVCount) 422 rb.appendRune(jamoVBase + r%jamoVCount) 423 if x != 0 { 424 rb.appendRune(jamoTBase + x) 425 } 426 } 427 428 // combineHangul algorithmically combines Jamo character components into Hangul. 429 // See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul. 430 func (rb *reorderBuffer) combineHangul(s, i, k int) { 431 b := rb.rune[:] 432 bn := rb.nrune 433 for ; i < bn; i++ { 434 cccB := b[k-1].ccc 435 cccC := b[i].ccc 436 if cccB == 0 { 437 s = k - 1 438 } 439 if s != k-1 && cccB >= cccC { 440 // b[i] is blocked by greater-equal cccX below it 441 b[k] = b[i] 442 k++ 443 } else { 444 l := rb.runeAt(s) // also used to compare to hangulBase 445 v := rb.runeAt(i) // also used to compare to jamoT 446 switch { 447 case jamoLBase <= l && l < jamoLEnd && 448 jamoVBase <= v && v < jamoVEnd: 449 // 11xx plus 116x to LV 450 rb.assignRune(s, hangulBase+ 451 (l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount) 452 case hangulBase <= l && l < hangulEnd && 453 jamoTBase < v && v < jamoTEnd && 454 ((l-hangulBase)%jamoTCount) == 0: 455 // ACxx plus 11Ax to LVT 456 rb.assignRune(s, l+v-jamoTBase) 457 default: 458 b[k] = b[i] 459 k++ 460 } 461 } 462 } 463 rb.nrune = k 464 } 465 466 // compose recombines the runes in the buffer. 467 // It should only be used to recompose a single segment, as it will not 468 // handle alternations between Hangul and non-Hangul characters correctly. 469 func (rb *reorderBuffer) compose() { 470 // UAX #15, section X5 , including Corrigendum #5 471 // "In any character sequence beginning with starter S, a character C is 472 // blocked from S if and only if there is some character B between S 473 // and C, and either B is a starter or it has the same or higher 474 // combining class as C." 475 bn := rb.nrune 476 if bn == 0 { 477 return 478 } 479 k := 1 480 b := rb.rune[:] 481 for s, i := 0, 1; i < bn; i++ { 482 if isJamoVT(rb.bytesAt(i)) { 483 // Redo from start in Hangul mode. Necessary to support 484 // U+320E..U+321E in NFKC mode. 485 rb.combineHangul(s, i, k) 486 return 487 } 488 ii := b[i] 489 // We can only use combineForward as a filter if we later 490 // get the info for the combined character. This is more 491 // expensive than using the filter. Using combinesBackward() 492 // is safe. 493 if ii.combinesBackward() { 494 cccB := b[k-1].ccc 495 cccC := ii.ccc 496 blocked := false // b[i] blocked by starter or greater or equal CCC? 497 if cccB == 0 { 498 s = k - 1 499 } else { 500 blocked = s != k-1 && cccB >= cccC 501 } 502 if !blocked { 503 combined := combine(rb.runeAt(s), rb.runeAt(i)) 504 if combined != 0 { 505 rb.assignRune(s, combined) 506 continue 507 } 508 } 509 } 510 b[k] = b[i] 511 k++ 512 } 513 rb.nrune = k 514 }