github.com/icodeface/tls@v0.0.0-20230910023335-34df9250cd12/internal/x/text/unicode/norm/composition.go (about)

     1  // Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
     2  
     3  // Copyright 2011 The Go Authors. All rights reserved.
     4  // Use of this source code is governed by a BSD-style
     5  // license that can be found in the LICENSE file.
     6  
     7  package norm
     8  
     9  import "unicode/utf8"
    10  
    11  const (
    12  	maxNonStarters = 30
    13  	// The maximum number of characters needed for a buffer is
    14  	// maxNonStarters + 1 for the starter + 1 for the GCJ
    15  	maxBufferSize    = maxNonStarters + 2
    16  	maxNFCExpansion  = 3  // NFC(0x1D160)
    17  	maxNFKCExpansion = 18 // NFKC(0xFDFA)
    18  
    19  	maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128
    20  )
    21  
    22  // ssState is used for reporting the segment state after inserting a rune.
    23  // It is returned by streamSafe.next.
    24  type ssState int
    25  
    26  const (
    27  	// Indicates a rune was successfully added to the segment.
    28  	ssSuccess ssState = iota
    29  	// Indicates a rune starts a new segment and should not be added.
    30  	ssStarter
    31  	// Indicates a rune caused a segment overflow and a CGJ should be inserted.
    32  	ssOverflow
    33  )
    34  
    35  // streamSafe implements the policy of when a CGJ should be inserted.
    36  type streamSafe uint8
    37  
    38  // first inserts the first rune of a segment. It is a faster version of next if
    39  // it is known p represents the first rune in a segment.
    40  func (ss *streamSafe) first(p Properties) {
    41  	*ss = streamSafe(p.nTrailingNonStarters())
    42  }
    43  
    44  // insert returns a ssState value to indicate whether a rune represented by p
    45  // can be inserted.
    46  func (ss *streamSafe) next(p Properties) ssState {
    47  	if *ss > maxNonStarters {
    48  		panic("streamSafe was not reset")
    49  	}
    50  	n := p.nLeadingNonStarters()
    51  	if *ss += streamSafe(n); *ss > maxNonStarters {
    52  		*ss = 0
    53  		return ssOverflow
    54  	}
    55  	// The Stream-Safe Text Processing prescribes that the counting can stop
    56  	// as soon as a starter is encountered. However, there are some starters,
    57  	// like Jamo V and T, that can combine with other runes, leaving their
    58  	// successive non-starters appended to the previous, possibly causing an
    59  	// overflow. We will therefore consider any rune with a non-zero nLead to
    60  	// be a non-starter. Note that it always hold that if nLead > 0 then
    61  	// nLead == nTrail.
    62  	if n == 0 {
    63  		*ss = streamSafe(p.nTrailingNonStarters())
    64  		return ssStarter
    65  	}
    66  	return ssSuccess
    67  }
    68  
    69  // backwards is used for checking for overflow and segment starts
    70  // when traversing a string backwards. Users do not need to call first
    71  // for the first rune. The state of the streamSafe retains the count of
    72  // the non-starters loaded.
    73  func (ss *streamSafe) backwards(p Properties) ssState {
    74  	if *ss > maxNonStarters {
    75  		panic("streamSafe was not reset")
    76  	}
    77  	c := *ss + streamSafe(p.nTrailingNonStarters())
    78  	if c > maxNonStarters {
    79  		return ssOverflow
    80  	}
    81  	*ss = c
    82  	if p.nLeadingNonStarters() == 0 {
    83  		return ssStarter
    84  	}
    85  	return ssSuccess
    86  }
    87  
    88  func (ss streamSafe) isMax() bool {
    89  	return ss == maxNonStarters
    90  }
    91  
    92  // GraphemeJoiner is inserted after maxNonStarters non-starter runes.
    93  const GraphemeJoiner = "\u034F"
    94  
    95  // reorderBuffer is used to normalize a single segment.  Characters inserted with
    96  // insert are decomposed and reordered based on CCC. The compose method can
    97  // be used to recombine characters.  Note that the byte buffer does not hold
    98  // the UTF-8 characters in order.  Only the rune array is maintained in sorted
    99  // order. flush writes the resulting segment to a byte array.
   100  type reorderBuffer struct {
   101  	rune  [maxBufferSize]Properties // Per character info.
   102  	byte  [maxByteBufferSize]byte   // UTF-8 buffer. Referenced by runeInfo.pos.
   103  	nbyte uint8                     // Number or bytes.
   104  	ss    streamSafe                // For limiting length of non-starter sequence.
   105  	nrune int                       // Number of runeInfos.
   106  	f     formInfo
   107  
   108  	src      input
   109  	nsrc     int
   110  	tmpBytes input
   111  
   112  	out    []byte
   113  	flushF func(*reorderBuffer) bool
   114  }
   115  
   116  func (rb *reorderBuffer) init(f Form, src []byte) {
   117  	rb.f = *formTable[f]
   118  	rb.src.setBytes(src)
   119  	rb.nsrc = len(src)
   120  	rb.ss = 0
   121  }
   122  
   123  func (rb *reorderBuffer) initString(f Form, src string) {
   124  	rb.f = *formTable[f]
   125  	rb.src.setString(src)
   126  	rb.nsrc = len(src)
   127  	rb.ss = 0
   128  }
   129  
   130  func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) {
   131  	rb.out = out
   132  	rb.flushF = f
   133  }
   134  
   135  // reset discards all characters from the buffer.
   136  func (rb *reorderBuffer) reset() {
   137  	rb.nrune = 0
   138  	rb.nbyte = 0
   139  }
   140  
   141  func (rb *reorderBuffer) doFlush() bool {
   142  	if rb.f.composing {
   143  		rb.compose()
   144  	}
   145  	res := rb.flushF(rb)
   146  	rb.reset()
   147  	return res
   148  }
   149  
   150  // appendFlush appends the normalized segment to rb.out.
   151  func appendFlush(rb *reorderBuffer) bool {
   152  	for i := 0; i < rb.nrune; i++ {
   153  		start := rb.rune[i].pos
   154  		end := start + rb.rune[i].size
   155  		rb.out = append(rb.out, rb.byte[start:end]...)
   156  	}
   157  	return true
   158  }
   159  
   160  // flush appends the normalized segment to out and resets rb.
   161  func (rb *reorderBuffer) flush(out []byte) []byte {
   162  	for i := 0; i < rb.nrune; i++ {
   163  		start := rb.rune[i].pos
   164  		end := start + rb.rune[i].size
   165  		out = append(out, rb.byte[start:end]...)
   166  	}
   167  	rb.reset()
   168  	return out
   169  }
   170  
   171  // flushCopy copies the normalized segment to buf and resets rb.
   172  // It returns the number of bytes written to buf.
   173  func (rb *reorderBuffer) flushCopy(buf []byte) int {
   174  	p := 0
   175  	for i := 0; i < rb.nrune; i++ {
   176  		runep := rb.rune[i]
   177  		p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size])
   178  	}
   179  	rb.reset()
   180  	return p
   181  }
   182  
   183  // insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
   184  // It returns false if the buffer is not large enough to hold the rune.
   185  // It is used internally by insert and insertString only.
   186  func (rb *reorderBuffer) insertOrdered(info Properties) {
   187  	n := rb.nrune
   188  	b := rb.rune[:]
   189  	cc := info.ccc
   190  	if cc > 0 {
   191  		// Find insertion position + move elements to make room.
   192  		for ; n > 0; n-- {
   193  			if b[n-1].ccc <= cc {
   194  				break
   195  			}
   196  			b[n] = b[n-1]
   197  		}
   198  	}
   199  	rb.nrune += 1
   200  	pos := uint8(rb.nbyte)
   201  	rb.nbyte += utf8.UTFMax
   202  	info.pos = pos
   203  	b[n] = info
   204  }
   205  
   206  // insertErr is an error code returned by insert. Using this type instead
   207  // of error improves performance up to 20% for many of the benchmarks.
   208  type insertErr int
   209  
   210  const (
   211  	iSuccess insertErr = -iota
   212  	iShortDst
   213  	iShortSrc
   214  )
   215  
   216  // insertFlush inserts the given rune in the buffer ordered by CCC.
   217  // If a decomposition with multiple segments are encountered, they leading
   218  // ones are flushed.
   219  // It returns a non-zero error code if the rune was not inserted.
   220  func (rb *reorderBuffer) insertFlush(src input, i int, info Properties) insertErr {
   221  	if rune := src.hangul(i); rune != 0 {
   222  		rb.decomposeHangul(rune)
   223  		return iSuccess
   224  	}
   225  	if info.hasDecomposition() {
   226  		return rb.insertDecomposed(info.Decomposition())
   227  	}
   228  	rb.insertSingle(src, i, info)
   229  	return iSuccess
   230  }
   231  
   232  // insertUnsafe inserts the given rune in the buffer ordered by CCC.
   233  // It is assumed there is sufficient space to hold the runes. It is the
   234  // responsibility of the caller to ensure this. This can be done by checking
   235  // the state returned by the streamSafe type.
   236  func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) {
   237  	if rune := src.hangul(i); rune != 0 {
   238  		rb.decomposeHangul(rune)
   239  	}
   240  	if info.hasDecomposition() {
   241  		// TODO: inline.
   242  		rb.insertDecomposed(info.Decomposition())
   243  	} else {
   244  		rb.insertSingle(src, i, info)
   245  	}
   246  }
   247  
   248  // insertDecomposed inserts an entry in to the reorderBuffer for each rune
   249  // in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
   250  // It flushes the buffer on each new segment start.
   251  func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr {
   252  	rb.tmpBytes.setBytes(dcomp)
   253  	// As the streamSafe accounting already handles the counting for modifiers,
   254  	// we don't have to call next. However, we do need to keep the accounting
   255  	// intact when flushing the buffer.
   256  	for i := 0; i < len(dcomp); {
   257  		info := rb.f.info(rb.tmpBytes, i)
   258  		if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() {
   259  			return iShortDst
   260  		}
   261  		i += copy(rb.byte[rb.nbyte:], dcomp[i:i+int(info.size)])
   262  		rb.insertOrdered(info)
   263  	}
   264  	return iSuccess
   265  }
   266  
   267  // insertSingle inserts an entry in the reorderBuffer for the rune at
   268  // position i. info is the runeInfo for the rune at position i.
   269  func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) {
   270  	src.copySlice(rb.byte[rb.nbyte:], i, i+int(info.size))
   271  	rb.insertOrdered(info)
   272  }
   273  
   274  // insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb.
   275  func (rb *reorderBuffer) insertCGJ() {
   276  	rb.insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))})
   277  }
   278  
   279  // appendRune inserts a rune at the end of the buffer. It is used for Hangul.
   280  func (rb *reorderBuffer) appendRune(r rune) {
   281  	bn := rb.nbyte
   282  	sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
   283  	rb.nbyte += utf8.UTFMax
   284  	rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)}
   285  	rb.nrune++
   286  }
   287  
   288  // assignRune sets a rune at position pos. It is used for Hangul and recomposition.
   289  func (rb *reorderBuffer) assignRune(pos int, r rune) {
   290  	bn := rb.rune[pos].pos
   291  	sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
   292  	rb.rune[pos] = Properties{pos: bn, size: uint8(sz)}
   293  }
   294  
   295  // runeAt returns the rune at position n. It is used for Hangul and recomposition.
   296  func (rb *reorderBuffer) runeAt(n int) rune {
   297  	inf := rb.rune[n]
   298  	r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size])
   299  	return r
   300  }
   301  
   302  // bytesAt returns the UTF-8 encoding of the rune at position n.
   303  // It is used for Hangul and recomposition.
   304  func (rb *reorderBuffer) bytesAt(n int) []byte {
   305  	inf := rb.rune[n]
   306  	return rb.byte[inf.pos : int(inf.pos)+int(inf.size)]
   307  }
   308  
   309  // For Hangul we combine algorithmically, instead of using tables.
   310  const (
   311  	hangulBase  = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
   312  	hangulBase0 = 0xEA
   313  	hangulBase1 = 0xB0
   314  	hangulBase2 = 0x80
   315  
   316  	hangulEnd  = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
   317  	hangulEnd0 = 0xED
   318  	hangulEnd1 = 0x9E
   319  	hangulEnd2 = 0xA4
   320  
   321  	jamoLBase  = 0x1100 // UTF-8(jamoLBase) -> E1 84 00
   322  	jamoLBase0 = 0xE1
   323  	jamoLBase1 = 0x84
   324  	jamoLEnd   = 0x1113
   325  	jamoVBase  = 0x1161
   326  	jamoVEnd   = 0x1176
   327  	jamoTBase  = 0x11A7
   328  	jamoTEnd   = 0x11C3
   329  
   330  	jamoTCount   = 28
   331  	jamoVCount   = 21
   332  	jamoVTCount  = 21 * 28
   333  	jamoLVTCount = 19 * 21 * 28
   334  )
   335  
   336  const hangulUTF8Size = 3
   337  
   338  func isHangul(b []byte) bool {
   339  	if len(b) < hangulUTF8Size {
   340  		return false
   341  	}
   342  	b0 := b[0]
   343  	if b0 < hangulBase0 {
   344  		return false
   345  	}
   346  	b1 := b[1]
   347  	switch {
   348  	case b0 == hangulBase0:
   349  		return b1 >= hangulBase1
   350  	case b0 < hangulEnd0:
   351  		return true
   352  	case b0 > hangulEnd0:
   353  		return false
   354  	case b1 < hangulEnd1:
   355  		return true
   356  	}
   357  	return b1 == hangulEnd1 && b[2] < hangulEnd2
   358  }
   359  
   360  func isHangulString(b string) bool {
   361  	if len(b) < hangulUTF8Size {
   362  		return false
   363  	}
   364  	b0 := b[0]
   365  	if b0 < hangulBase0 {
   366  		return false
   367  	}
   368  	b1 := b[1]
   369  	switch {
   370  	case b0 == hangulBase0:
   371  		return b1 >= hangulBase1
   372  	case b0 < hangulEnd0:
   373  		return true
   374  	case b0 > hangulEnd0:
   375  		return false
   376  	case b1 < hangulEnd1:
   377  		return true
   378  	}
   379  	return b1 == hangulEnd1 && b[2] < hangulEnd2
   380  }
   381  
   382  // Caller must ensure len(b) >= 2.
   383  func isJamoVT(b []byte) bool {
   384  	// True if (rune & 0xff00) == jamoLBase
   385  	return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1
   386  }
   387  
   388  func isHangulWithoutJamoT(b []byte) bool {
   389  	c, _ := utf8.DecodeRune(b)
   390  	c -= hangulBase
   391  	return c < jamoLVTCount && c%jamoTCount == 0
   392  }
   393  
   394  // decomposeHangul writes the decomposed Hangul to buf and returns the number
   395  // of bytes written.  len(buf) should be at least 9.
   396  func decomposeHangul(buf []byte, r rune) int {
   397  	const JamoUTF8Len = 3
   398  	r -= hangulBase
   399  	x := r % jamoTCount
   400  	r /= jamoTCount
   401  	utf8.EncodeRune(buf, jamoLBase+r/jamoVCount)
   402  	utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount)
   403  	if x != 0 {
   404  		utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x)
   405  		return 3 * JamoUTF8Len
   406  	}
   407  	return 2 * JamoUTF8Len
   408  }
   409  
   410  // decomposeHangul algorithmically decomposes a Hangul rune into
   411  // its Jamo components.
   412  // See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
   413  func (rb *reorderBuffer) decomposeHangul(r rune) {
   414  	r -= hangulBase
   415  	x := r % jamoTCount
   416  	r /= jamoTCount
   417  	rb.appendRune(jamoLBase + r/jamoVCount)
   418  	rb.appendRune(jamoVBase + r%jamoVCount)
   419  	if x != 0 {
   420  		rb.appendRune(jamoTBase + x)
   421  	}
   422  }
   423  
   424  // combineHangul algorithmically combines Jamo character components into Hangul.
   425  // See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
   426  func (rb *reorderBuffer) combineHangul(s, i, k int) {
   427  	b := rb.rune[:]
   428  	bn := rb.nrune
   429  	for ; i < bn; i++ {
   430  		cccB := b[k-1].ccc
   431  		cccC := b[i].ccc
   432  		if cccB == 0 {
   433  			s = k - 1
   434  		}
   435  		if s != k-1 && cccB >= cccC {
   436  			// b[i] is blocked by greater-equal cccX below it
   437  			b[k] = b[i]
   438  			k++
   439  		} else {
   440  			l := rb.runeAt(s) // also used to compare to hangulBase
   441  			v := rb.runeAt(i) // also used to compare to jamoT
   442  			switch {
   443  			case jamoLBase <= l && l < jamoLEnd &&
   444  				jamoVBase <= v && v < jamoVEnd:
   445  				// 11xx plus 116x to LV
   446  				rb.assignRune(s, hangulBase+
   447  					(l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount)
   448  			case hangulBase <= l && l < hangulEnd &&
   449  				jamoTBase < v && v < jamoTEnd &&
   450  				((l-hangulBase)%jamoTCount) == 0:
   451  				// ACxx plus 11Ax to LVT
   452  				rb.assignRune(s, l+v-jamoTBase)
   453  			default:
   454  				b[k] = b[i]
   455  				k++
   456  			}
   457  		}
   458  	}
   459  	rb.nrune = k
   460  }
   461  
   462  // compose recombines the runes in the buffer.
   463  // It should only be used to recompose a single segment, as it will not
   464  // handle alternations between Hangul and non-Hangul characters correctly.
   465  func (rb *reorderBuffer) compose() {
   466  	// UAX #15, section X5 , including Corrigendum #5
   467  	// "In any character sequence beginning with starter S, a character C is
   468  	//  blocked from S if and only if there is some character B between S
   469  	//  and C, and either B is a starter or it has the same or higher
   470  	//  combining class as C."
   471  	bn := rb.nrune
   472  	if bn == 0 {
   473  		return
   474  	}
   475  	k := 1
   476  	b := rb.rune[:]
   477  	for s, i := 0, 1; i < bn; i++ {
   478  		if isJamoVT(rb.bytesAt(i)) {
   479  			// Redo from start in Hangul mode. Necessary to support
   480  			// U+320E..U+321E in NFKC mode.
   481  			rb.combineHangul(s, i, k)
   482  			return
   483  		}
   484  		ii := b[i]
   485  		// We can only use combineForward as a filter if we later
   486  		// get the info for the combined character. This is more
   487  		// expensive than using the filter. Using combinesBackward()
   488  		// is safe.
   489  		if ii.combinesBackward() {
   490  			cccB := b[k-1].ccc
   491  			cccC := ii.ccc
   492  			blocked := false // b[i] blocked by starter or greater or equal CCC?
   493  			if cccB == 0 {
   494  				s = k - 1
   495  			} else {
   496  				blocked = s != k-1 && cccB >= cccC
   497  			}
   498  			if !blocked {
   499  				combined := combine(rb.runeAt(s), rb.runeAt(i))
   500  				if combined != 0 {
   501  					rb.assignRune(s, combined)
   502  					continue
   503  				}
   504  			}
   505  		}
   506  		b[k] = b[i]
   507  		k++
   508  	}
   509  	rb.nrune = k
   510  }