github.com/zxy12/go_duplicate_112_new@v0.0.0-20200807091221-747231827200/src/internal/x/text/unicode/norm/iter.go (about)

     1  // Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
     2  
     3  // Copyright 2011 The Go Authors. All rights reserved.
     4  // Use of this source code is governed by a BSD-style
     5  // license that can be found in the LICENSE file.
     6  
     7  package norm
     8  
     9  import (
    10  	"fmt"
    11  	"unicode/utf8"
    12  )
    13  
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
// It has the same value as the package-internal maxByteBufferSize, which is
// also the length of the scratch buffer embedded in Iter.
const MaxSegmentSize = maxByteBufferSize
    17  
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
//
// An Iter must be prepared with Init or InitString before Next is called.
// Segments returned by Next may point into the Iter's internal buffer, which
// later calls overwrite.
type Iter struct {
	// rb holds the input source (src, nsrc), the form-specific functions (f)
	// and the stream-safe state (ss); it is also the buffer used when a
	// segment has to be reordered or composed.
	rb     reorderBuffer
	// buf is the scratch space that returned segments are written to when
	// they cannot be returned as a slice of the input.
	buf    [maxByteBufferSize]byte
	info   Properties // first character saved from previous iteration
	next   iterFunc   // implementation of next depends on form
	asciiF iterFunc   // ASCII fast path for the source type: nextASCIIBytes or nextASCIIString

	p        int    // current position in input source
	multiSeg []byte // remainder of multi-segment decomposition
}
    30  
// iterFunc is one step of the iteration: it returns the next segment for the
// given Iter and updates the Iter's state, including which iterFunc should
// handle the following call to Next.
type iterFunc func(*Iter) []byte
    32  
    33  // Init initializes i to iterate over src after normalizing it to Form f.
    34  func (i *Iter) Init(f Form, src []byte) {
    35  	i.p = 0
    36  	if len(src) == 0 {
    37  		i.setDone()
    38  		i.rb.nsrc = 0
    39  		return
    40  	}
    41  	i.multiSeg = nil
    42  	i.rb.init(f, src)
    43  	i.next = i.rb.f.nextMain
    44  	i.asciiF = nextASCIIBytes
    45  	i.info = i.rb.f.info(i.rb.src, i.p)
    46  	i.rb.ss.first(i.info)
    47  }
    48  
    49  // InitString initializes i to iterate over src after normalizing it to Form f.
    50  func (i *Iter) InitString(f Form, src string) {
    51  	i.p = 0
    52  	if len(src) == 0 {
    53  		i.setDone()
    54  		i.rb.nsrc = 0
    55  		return
    56  	}
    57  	i.multiSeg = nil
    58  	i.rb.initString(f, src)
    59  	i.next = i.rb.f.nextMain
    60  	i.asciiF = nextASCIIString
    61  	i.info = i.rb.f.info(i.rb.src, i.p)
    62  	i.rb.ss.first(i.info)
    63  }
    64  
    65  // Seek sets the segment to be returned by the next call to Next to start
    66  // at position p.  It is the responsibility of the caller to set p to the
    67  // start of a segment.
    68  func (i *Iter) Seek(offset int64, whence int) (int64, error) {
    69  	var abs int64
    70  	switch whence {
    71  	case 0:
    72  		abs = offset
    73  	case 1:
    74  		abs = int64(i.p) + offset
    75  	case 2:
    76  		abs = int64(i.rb.nsrc) + offset
    77  	default:
    78  		return 0, fmt.Errorf("norm: invalid whence")
    79  	}
    80  	if abs < 0 {
    81  		return 0, fmt.Errorf("norm: negative position")
    82  	}
    83  	if int(abs) >= i.rb.nsrc {
    84  		i.setDone()
    85  		return int64(i.p), nil
    86  	}
    87  	i.p = int(abs)
    88  	i.multiSeg = nil
    89  	i.next = i.rb.f.nextMain
    90  	i.info = i.rb.f.info(i.rb.src, i.p)
    91  	i.rb.ss.first(i.info)
    92  	return abs, nil
    93  }
    94  
    95  // returnSlice returns a slice of the underlying input type as a byte slice.
    96  // If the underlying is of type []byte, it will simply return a slice.
    97  // If the underlying is of type string, it will copy the slice to the buffer
    98  // and return that.
    99  func (i *Iter) returnSlice(a, b int) []byte {
   100  	if i.rb.src.bytes == nil {
   101  		return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
   102  	}
   103  	return i.rb.src.bytes[a:b]
   104  }
   105  
   106  // Pos returns the byte position at which the next call to Next will commence processing.
   107  func (i *Iter) Pos() int {
   108  	return i.p
   109  }
   110  
   111  func (i *Iter) setDone() {
   112  	i.next = nextDone
   113  	i.p = i.rb.nsrc
   114  }
   115  
   116  // Done returns true if there is no more input to process.
   117  func (i *Iter) Done() bool {
   118  	return i.p >= i.rb.nsrc
   119  }
   120  
   121  // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
   122  // For any input a and b for which f(a) == f(b), subsequent calls
   123  // to Next will return the same segments.
   124  // Modifying runes are grouped together with the preceding starter, if such a starter exists.
   125  // Although not guaranteed, n will typically be the smallest possible n.
   126  func (i *Iter) Next() []byte {
   127  	return i.next(i)
   128  }
   129  
   130  func nextASCIIBytes(i *Iter) []byte {
   131  	p := i.p + 1
   132  	if p >= i.rb.nsrc {
   133  		i.setDone()
   134  		return i.rb.src.bytes[i.p:p]
   135  	}
   136  	if i.rb.src.bytes[p] < utf8.RuneSelf {
   137  		p0 := i.p
   138  		i.p = p
   139  		return i.rb.src.bytes[p0:p]
   140  	}
   141  	i.info = i.rb.f.info(i.rb.src, i.p)
   142  	i.next = i.rb.f.nextMain
   143  	return i.next(i)
   144  }
   145  
   146  func nextASCIIString(i *Iter) []byte {
   147  	p := i.p + 1
   148  	if p >= i.rb.nsrc {
   149  		i.buf[0] = i.rb.src.str[i.p]
   150  		i.setDone()
   151  		return i.buf[:1]
   152  	}
   153  	if i.rb.src.str[p] < utf8.RuneSelf {
   154  		i.buf[0] = i.rb.src.str[i.p]
   155  		i.p = p
   156  		return i.buf[:1]
   157  	}
   158  	i.info = i.rb.f.info(i.rb.src, i.p)
   159  	i.next = i.rb.f.nextMain
   160  	return i.next(i)
   161  }
   162  
   163  func nextHangul(i *Iter) []byte {
   164  	p := i.p
   165  	next := p + hangulUTF8Size
   166  	if next >= i.rb.nsrc {
   167  		i.setDone()
   168  	} else if i.rb.src.hangul(next) == 0 {
   169  		i.rb.ss.next(i.info)
   170  		i.info = i.rb.f.info(i.rb.src, i.p)
   171  		i.next = i.rb.f.nextMain
   172  		return i.next(i)
   173  	}
   174  	i.p = next
   175  	return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
   176  }
   177  
   178  func nextDone(i *Iter) []byte {
   179  	return nil
   180  }
   181  
   182  // nextMulti is used for iterating over multi-segment decompositions
   183  // for decomposing normal forms.
   184  func nextMulti(i *Iter) []byte {
   185  	j := 0
   186  	d := i.multiSeg
   187  	// skip first rune
   188  	for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
   189  	}
   190  	for j < len(d) {
   191  		info := i.rb.f.info(input{bytes: d}, j)
   192  		if info.BoundaryBefore() {
   193  			i.multiSeg = d[j:]
   194  			return d[:j]
   195  		}
   196  		j += int(info.size)
   197  	}
   198  	// treat last segment as normal decomposition
   199  	i.next = i.rb.f.nextMain
   200  	return i.next(i)
   201  }
   202  
   203  // nextMultiNorm is used for iterating over multi-segment decompositions
   204  // for composing normal forms.
   205  func nextMultiNorm(i *Iter) []byte {
   206  	j := 0
   207  	d := i.multiSeg
   208  	for j < len(d) {
   209  		info := i.rb.f.info(input{bytes: d}, j)
   210  		if info.BoundaryBefore() {
   211  			i.rb.compose()
   212  			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
   213  			i.rb.insertUnsafe(input{bytes: d}, j, info)
   214  			i.multiSeg = d[j+int(info.size):]
   215  			return seg
   216  		}
   217  		i.rb.insertUnsafe(input{bytes: d}, j, info)
   218  		j += int(info.size)
   219  	}
   220  	i.multiSeg = nil
   221  	i.next = nextComposed
   222  	return doNormComposed(i)
   223  }
   224  
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
//
// It scans runes starting at i.p until the stream-safe state reports a new
// starter or an overflow, or the input ends. While nothing needs rewriting,
// the segment is tracked only as a range of the input; decomposed output is
// assembled in i.buf. If a rune's ccc is lower than the tccc of the rune
// before it, the runes gathered so far are handed to the reorder buffer and
// doNormDecomposed finishes the segment.
func nextDecomposed(i *Iter) (next []byte) {
	// outp is the length of the output produced so far. Input bytes
	// [inCopyStart, i.p) belong at i.buf[outCopyStart:] but have not been
	// copied there yet.
	outp := 0
	inCopyStart, outCopyStart := i.p, 0
	for {
		if sz := int(i.info.size); sz <= 1 {
			i.rb.ss = 0
			p := i.p
			i.p++ // ASCII or illegal byte.  Either way, advance by 1.
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				// Next byte is ASCII as well: continue on the fast path.
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
			// Case 1: there is a leftover to copy.  In this case the decomposition
			// must begin with a modifier and should always be appended.
			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
			p := outp + len(d)
			if outp > 0 {
				// Bring the pending input bytes into i.buf before d is appended.
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				// TODO: this condition should not be possible, but we leave it
				// in for defensive purposes.
				if p > len(i.buf) {
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// outp must be 0 as multi-segment decompositions always
				// start a new segment.
				if i.multiSeg == nil {
					// First visit: let nextMulti hand out the leading segments.
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// We are in the last segment.  Treat as normal decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				i.info = Properties{} // Force BoundaryBefore to succeed.
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			switch i.rb.ss.next(i.info) {
			case ssOverflow:
				// The next call has to go through nextCGJDecompose; the
				// current segment ends here just as it does for a starter.
				i.next = nextCGJDecompose
				fallthrough
			case ssStarter:
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				// Nothing precedes d, so it can be returned without copying.
				return d
			}
			// The segment continues: append d and restart the pending range.
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			// Hangul syllable: write its decomposition at the start of i.buf.
			outp = decomposeHangul(i.buf[:], r)
			i.p += hangulUTF8Size
			inCopyStart, outCopyStart = i.p, outp
			if i.p >= i.rb.nsrc {
				i.setDone()
				break
			} else if i.rb.src.hangul(i.p) != 0 {
				// Another Hangul syllable follows: let nextHangul handle it.
				i.next = nextHangul
				return i.buf[:outp]
			}
		} else {
			// Rune without a decomposition: extend the pending input range,
			// unless the result would no longer fit in i.buf.
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	if outCopyStart == 0 {
		// Nothing was written to i.buf: the segment is a plain range of the input.
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		// Append the pending input bytes after what is already in i.buf.
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	i.rb.insertDecomposed(i.buf[0:outp])
	return doNormDecomposed(i)
}
   341  
   342  func doNormDecomposed(i *Iter) []byte {
   343  	for {
   344  		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
   345  		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
   346  			i.setDone()
   347  			break
   348  		}
   349  		i.info = i.rb.f.info(i.rb.src, i.p)
   350  		if i.info.ccc == 0 {
   351  			break
   352  		}
   353  		if s := i.rb.ss.next(i.info); s == ssOverflow {
   354  			i.next = nextCGJDecompose
   355  			break
   356  		}
   357  	}
   358  	// new segment or too many combining characters: exit normalization
   359  	return i.buf[:i.rb.flushCopy(i.buf[:])]
   360  }
   361  
   362  func nextCGJDecompose(i *Iter) []byte {
   363  	i.rb.ss = 0
   364  	i.rb.insertCGJ()
   365  	i.next = nextDecomposed
   366  	i.rb.ss.first(i.info)
   367  	buf := doNormDecomposed(i)
   368  	return buf
   369  }
   370  
   371  // nextComposed is the implementation of Next for forms NFC and NFKC.
   372  func nextComposed(i *Iter) []byte {
   373  	outp, startp := 0, i.p
   374  	var prevCC uint8
   375  	for {
   376  		if !i.info.isYesC() {
   377  			goto doNorm
   378  		}
   379  		prevCC = i.info.tccc
   380  		sz := int(i.info.size)
   381  		if sz == 0 {
   382  			sz = 1 // illegal rune: copy byte-by-byte
   383  		}
   384  		p := outp + sz
   385  		if p > len(i.buf) {
   386  			break
   387  		}
   388  		outp = p
   389  		i.p += sz
   390  		if i.p >= i.rb.nsrc {
   391  			i.setDone()
   392  			break
   393  		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
   394  			i.rb.ss = 0
   395  			i.next = i.asciiF
   396  			break
   397  		}
   398  		i.info = i.rb.f.info(i.rb.src, i.p)
   399  		if v := i.rb.ss.next(i.info); v == ssStarter {
   400  			break
   401  		} else if v == ssOverflow {
   402  			i.next = nextCGJCompose
   403  			break
   404  		}
   405  		if i.info.ccc < prevCC {
   406  			goto doNorm
   407  		}
   408  	}
   409  	return i.returnSlice(startp, i.p)
   410  doNorm:
   411  	// reset to start position
   412  	i.p = startp
   413  	i.info = i.rb.f.info(i.rb.src, i.p)
   414  	i.rb.ss.first(i.info)
   415  	if i.info.multiSegment() {
   416  		d := i.info.Decomposition()
   417  		info := i.rb.f.info(input{bytes: d}, 0)
   418  		i.rb.insertUnsafe(input{bytes: d}, 0, info)
   419  		i.multiSeg = d[int(info.size):]
   420  		i.next = nextMultiNorm
   421  		return nextMultiNorm(i)
   422  	}
   423  	i.rb.ss.first(i.info)
   424  	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
   425  	return doNormComposed(i)
   426  }
   427  
   428  func doNormComposed(i *Iter) []byte {
   429  	// First rune should already be inserted.
   430  	for {
   431  		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
   432  			i.setDone()
   433  			break
   434  		}
   435  		i.info = i.rb.f.info(i.rb.src, i.p)
   436  		if s := i.rb.ss.next(i.info); s == ssStarter {
   437  			break
   438  		} else if s == ssOverflow {
   439  			i.next = nextCGJCompose
   440  			break
   441  		}
   442  		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
   443  	}
   444  	i.rb.compose()
   445  	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
   446  	return seg
   447  }
   448  
   449  func nextCGJCompose(i *Iter) []byte {
   450  	i.rb.ss = 0 // instead of first
   451  	i.rb.insertCGJ()
   452  	i.next = nextComposed
   453  	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
   454  	// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
   455  	// If we ever change that, insert a check here.
   456  	i.rb.ss.first(i.info)
   457  	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
   458  	return doNormComposed(i)
   459  }