github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/unicode/norm/iter.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package norm
     6  
     7  import (
     8  	"fmt"
     9  	"unicode/utf8"
    10  )
    11  
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
// It is an exported alias for the package-internal maxByteBufferSize, which is
// also the size of the scratch buffer embedded in Iter.
const MaxSegmentSize = maxByteBufferSize
    15  
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
//
// An Iter must be initialized with Init or InitString before use. Slices
// returned by Next may point into the Iter's internal buffer.
type Iter struct {
	rb     reorderBuffer           // reorder buffer; also carries the input (rb.src), its length (rb.nsrc) and the form (rb.f)
	buf    [maxByteBufferSize]byte // scratch space for segments that are not returned as direct slices of the input
	info   Properties              // first character saved from previous iteration
	next   iterFunc                // implementation of next depends on form
	asciiF iterFunc                // ASCII fast path for the input type: nextASCIIBytes or nextASCIIString

	p        int    // current position in input source
	multiSeg []byte // remainder of multi-segment decomposition
}
    28  
// iterFunc is the signature of the state functions stored in Iter.next and
// Iter.asciiF. Each call returns the next segment and may replace i.next to
// select the state used by the following call.
type iterFunc func(*Iter) []byte
    30  
    31  // Init initializes i to iterate over src after normalizing it to Form f.
    32  func (i *Iter) Init(f Form, src []byte) {
    33  	i.p = 0
    34  	if len(src) == 0 {
    35  		i.setDone()
    36  		i.rb.nsrc = 0
    37  		return
    38  	}
    39  	i.multiSeg = nil
    40  	i.rb.init(f, src)
    41  	i.next = i.rb.f.nextMain
    42  	i.asciiF = nextASCIIBytes
    43  	i.info = i.rb.f.info(i.rb.src, i.p)
    44  }
    45  
    46  // InitString initializes i to iterate over src after normalizing it to Form f.
    47  func (i *Iter) InitString(f Form, src string) {
    48  	i.p = 0
    49  	if len(src) == 0 {
    50  		i.setDone()
    51  		i.rb.nsrc = 0
    52  		return
    53  	}
    54  	i.multiSeg = nil
    55  	i.rb.initString(f, src)
    56  	i.next = i.rb.f.nextMain
    57  	i.asciiF = nextASCIIString
    58  	i.info = i.rb.f.info(i.rb.src, i.p)
    59  }
    60  
    61  // Seek sets the segment to be returned by the next call to Next to start
    62  // at position p.  It is the responsibility of the caller to set p to the
    63  // start of a UTF8 rune.
    64  func (i *Iter) Seek(offset int64, whence int) (int64, error) {
    65  	var abs int64
    66  	switch whence {
    67  	case 0:
    68  		abs = offset
    69  	case 1:
    70  		abs = int64(i.p) + offset
    71  	case 2:
    72  		abs = int64(i.rb.nsrc) + offset
    73  	default:
    74  		return 0, fmt.Errorf("norm: invalid whence")
    75  	}
    76  	if abs < 0 {
    77  		return 0, fmt.Errorf("norm: negative position")
    78  	}
    79  	if int(abs) >= i.rb.nsrc {
    80  		i.setDone()
    81  		return int64(i.p), nil
    82  	}
    83  	i.p = int(abs)
    84  	i.multiSeg = nil
    85  	i.next = i.rb.f.nextMain
    86  	i.info = i.rb.f.info(i.rb.src, i.p)
    87  	return abs, nil
    88  }
    89  
    90  // returnSlice returns a slice of the underlying input type as a byte slice.
    91  // If the underlying is of type []byte, it will simply return a slice.
    92  // If the underlying is of type string, it will copy the slice to the buffer
    93  // and return that.
    94  func (i *Iter) returnSlice(a, b int) []byte {
    95  	if i.rb.src.bytes == nil {
    96  		return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
    97  	}
    98  	return i.rb.src.bytes[a:b]
    99  }
   100  
// Pos returns the byte position at which the next call to Next will commence processing.
// The position is an offset into the input passed to Init or InitString.
func (i *Iter) Pos() int {
	return i.p
}
   105  
   106  func (i *Iter) setDone() {
   107  	i.next = nextDone
   108  	i.p = i.rb.nsrc
   109  }
   110  
   111  // Done returns true if there is no more input to process.
   112  func (i *Iter) Done() bool {
   113  	return i.p >= i.rb.nsrc
   114  }
   115  
   116  // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
   117  // For any input a and b for which f(a) == f(b), subsequent calls
   118  // to Next will return the same segments.
   119  // Modifying runes are grouped together with the preceding starter, if such a starter exists.
   120  // Although not guaranteed, n will typically be the smallest possible n.
   121  func (i *Iter) Next() []byte {
   122  	return i.next(i)
   123  }
   124  
   125  func nextASCIIBytes(i *Iter) []byte {
   126  	p := i.p + 1
   127  	if p >= i.rb.nsrc {
   128  		i.setDone()
   129  		return i.rb.src.bytes[i.p:p]
   130  	}
   131  	if i.rb.src.bytes[p] < utf8.RuneSelf {
   132  		p0 := i.p
   133  		i.p = p
   134  		return i.rb.src.bytes[p0:p]
   135  	}
   136  	i.info = i.rb.f.info(i.rb.src, i.p)
   137  	i.next = i.rb.f.nextMain
   138  	return i.next(i)
   139  }
   140  
   141  func nextASCIIString(i *Iter) []byte {
   142  	p := i.p + 1
   143  	if p >= i.rb.nsrc {
   144  		i.buf[0] = i.rb.src.str[i.p]
   145  		i.setDone()
   146  		return i.buf[:1]
   147  	}
   148  	if i.rb.src.str[p] < utf8.RuneSelf {
   149  		i.buf[0] = i.rb.src.str[i.p]
   150  		i.p = p
   151  		return i.buf[:1]
   152  	}
   153  	i.info = i.rb.f.info(i.rb.src, i.p)
   154  	i.next = i.rb.f.nextMain
   155  	return i.next(i)
   156  }
   157  
   158  func nextHangul(i *Iter) []byte {
   159  	p := i.p
   160  	next := p + hangulUTF8Size
   161  	if next >= i.rb.nsrc {
   162  		i.setDone()
   163  	} else if i.rb.src.hangul(next) == 0 {
   164  		i.info = i.rb.f.info(i.rb.src, i.p)
   165  		i.next = i.rb.f.nextMain
   166  		return i.next(i)
   167  	}
   168  	i.p = next
   169  	return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
   170  }
   171  
   172  func nextDone(i *Iter) []byte {
   173  	return nil
   174  }
   175  
   176  // nextMulti is used for iterating over multi-segment decompositions
   177  // for decomposing normal forms.
   178  func nextMulti(i *Iter) []byte {
   179  	j := 0
   180  	d := i.multiSeg
   181  	// skip first rune
   182  	for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
   183  	}
   184  	for j < len(d) {
   185  		info := i.rb.f.info(input{bytes: d}, j)
   186  		if info.BoundaryBefore() {
   187  			i.multiSeg = d[j:]
   188  			return d[:j]
   189  		}
   190  		j += int(info.size)
   191  	}
   192  	// treat last segment as normal decomposition
   193  	i.next = i.rb.f.nextMain
   194  	return i.next(i)
   195  }
   196  
   197  // nextMultiNorm is used for iterating over multi-segment decompositions
   198  // for composing normal forms.
   199  func nextMultiNorm(i *Iter) []byte {
   200  	j := 0
   201  	d := i.multiSeg
   202  	for j < len(d) {
   203  		info := i.rb.f.info(input{bytes: d}, j)
   204  		if info.BoundaryBefore() {
   205  			i.rb.compose()
   206  			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
   207  			i.rb.ss.first(info)
   208  			i.rb.insertUnsafe(input{bytes: d}, j, info)
   209  			i.multiSeg = d[j+int(info.size):]
   210  			return seg
   211  		}
   212  		i.rb.ss.next(info)
   213  		i.rb.insertUnsafe(input{bytes: d}, j, info)
   214  		j += int(info.size)
   215  	}
   216  	i.multiSeg = nil
   217  	i.next = nextComposed
   218  	return doNormComposed(i)
   219  }
   220  
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
//
// It walks the input from i.p, with i.info holding the properties of the rune
// at i.p, and returns the next segment. Depending on the path taken the
// result comes from returnSlice, is a decomposition slice, or is a prefix of
// i.buf. When a rune with a lower ccc than the preceding tccc is encountered,
// the bytes gathered so far are inserted into the reorder buffer and
// doNormDecomposed completes the segment.
func nextDecomposed(i *Iter) (next []byte) {
	// outp counts the output bytes accounted for so far.
	outp := 0
	// inCopyStart marks the start of the pending run of input bytes that has
	// not been copied to i.buf yet; outCopyStart is its destination in i.buf.
	inCopyStart, outCopyStart := i.p, 0
	ss := mkStreamSafe(i.info)
	for {
		if sz := int(i.info.size); sz <= 1 {
			p := i.p
			i.p++ // ASCII or illegal byte.  Either way, advance by 1.
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				// The next byte is ASCII as well: switch to the ASCII fast path.
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
			// Case 1: there is a leftover to copy.  In this case the decomposition
			// must begin with a modifier and should always be appended.
			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
			p := outp + len(d)
			if outp > 0 {
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				if p > len(i.buf) {
					// The decomposition does not fit: return what we have.
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// outp must be 0 as multi-segment decompositions always
				// start a new segment.
				if i.multiSeg == nil {
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// We are in the last segment.  Treat as normal decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				i.info = Properties{} // Force BoundaryBefore to succeed.
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			switch ss.next(i.info) {
			case ssOverflow:
				i.next = nextCGJDecompose
				fallthrough
			case ssStarter:
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				// Nothing was buffered before d, so d itself is the segment.
				return d
			}
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				// Combining classes are out of order: reordering is needed.
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			outp = decomposeHangul(i.buf[:], r)
			i.p += hangulUTF8Size
			inCopyStart, outCopyStart = i.p, outp
			if i.p >= i.rb.nsrc {
				i.setDone()
				break
			} else if i.rb.src.hangul(i.p) != 0 {
				// Another Hangul syllable follows: let nextHangul handle it.
				i.next = nextHangul
				return i.buf[:outp]
			}
		} else {
			// No decomposition: the rune is accounted for as-is.
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	if outCopyStart == 0 {
		// Nothing was written to i.buf: the segment is the pending input run.
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	i.rb.insertDecomposed(i.buf[0:outp])
	return doNormDecomposed(i)
}
   335  
   336  func doNormDecomposed(i *Iter) []byte {
   337  	for {
   338  		if s := i.rb.ss.next(i.info); s == ssOverflow {
   339  			i.next = nextCGJDecompose
   340  			break
   341  		}
   342  		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
   343  		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
   344  			i.setDone()
   345  			break
   346  		}
   347  		i.info = i.rb.f.info(i.rb.src, i.p)
   348  		if i.info.ccc == 0 {
   349  			break
   350  		}
   351  	}
   352  	// new segment or too many combining characters: exit normalization
   353  	return i.buf[:i.rb.flushCopy(i.buf[:])]
   354  }
   355  
   356  func nextCGJDecompose(i *Iter) []byte {
   357  	i.rb.ss = 0
   358  	i.rb.insertCGJ()
   359  	i.next = nextDecomposed
   360  	buf := doNormDecomposed(i)
   361  	return buf
   362  }
   363  
// nextComposed is the implementation of Next for forms NFC and NFKC.
//
// The fast path scans runes for which isYesC holds and returns them through
// returnSlice without touching the reorder buffer. As soon as a rune fails
// isYesC, or its ccc is lower than the preceding tccc, the scan is abandoned
// and the segment is re-processed from its start through the reorder buffer.
func nextComposed(i *Iter) []byte {
	// outp counts the bytes scanned so far; startp is where this segment begins.
	outp, startp := 0, i.p
	var prevCC uint8
	ss := mkStreamSafe(i.info)
	for {
		if !i.info.isYesC() {
			goto doNorm
		}
		prevCC = i.info.tccc
		sz := int(i.info.size)
		if sz == 0 {
			sz = 1 // illegal rune: copy byte-by-byte
		}
		p := outp + sz
		if p > len(i.buf) {
			// The segment would exceed the buffer size: stop here.
			break
		}
		outp = p
		i.p += sz
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
			// The next byte is ASCII: switch to the ASCII fast path.
			i.next = i.asciiF
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJCompose
			break
		}
		if i.info.ccc < prevCC {
			// Combining classes are out of order: reordering is needed.
			goto doNorm
		}
	}
	return i.returnSlice(startp, i.p)
doNorm:
	// Restart from the beginning of the segment and go through the reorder
	// buffer instead.
	i.p = startp
	i.info = i.rb.f.info(i.rb.src, i.p)
	if i.info.multiSegment() {
		// Seed the reorder buffer with the first rune of the decomposition
		// and let nextMultiNorm iterate over the remainder.
		d := i.info.Decomposition()
		info := i.rb.f.info(input{bytes: d}, 0)
		i.rb.insertUnsafe(input{bytes: d}, 0, info)
		i.multiSeg = d[int(info.size):]
		i.next = nextMultiNorm
		return nextMultiNorm(i)
	}
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}
   418  
   419  func doNormComposed(i *Iter) []byte {
   420  	// First rune should already be inserted.
   421  	for {
   422  		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
   423  			i.setDone()
   424  			break
   425  		}
   426  		i.info = i.rb.f.info(i.rb.src, i.p)
   427  		if s := i.rb.ss.next(i.info); s == ssStarter {
   428  			break
   429  		} else if s == ssOverflow {
   430  			i.next = nextCGJCompose
   431  			break
   432  		}
   433  		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
   434  	}
   435  	i.rb.compose()
   436  	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
   437  	return seg
   438  }
   439  
// nextCGJCompose is installed as the next state after a stream-safe overflow
// in a composing form. It inserts a CGJ into the reorder buffer, followed by
// the rune at the current position, restores nextComposed as the state for
// later calls and lets doNormComposed finish the segment.
func nextCGJCompose(i *Iter) []byte {
	i.rb.ss = 0 // instead of first
	i.rb.insertCGJ()
	i.next = nextComposed
	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
	// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
	// If we ever change that, insert a check here.
	// NOTE(review): "UFF9A" above may be intended as U+FF9F — confirm.
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}