github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/unicode/norm/normalize.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run maketables.go triegen.go
     6  //go:generate go run maketables.go triegen.go -test
     7  
     8  // Package norm contains types and functions for normalizing Unicode strings.
     9  package norm // import "golang.org/x/text/unicode/norm"
    10  
    11  import "unicode/utf8"
    12  
    13  // A Form denotes a canonical representation of Unicode code points.
    14  // The Unicode-defined normalization and equivalence forms are:
    15  //
    16  //   NFC   Unicode Normalization Form C
    17  //   NFD   Unicode Normalization Form D
    18  //   NFKC  Unicode Normalization Form KC
    19  //   NFKD  Unicode Normalization Form KD
    20  //
    21  // For a Form f, this documentation uses the notation f(x) to mean
    22  // the bytes or string x converted to the given form.
    23  // A position n in x is called a boundary if conversion to the form can
    24  // proceed independently on both sides:
    25  //   f(x) == append(f(x[0:n]), f(x[n:])...)
    26  //
    27  // References: http://unicode.org/reports/tr15/ and
    28  // http://unicode.org/notes/tn5/.
        //
        // The zero value of Form is NFC. Form values are used as indices into
        // formTable by the methods in this file.
    29  type Form int
    30  
        // The normalization forms, numbered consecutively starting at zero.
    31  const (
    32  	NFC Form = iota // Unicode Normalization Form C
    33  	NFD // Unicode Normalization Form D
    34  	NFKC // Unicode Normalization Form KC
    35  	NFKD // Unicode Normalization Form KD
    36  )
    37  
    38  // Bytes returns f(b). May return b if f(b) = b.
    39  func (f Form) Bytes(b []byte) []byte {
    40  	src := inputBytes(b)
    41  	ft := formTable[f]
    42  	n, ok := ft.quickSpan(src, 0, len(b), true)
    43  	if ok {
    44  		return b
    45  	}
    46  	out := make([]byte, n, len(b))
    47  	copy(out, b[0:n])
    48  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
    49  	return doAppendInner(&rb, n)
    50  }
    51  
    52  // String returns f(s).
    53  func (f Form) String(s string) string {
    54  	src := inputString(s)
    55  	ft := formTable[f]
    56  	n, ok := ft.quickSpan(src, 0, len(s), true)
    57  	if ok {
    58  		return s
    59  	}
    60  	out := make([]byte, n, len(s))
    61  	copy(out, s[0:n])
    62  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
    63  	return string(doAppendInner(&rb, n))
    64  }
    65  
    66  // IsNormal returns true if b == f(b).
    67  func (f Form) IsNormal(b []byte) bool {
    68  	src := inputBytes(b)
    69  	ft := formTable[f]
    70  	bp, ok := ft.quickSpan(src, 0, len(b), true)
    71  	if ok {
    72  		return true
    73  	}
    74  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
    75  	rb.setFlusher(nil, cmpNormalBytes)
    76  	for bp < len(b) {
    77  		rb.out = b[bp:]
    78  		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
    79  			return false
    80  		}
    81  		bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
    82  	}
    83  	return true
    84  }
    85  
    86  func cmpNormalBytes(rb *reorderBuffer) bool {
    87  	b := rb.out
    88  	for i := 0; i < rb.nrune; i++ {
    89  		info := rb.rune[i]
    90  		if int(info.size) > len(b) {
    91  			return false
    92  		}
    93  		p := info.pos
    94  		pe := p + info.size
    95  		for ; p < pe; p++ {
    96  			if b[0] != rb.byte[p] {
    97  				return false
    98  			}
    99  			b = b[1:]
   100  		}
   101  	}
   102  	return true
   103  }
   104  
   105  // IsNormalString returns true if s == f(s).
   106  func (f Form) IsNormalString(s string) bool {
   107  	src := inputString(s)
   108  	ft := formTable[f]
   109  	bp, ok := ft.quickSpan(src, 0, len(s), true)
   110  	if ok {
   111  		return true
   112  	}
   113  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
   114  	rb.setFlusher(nil, func(rb *reorderBuffer) bool {
   115  		for i := 0; i < rb.nrune; i++ {
   116  			info := rb.rune[i]
   117  			if bp+int(info.size) > len(s) {
   118  				return false
   119  			}
   120  			p := info.pos
   121  			pe := p + info.size
   122  			for ; p < pe; p++ {
   123  				if s[bp] != rb.byte[p] {
   124  					return false
   125  				}
   126  				bp++
   127  			}
   128  		}
   129  		return true
   130  	})
   131  	for bp < len(s) {
   132  		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
   133  			return false
   134  		}
   135  		bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
   136  	}
   137  	return true
   138  }
   139  
   140  // patchTail fixes a case where a rune may be incorrectly normalized
   141  // if it is followed by illegal continuation bytes. It patches rb.out in
   142  // place and reports whether the decomposition is still in progress.
        // doAppend uses the result as its doMerge flag.
   143  func patchTail(rb *reorderBuffer) bool {
   144  	info, p := lastRuneStart(&rb.f, rb.out)
        	// Nothing to patch when rb.out holds no rune start (p == -1) or the
        	// properties of its last rune have size 0.
   145  	if p == -1 || info.size == 0 {
   146  		return true
   147  	}
        	// end is the offset just past the last rune; extra counts the bytes in
        	// rb.out that follow it.
   148  	end := p + int(info.size)
   149  	extra := len(rb.out) - end
   150  	if extra > 0 {
   151  		// Potentially allocating memory. However, this only
   152  		// happens with ill-formed UTF-8.
        		// Set the trailing bytes aside, cut rb.out back to the end of the
        		// last rune, re-run the open segment through rb and flush it, then
        		// put the trailing bytes back unchanged.
   153  		x := make([]byte, 0)
   154  		x = append(x, rb.out[len(rb.out)-extra:]...)
   155  		rb.out = rb.out[:end]
   156  		decomposeToLastBoundary(rb)
   157  		rb.doFlush()
   158  		rb.out = append(rb.out, x...)
   159  		return false
   160  	}
        	// The last rune ends rb.out exactly. Detach it, load the segment that
        	// precedes it into rb, and then insert the rune again through the
        	// stream-safe state so that a flush or CGJ insertion happens first
        	// when rb.ss.next asks for one.
   161  	buf := rb.out[p:]
   162  	rb.out = rb.out[:p]
   163  	decomposeToLastBoundary(rb)
   164  	if s := rb.ss.next(info); s == ssStarter {
   165  		rb.doFlush()
   166  		rb.ss.first(info)
   167  	} else if s == ssOverflow {
   168  		rb.doFlush()
   169  		rb.insertCGJ()
   170  		rb.ss = 0
   171  	}
   172  	rb.insertUnsafe(inputBytes(buf), 0, info)
   173  	return true
   174  }
   175  
   176  func appendQuick(rb *reorderBuffer, i int) int {
   177  	if rb.nsrc == i {
   178  		return i
   179  	}
   180  	end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
   181  	rb.out = rb.src.appendSlice(rb.out, i, end)
   182  	return end
   183  }
   184  
   185  // Append returns f(append(out, b...)).
   186  // The buffer out must be nil, empty, or equal to f(out).
   187  func (f Form) Append(out []byte, src ...byte) []byte {
   188  	return f.doAppend(out, inputBytes(src), len(src))
   189  }
   190  
   191  func (f Form) doAppend(out []byte, src input, n int) []byte {
   192  	if n == 0 {
   193  		return out
   194  	}
   195  	ft := formTable[f]
   196  	// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
   197  	if len(out) == 0 {
   198  		p, _ := ft.quickSpan(src, 0, n, true)
   199  		out = src.appendSlice(out, 0, p)
   200  		if p == n {
   201  			return out
   202  		}
   203  		rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
   204  		return doAppendInner(&rb, p)
   205  	}
   206  	rb := reorderBuffer{f: *ft, src: src, nsrc: n}
   207  	return doAppend(&rb, out, 0)
   208  }
   209  
        // doAppend normalizes rb.src[p:rb.nsrc] onto out and returns the resulting
        // buffer. It installs out and appendFlush on rb through setFlusher. When
        // merging is required (out is non-empty, or patchTail says so after leading
        // continuation bytes were moved), the first rune of the input is checked
        // and, if it does not start on a clean boundary, decomposed together with
        // what is already in rb before the regular append loop takes over.
   210  func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
   211  	rb.setFlusher(out, appendFlush)
   212  	src, n := rb.src, rb.nsrc
   213  	doMerge := len(out) > 0
   214  	if q := src.skipContinuationBytes(p); q > p {
   215  		// Move leading non-starters to destination.
   216  		rb.out = src.appendSlice(rb.out, p, q)
   217  		p = q
        		// patchTail repairs the end of rb.out and decides whether merging
        		// is still needed.
   218  		doMerge = patchTail(rb)
   219  	}
   220  	fd := &rb.f
   221  	if doMerge {
        		// info stays the zero Properties when p >= n, so the size check
        		// below also covers exhausted input.
   222  		var info Properties
   223  		if p < n {
   224  			info = fd.info(src, p)
   225  			if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
        				// Only at p == 0 is the open segment at the end of rb.out
        				// loaded into rb here.
        				// NOTE(review): presumably patchTail has already done this
        				// when p > 0 - confirm.
   226  				if p == 0 {
   227  					decomposeToLastBoundary(rb)
   228  				}
   229  				p = decomposeSegment(rb, p, true)
   230  			}
   231  		}
   232  		if info.size == 0 {
   233  			rb.doFlush()
   234  			// Append incomplete UTF-8 encoding.
   235  			return src.appendSlice(rb.out, p, n)
   236  		}
        		// Runes are still buffered: go straight to the segment loop and
        		// skip the quick-span copy below.
   237  		if rb.nrune > 0 {
   238  			return doAppendInner(rb, p)
   239  		}
   240  	}
   241  	p = appendQuick(rb, p)
   242  	return doAppendInner(rb, p)
   243  }
   244  
   245  func doAppendInner(rb *reorderBuffer, p int) []byte {
   246  	for n := rb.nsrc; p < n; {
   247  		p = decomposeSegment(rb, p, true)
   248  		p = appendQuick(rb, p)
   249  	}
   250  	return rb.out
   251  }
   252  
   253  // AppendString returns f(append(out, []byte(s))).
   254  // The buffer out must be nil, empty, or equal to f(out).
   255  func (f Form) AppendString(out []byte, src string) []byte {
   256  	return f.doAppend(out, inputString(src), len(src))
   257  }
   258  
   259  // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
   260  // It is not guaranteed to return the largest such n.
   261  func (f Form) QuickSpan(b []byte) int {
   262  	n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
   263  	return n
   264  }
   265  
   266  // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
   267  // whether no non-normalized parts were found. If atEOF is false, n will
   268  // not point past the last segment if this segment might become
   269  // non-normalized by appending other runes.
        //
        // The scan covers src[i:end]. ok is true when the scan reached end or
        // stopped at a rune whose properties have size 0; it is false when a
        // stream-safe overflow, a decreasing combining class, or a rune that
        // fails the isYesC/isYesD check was found.
   270  func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
   271  	var lastCC uint8
   272  	ss := streamSafe(0)
   273  	lastSegStart := i
   274  	for n = end; i < n; {
   275  		if j := src.skipASCII(i, n); i != j {
        			// A run of ASCII was skipped. The last skipped byte becomes the
        			// start of the current segment and the combining state is reset.
   276  			i = j
   277  			lastSegStart = i - 1
   278  			lastCC = 0
   279  			ss = 0
   280  			continue
   281  		}
   282  		info := f.info(src, i)
   283  		if info.size == 0 {
   284  			if atEOF {
   285  				// include incomplete runes
   286  				return n, true
   287  			}
   288  			return lastSegStart, true
   289  		}
   290  		// This block needs to be before the next, because it is possible to
   291  		// have an overflow for runes that are starters (e.g. with U+FF9E).
   292  		switch ss.next(info) {
   293  		case ssStarter:
   294  			ss.first(info)
   295  			lastSegStart = i
   296  		case ssOverflow:
   297  			return lastSegStart, false
   298  		case ssSuccess:
        			// A combining class lower than that of the preceding rune
        			// means the runes are not in normalized order.
   299  			if lastCC > info.ccc {
   300  				return lastSegStart, false
   301  			}
   302  		}
        		// The breaks below leave the for loop (the switch above is already
        		// closed) with i < n, so the code after the loop reports
        		// (lastSegStart, false).
   303  		if f.composing {
   304  			if !info.isYesC() {
   305  				break
   306  			}
   307  		} else {
   308  			if !info.isYesD() {
   309  				break
   310  			}
   311  		}
   312  		lastCC = info.ccc
   313  		i += int(info.size)
   314  	}
   315  	if i == n {
        		// The whole range passed the quick check. Without atEOF only the
        		// part before the last segment start is reported.
   316  		if !atEOF {
   317  			n = lastSegStart
   318  		}
   319  		return n, true
   320  	}
   321  	return lastSegStart, false
   322  }
   323  
   324  // QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]).
   325  // It is not guaranteed to return the largest such n.
   326  func (f Form) QuickSpanString(s string) int {
   327  	n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
   328  	return n
   329  }
   330  
   331  // FirstBoundary returns the position i of the first boundary in b
   332  // or -1 if b contains no boundary.
   333  func (f Form) FirstBoundary(b []byte) int {
   334  	return f.firstBoundary(inputBytes(b), len(b))
   335  }
   336  
   337  func (f Form) firstBoundary(src input, nsrc int) int {
   338  	i := src.skipContinuationBytes(0)
   339  	if i >= nsrc {
   340  		return -1
   341  	}
   342  	fd := formTable[f]
   343  	ss := streamSafe(0)
   344  	// We should call ss.first here, but we can't as the first rune is
   345  	// skipped already. This means FirstBoundary can't really determine
   346  	// CGJ insertion points correctly. Luckily it doesn't have to.
   347  	// TODO: consider adding NextBoundary
   348  	for {
   349  		info := fd.info(src, i)
   350  		if info.size == 0 {
   351  			return -1
   352  		}
   353  		if s := ss.next(info); s != ssSuccess {
   354  			return i
   355  		}
   356  		i += int(info.size)
   357  		if i >= nsrc {
   358  			if !info.BoundaryAfter() && !ss.isMax() {
   359  				return -1
   360  			}
   361  			return nsrc
   362  		}
   363  	}
   364  }
   365  
   366  // FirstBoundaryInString returns the position i of the first boundary in s
   367  // or -1 if s contains no boundary.
   368  func (f Form) FirstBoundaryInString(s string) int {
   369  	return f.firstBoundary(inputString(s), len(s))
   370  }
   371  
   372  // LastBoundary returns the position i of the last boundary in b
   373  // or -1 if b contains no boundary.
   374  func (f Form) LastBoundary(b []byte) int {
   375  	return lastBoundary(formTable[f], b)
   376  }
   377  
        // lastBoundary returns the position of the last boundary in b for the
        // form described by fd, or -1 if no boundary is found. It walks backwards
        // from the end of b one rune at a time, using lastRuneStart to locate
        // each rune and a streamSafe value to track the runes seen so far.
   378  func lastBoundary(fd *formInfo, b []byte) int {
   379  	i := len(b)
   380  	info, p := lastRuneStart(fd, b)
        	// p == -1: b contains no rune start at all.
   381  	if p == -1 {
   382  		return -1
   383  	}
   384  	if info.size == 0 { // ends with incomplete rune
   385  		if p == 0 { // starts with incomplete rune
   386  			return -1
   387  		}
        		// Continue with the rune that precedes the incomplete one.
   388  		i = p
   389  		info, p = lastRuneStart(fd, b[:i])
   390  		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
   391  			return i
   392  		}
   393  	}
   394  	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
   395  		return i
   396  	}
   397  	if info.BoundaryAfter() {
   398  		return i
   399  	}
        	// Step backwards rune by rune until ss.backwards reports a starter or
        	// an overflow, or the runes stop being contiguous.
   400  	ss := streamSafe(0)
   401  	v := ss.backwards(info)
   402  	for i = p; i >= 0 && v != ssStarter; i = p {
   403  		info, p = lastRuneStart(fd, b[:i])
   404  		if v = ss.backwards(info); v == ssOverflow {
   405  			break
   406  		}
   407  		if p+int(info.size) != i {
   408  			if p == -1 { // no boundary found
   409  				return -1
   410  			}
   411  			return i // boundary after an illegal UTF-8 encoding
   412  		}
   413  	}
   414  	return i
   415  }
   416  
   417  // decomposeSegment scans the first segment in src into rb. It inserts 0x034f
   418  // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
   419  // and returns the position in src following the segment or iShortDst or iShortSrc.
        //
        // Scanning starts at rb.src[sp]. The result is 0 when the properties of the
        // rune at sp have size 0. Callers in this file treat a negative result as
        // failure. atEOF states that no input follows rb.nsrc; when it is false, an
        // input that ends inside an open segment yields iShortSrc.
   420  func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
   421  	// Force one character to be consumed.
   422  	info := rb.f.info(rb.src, sp)
   423  	if info.size == 0 {
   424  		return 0
   425  	}
   426  	if rb.nrune > 0 {
        		// rb already holds runes. If the rune at sp begins a new segment,
        		// or would overflow the stream-safe limit, flush what is buffered
        		// and return sp unchanged without consuming the rune.
   427  		if s := rb.ss.next(info); s == ssStarter {
   428  			goto end
   429  		} else if s == ssOverflow {
   430  			rb.insertCGJ()
   431  			goto end
   432  		}
   433  	} else {
   434  		rb.ss.first(info)
   435  	}
   436  	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
   437  		return int(err)
   438  	}
        	// Keep inserting runes until the next segment starts, the stream-safe
        	// limit is hit, or the input ends.
   439  	for {
   440  		sp += int(info.size)
   441  		if sp >= rb.nsrc {
   442  			if !atEOF && !info.BoundaryAfter() {
   443  				return int(iShortSrc)
   444  			}
   445  			break
   446  		}
   447  		info = rb.f.info(rb.src, sp)
   448  		if info.size == 0 {
   449  			if !atEOF {
   450  				return int(iShortSrc)
   451  			}
   452  			break
   453  		}
   454  		if s := rb.ss.next(info); s == ssStarter {
   455  			break
   456  		} else if s == ssOverflow {
   457  			rb.insertCGJ()
   458  			break
   459  		}
   460  		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
   461  			return int(err)
   462  		}
   463  	}
   464  end:
        	// Flush the collected segment; a failed flush is reported as iShortDst.
   465  	if !rb.doFlush() {
   466  		return int(iShortDst)
   467  	}
   468  	return sp
   469  }
   470  
   471  // lastRuneStart returns the runeInfo and position of the last
   472  // rune in buf or the zero runeInfo and -1 if no rune was found.
   473  func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
   474  	p := len(buf) - 1
   475  	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
   476  	}
   477  	if p < 0 {
   478  		return Properties{}, -1
   479  	}
   480  	return fd.info(inputBytes(buf), p), p
   481  }
   482  
   483  // decomposeToLastBoundary finds an open segment at the end of the buffer
   484  // and scans it into rb. It truncates rb.out so that it no longer holds that segment.
        //
        // Nothing is changed when rb.out does not end in a complete rune or when
        // its last rune has a boundary after it. Otherwise rb.ss is set to the
        // stream-safe state computed while walking backwards.
   485  func decomposeToLastBoundary(rb *reorderBuffer) {
   486  	fd := &rb.f
   487  	info, i := lastRuneStart(fd, rb.out)
   488  	if int(info.size) != len(rb.out)-i {
   489  		// illegal trailing continuation bytes
   490  		return
   491  	}
   492  	if info.BoundaryAfter() {
   493  		return
   494  	}
   495  	var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
   496  	padd := 0
   497  	ss := streamSafe(0)
   498  	p := len(rb.out)
        	// Walk backwards over rb.out, recording the properties of each rune of
        	// the open segment; p ends up at the offset where the segment begins.
   499  	for {
   500  		add[padd] = info
   501  		v := ss.backwards(info)
   502  		if v == ssOverflow {
   503  			// Note that if we have an overflow, the string we are appending to
   504  			// is not correctly normalized. In this case the behavior is undefined.
   505  			break
   506  		}
   507  		padd++
   508  		p -= int(info.size)
   509  		if v == ssStarter || p < 0 {
   510  			break
   511  		}
   512  		info, i = lastRuneStart(fd, rb.out[:p])
        		// Stop when the preceding rune does not end exactly at p.
   513  		if int(info.size) != p-i {
   514  			break
   515  		}
   516  	}
   517  	rb.ss = ss
   518  	// Copy bytes for insertion as we may need to overwrite rb.out.
   519  	var buf [maxBufferSize * utf8.UTFMax]byte
   520  	cp := buf[:copy(buf[:], rb.out[p:])]
   521  	rb.out = rb.out[:p]
        	// add was filled back to front, so replay it from its last entry to
        	// insert the runes in their original order.
   522  	for padd--; padd >= 0; padd-- {
   523  		info = add[padd]
   524  		rb.insertUnsafe(inputBytes(cp), 0, info)
   525  		cp = cp[info.size:]
   526  	}
   527  }