github.com/go-enjin/golang-org-x-text@v0.12.1-enjin.2/internal/colltab/table.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package colltab
     6  
     7  import (
     8  	"unicode/utf8"
     9  
    10  	"github.com/go-enjin/golang-org-x-text/unicode/norm"
    11  )
    12  
    13  // Table holds all collation data for a given collation ordering.
    14  type Table struct {
    15  	Index Trie // main trie
    16  
    17  	// expansion info
    18  	ExpandElem []uint32
    19  
    20  	// contraction info
    21  	ContractTries  ContractTrieSet
    22  	ContractElem   []uint32
    23  	MaxContractLen int
    24  	VariableTop    uint32
    25  }
    26  
    27  func (t *Table) AppendNext(w []Elem, b []byte) (res []Elem, n int) {
    28  	return t.appendNext(w, source{bytes: b})
    29  }
    30  
    31  func (t *Table) AppendNextString(w []Elem, s string) (res []Elem, n int) {
    32  	return t.appendNext(w, source{str: s})
    33  }
    34  
    35  func (t *Table) Start(p int, b []byte) int {
    36  	// TODO: implement
    37  	panic("not implemented")
    38  }
    39  
    40  func (t *Table) StartString(p int, s string) int {
    41  	// TODO: implement
    42  	panic("not implemented")
    43  }
    44  
    45  func (t *Table) Domain() []string {
    46  	// TODO: implement
    47  	panic("not implemented")
    48  }
    49  
    50  func (t *Table) Top() uint32 {
    51  	return t.VariableTop
    52  }
    53  
    54  type source struct {
    55  	str   string
    56  	bytes []byte
    57  }
    58  
    59  func (src *source) lookup(t *Table) (ce Elem, sz int) {
    60  	if src.bytes == nil {
    61  		return t.Index.lookupString(src.str)
    62  	}
    63  	return t.Index.lookup(src.bytes)
    64  }
    65  
    66  func (src *source) tail(sz int) {
    67  	if src.bytes == nil {
    68  		src.str = src.str[sz:]
    69  	} else {
    70  		src.bytes = src.bytes[sz:]
    71  	}
    72  }
    73  
    74  func (src *source) nfd(buf []byte, end int) []byte {
    75  	if src.bytes == nil {
    76  		return norm.NFD.AppendString(buf[:0], src.str[:end])
    77  	}
    78  	return norm.NFD.Append(buf[:0], src.bytes[:end]...)
    79  }
    80  
    81  func (src *source) rune() (r rune, sz int) {
    82  	if src.bytes == nil {
    83  		return utf8.DecodeRuneInString(src.str)
    84  	}
    85  	return utf8.DecodeRune(src.bytes)
    86  }
    87  
    88  func (src *source) properties(f norm.Form) norm.Properties {
    89  	if src.bytes == nil {
    90  		return f.PropertiesString(src.str)
    91  	}
    92  	return f.Properties(src.bytes)
    93  }
    94  
    95  // appendNext appends the weights corresponding to the next rune or
    96  // contraction in s.  If a contraction is matched to a discontinuous
    97  // sequence of runes, the weights for the interstitial runes are
    98  // appended as well.  It returns a new slice that includes the appended
    99  // weights and the number of bytes consumed from s.
   100  func (t *Table) appendNext(w []Elem, src source) (res []Elem, n int) {
   101  	ce, sz := src.lookup(t)
   102  	tp := ce.ctype()
   103  	if tp == ceNormal {
   104  		if ce == 0 {
   105  			r, _ := src.rune()
   106  			const (
   107  				hangulSize  = 3
   108  				firstHangul = 0xAC00
   109  				lastHangul  = 0xD7A3
   110  			)
   111  			if r >= firstHangul && r <= lastHangul {
   112  				// TODO: performance can be considerably improved here.
   113  				n = sz
   114  				var buf [16]byte // Used for decomposing Hangul.
   115  				for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] {
   116  					ce, sz = t.Index.lookup(b)
   117  					w = append(w, ce)
   118  				}
   119  				return w, n
   120  			}
   121  			ce = makeImplicitCE(implicitPrimary(r))
   122  		}
   123  		w = append(w, ce)
   124  	} else if tp == ceExpansionIndex {
   125  		w = t.appendExpansion(w, ce)
   126  	} else if tp == ceContractionIndex {
   127  		n := 0
   128  		src.tail(sz)
   129  		if src.bytes == nil {
   130  			w, n = t.matchContractionString(w, ce, src.str)
   131  		} else {
   132  			w, n = t.matchContraction(w, ce, src.bytes)
   133  		}
   134  		sz += n
   135  	} else if tp == ceDecompose {
   136  		// Decompose using NFKD and replace tertiary weights.
   137  		t1, t2 := splitDecompose(ce)
   138  		i := len(w)
   139  		nfkd := src.properties(norm.NFKD).Decomposition()
   140  		for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
   141  			w, p = t.appendNext(w, source{bytes: nfkd})
   142  		}
   143  		w[i] = w[i].updateTertiary(t1)
   144  		if i++; i < len(w) {
   145  			w[i] = w[i].updateTertiary(t2)
   146  			for i++; i < len(w); i++ {
   147  				w[i] = w[i].updateTertiary(maxTertiary)
   148  			}
   149  		}
   150  	}
   151  	return w, sz
   152  }
   153  
   154  func (t *Table) appendExpansion(w []Elem, ce Elem) []Elem {
   155  	i := splitExpandIndex(ce)
   156  	n := int(t.ExpandElem[i])
   157  	i++
   158  	for _, ce := range t.ExpandElem[i : i+n] {
   159  		w = append(w, Elem(ce))
   160  	}
   161  	return w
   162  }
   163  
   164  func (t *Table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) {
   165  	index, n, offset := splitContractIndex(ce)
   166  
   167  	scan := t.ContractTries.scanner(index, n, suffix)
   168  	buf := [norm.MaxSegmentSize]byte{}
   169  	bufp := 0
   170  	p := scan.scan(0)
   171  
   172  	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
   173  		// By now we should have filtered most cases.
   174  		p0 := p
   175  		bufn := 0
   176  		rune := norm.NFD.Properties(suffix[p:])
   177  		p += rune.Size()
   178  		if rune.LeadCCC() != 0 {
   179  			prevCC := rune.TrailCCC()
   180  			// A gap may only occur in the last normalization segment.
   181  			// This also ensures that len(scan.s) < norm.MaxSegmentSize.
   182  			if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
   183  				scan.s = suffix[:p+end]
   184  			}
   185  			for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
   186  				rune = norm.NFD.Properties(suffix[p:])
   187  				if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
   188  					break
   189  				}
   190  				prevCC = rune.TrailCCC()
   191  				if pp := scan.scan(p); pp != p {
   192  					// Copy the interstitial runes for later processing.
   193  					bufn += copy(buf[bufn:], suffix[p0:p])
   194  					if scan.pindex == pp {
   195  						bufp = bufn
   196  					}
   197  					p, p0 = pp, pp
   198  				} else {
   199  					p += rune.Size()
   200  				}
   201  			}
   202  		}
   203  	}
   204  	// Append weights for the matched contraction, which may be an expansion.
   205  	i, n := scan.result()
   206  	ce = Elem(t.ContractElem[i+offset])
   207  	if ce.ctype() == ceNormal {
   208  		w = append(w, ce)
   209  	} else {
   210  		w = t.appendExpansion(w, ce)
   211  	}
   212  	// Append weights for the runes in the segment not part of the contraction.
   213  	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
   214  		w, p = t.appendNext(w, source{bytes: b})
   215  	}
   216  	return w, n
   217  }
   218  
   219  // TODO: unify the two implementations. This is best done after first simplifying
   220  // the algorithm taking into account the inclusion of both NFC and NFD forms
   221  // in the table.
   222  func (t *Table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) {
   223  	index, n, offset := splitContractIndex(ce)
   224  
   225  	scan := t.ContractTries.scannerString(index, n, suffix)
   226  	buf := [norm.MaxSegmentSize]byte{}
   227  	bufp := 0
   228  	p := scan.scan(0)
   229  
   230  	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
   231  		// By now we should have filtered most cases.
   232  		p0 := p
   233  		bufn := 0
   234  		rune := norm.NFD.PropertiesString(suffix[p:])
   235  		p += rune.Size()
   236  		if rune.LeadCCC() != 0 {
   237  			prevCC := rune.TrailCCC()
   238  			// A gap may only occur in the last normalization segment.
   239  			// This also ensures that len(scan.s) < norm.MaxSegmentSize.
   240  			if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
   241  				scan.s = suffix[:p+end]
   242  			}
   243  			for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
   244  				rune = norm.NFD.PropertiesString(suffix[p:])
   245  				if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
   246  					break
   247  				}
   248  				prevCC = rune.TrailCCC()
   249  				if pp := scan.scan(p); pp != p {
   250  					// Copy the interstitial runes for later processing.
   251  					bufn += copy(buf[bufn:], suffix[p0:p])
   252  					if scan.pindex == pp {
   253  						bufp = bufn
   254  					}
   255  					p, p0 = pp, pp
   256  				} else {
   257  					p += rune.Size()
   258  				}
   259  			}
   260  		}
   261  	}
   262  	// Append weights for the matched contraction, which may be an expansion.
   263  	i, n := scan.result()
   264  	ce = Elem(t.ContractElem[i+offset])
   265  	if ce.ctype() == ceNormal {
   266  		w = append(w, ce)
   267  	} else {
   268  		w = t.appendExpansion(w, ce)
   269  	}
   270  	// Append weights for the runes in the segment not part of the contraction.
   271  	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
   272  		w, p = t.appendNext(w, source{bytes: b})
   273  	}
   274  	return w, n
   275  }