github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/collate/build/contract.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package build
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"reflect"
    11  	"sort"
    12  	"strings"
    13  )
    14  
    15  // This file contains code for detecting contractions and generating
    16  // the necessary tables.
    17  // Any Unicode Collation Algorithm (UCA) table entry that has more than
    18  // one rune on the left-hand side is called a contraction.
    19  // See http://www.unicode.org/reports/tr10/#Contractions for more details.
    20  //
    21  // We define the following terms:
    22  //   initial:     a rune that appears as the first rune in a contraction.
    23  //   suffix:      a sequence of runes succeeding the initial rune
    24  //                in a given contraction.
    25  //   non-initial: a rune that appears in a suffix.
    26  //
    27  // A rune may be both an initial and a non-initial and may be so in
    28  // many contractions.  An initial may typically also appear by itself.
    29  // In case of ambiguities, the UCA requires we match the longest
    30  // contraction.
    31  //
    32  // Many contraction rules share the same set of possible suffixes.
    33  // We store sets of suffixes in a trie that associates an index with
    34  // each suffix in the set.  This index can be used to look up a
    35  // collation element associated with the (starter rune, suffix) pair.
    36  //
    37  // The trie is defined on a UTF-8 byte sequence.
    38  // The overall trie is represented as an array of ctEntries.  Each node of the trie
    39  // is represented as a subsequence of ctEntries, where each entry corresponds to
    40  // a possible match of a next character in the search string.  An entry
    41  // also includes the length and offset to the next sequence of entries
    42  // to check in case of a match.
    43  
    44  const (
    45  	final   = 0
    46  	noIndex = 0xFF
    47  )
    48  
    49  // ctEntry associates to a matching byte an offset and/or next sequence of
    50  // bytes to check. A ctEntry c is called final if a match means that the
    51  // longest suffix has been found.  An entry c is final if c.n == 0.
    52  // A single final entry can match a range of characters to an offset.
    53  // A non-final entry always matches a single byte. Note that a non-final
    54  // entry might still resemble a completed suffix.
    55  // Examples:
    56  // The suffix strings "ab" and "ac" can be represented as:
    57  // []ctEntry{
    58  //     {'a', 1, 1, noIndex},  // 'a' by itself does not match, so i is 0xFF.
    59  //     {'b', 'c', 0, 1},   // "ab" -> 1, "ac" -> 2
    60  // }
    61  //
    62  // The suffix strings "ab", "abc", "abd", and "abcd" can be represented as:
    63  // []ctEntry{
    64  //     {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'.
    65  //     {'b', 1, 2, 1},    // "ab" -> 1, may be followed by 'c' or 'd'.
    66  //     {'d', 'd', final, 3},  // "abd" -> 3
    67  //     {'c', 4, 1, 2},    // "abc" -> 2, may be followed by 'd'.
    68  //     {'d', 'd', final, 4},  // "abcd" -> 4
    69  // }
    70  // See genStateTests in contract_test.go for more examples.
    71  type ctEntry struct {
    72  	l uint8 // non-final: byte value to match; final: lowest match in range.
    73  	h uint8 // non-final: relative index to next block; final: highest match in range.
    74  	n uint8 // non-final: length of next block; final: final
    75  	i uint8 // result offset. Will be noIndex if more bytes are needed to complete.
    76  }
    77  
    78  // contractTrieSet holds a set of contraction tries. The tries are stored
    79  // consecutively in the entry field.
    80  type contractTrieSet []struct{ l, h, n, i uint8 }
    81  
    82  // ctHandle is used to identify a trie in the trie set, consisting in an offset
    83  // in the array and the size of the first node.
    84  type ctHandle struct {
    85  	index, n int
    86  }
    87  
    88  // appendTrie adds a new trie for the given suffixes to the trie set and returns
    89  // a handle to it.  The handle will be invalid on error.
    90  func (ct *contractTrieSet) appendTrie(suffixes []string) (ctHandle, error) {
    91  	es := make([]stridx, len(suffixes))
    92  	for i, s := range suffixes {
    93  		es[i].str = s
    94  	}
    95  	sort.Sort(offsetSort(es))
    96  	for i := range es {
    97  		es[i].index = i + 1
    98  	}
    99  	sort.Sort(genidxSort(es))
   100  	i := len(*ct)
   101  	n, err := ct.genStates(es)
   102  	if err != nil {
   103  		*ct = (*ct)[:i]
   104  		return ctHandle{}, err
   105  	}
   106  	return ctHandle{i, n}, nil
   107  }
   108  
   109  // genStates generates ctEntries for a given suffix set and returns
   110  // the number of entries for the first node.
   111  func (ct *contractTrieSet) genStates(sis []stridx) (int, error) {
   112  	if len(sis) == 0 {
   113  		return 0, fmt.Errorf("genStates: list of suffices must be non-empty")
   114  	}
   115  	start := len(*ct)
   116  	// create entries for differing first bytes.
   117  	for _, si := range sis {
   118  		s := si.str
   119  		if len(s) == 0 {
   120  			continue
   121  		}
   122  		added := false
   123  		c := s[0]
   124  		if len(s) > 1 {
   125  			for j := len(*ct) - 1; j >= start; j-- {
   126  				if (*ct)[j].l == c {
   127  					added = true
   128  					break
   129  				}
   130  			}
   131  			if !added {
   132  				*ct = append(*ct, ctEntry{l: c, i: noIndex})
   133  			}
   134  		} else {
   135  			for j := len(*ct) - 1; j >= start; j-- {
   136  				// Update the offset for longer suffixes with the same byte.
   137  				if (*ct)[j].l == c {
   138  					(*ct)[j].i = uint8(si.index)
   139  					added = true
   140  				}
   141  				// Extend range of final ctEntry, if possible.
   142  				if (*ct)[j].h+1 == c {
   143  					(*ct)[j].h = c
   144  					added = true
   145  				}
   146  			}
   147  			if !added {
   148  				*ct = append(*ct, ctEntry{l: c, h: c, n: final, i: uint8(si.index)})
   149  			}
   150  		}
   151  	}
   152  	n := len(*ct) - start
   153  	// Append nodes for the remainder of the suffixes for each ctEntry.
   154  	sp := 0
   155  	for i, end := start, len(*ct); i < end; i++ {
   156  		fe := (*ct)[i]
   157  		if fe.h == 0 { // uninitialized non-final
   158  			ln := len(*ct) - start - n
   159  			if ln > 0xFF {
   160  				return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln)
   161  			}
   162  			fe.h = uint8(ln)
   163  			// Find first non-final strings with same byte as current entry.
   164  			for ; sis[sp].str[0] != fe.l; sp++ {
   165  			}
   166  			se := sp + 1
   167  			for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.l; se++ {
   168  			}
   169  			sl := sis[sp:se]
   170  			sp = se
   171  			for i, si := range sl {
   172  				sl[i].str = si.str[1:]
   173  			}
   174  			nn, err := ct.genStates(sl)
   175  			if err != nil {
   176  				return 0, err
   177  			}
   178  			fe.n = uint8(nn)
   179  			(*ct)[i] = fe
   180  		}
   181  	}
   182  	sort.Sort(entrySort((*ct)[start : start+n]))
   183  	return n, nil
   184  }
   185  
   186  // There may be both a final and non-final entry for a byte if the byte
   187  // is implied in a range of matches in the final entry.
   188  // We need to ensure that the non-final entry comes first in that case.
   189  type entrySort contractTrieSet
   190  
   191  func (fe entrySort) Len() int      { return len(fe) }
   192  func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] }
   193  func (fe entrySort) Less(i, j int) bool {
   194  	return fe[i].l > fe[j].l
   195  }
   196  
   197  // stridx is used for sorting suffixes and their associated offsets.
   198  type stridx struct {
   199  	str   string
   200  	index int
   201  }
   202  
   203  // For computing the offsets, we first sort by size, and then by string.
   204  // This ensures that strings that only differ in the last byte by 1
   205  // are sorted consecutively in increasing order such that they can
   206  // be packed as a range in a final ctEntry.
   207  type offsetSort []stridx
   208  
   209  func (si offsetSort) Len() int      { return len(si) }
   210  func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
   211  func (si offsetSort) Less(i, j int) bool {
   212  	if len(si[i].str) != len(si[j].str) {
   213  		return len(si[i].str) > len(si[j].str)
   214  	}
   215  	return si[i].str < si[j].str
   216  }
   217  
   218  // For indexing, we want to ensure that strings are sorted in string order, where
   219  // for strings with the same prefix, we put longer strings before shorter ones.
   220  type genidxSort []stridx
   221  
   222  func (si genidxSort) Len() int      { return len(si) }
   223  func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
   224  func (si genidxSort) Less(i, j int) bool {
   225  	if strings.HasPrefix(si[j].str, si[i].str) {
   226  		return false
   227  	}
   228  	if strings.HasPrefix(si[i].str, si[j].str) {
   229  		return true
   230  	}
   231  	return si[i].str < si[j].str
   232  }
   233  
   234  // lookup matches the longest suffix in str and returns the associated offset
   235  // and the number of bytes consumed.
   236  func (ct *contractTrieSet) lookup(h ctHandle, str []byte) (index, ns int) {
   237  	states := (*ct)[h.index:]
   238  	p := 0
   239  	n := h.n
   240  	for i := 0; i < n && p < len(str); {
   241  		e := states[i]
   242  		c := str[p]
   243  		if c >= e.l {
   244  			if e.l == c {
   245  				p++
   246  				if e.i != noIndex {
   247  					index, ns = int(e.i), p
   248  				}
   249  				if e.n != final {
   250  					// set to new state
   251  					i, states, n = 0, states[int(e.h)+n:], int(e.n)
   252  				} else {
   253  					return
   254  				}
   255  				continue
   256  			} else if e.n == final && c <= e.h {
   257  				p++
   258  				return int(c-e.l) + int(e.i), p
   259  			}
   260  		}
   261  		i++
   262  	}
   263  	return
   264  }
   265  
   266  // print writes the contractTrieSet t as compilable Go code to w. It returns
   267  // the total number of bytes written and the size of the resulting data structure in bytes.
   268  func (t *contractTrieSet) print(w io.Writer, name string) (n, size int, err error) {
   269  	update3 := func(nn, sz int, e error) {
   270  		n += nn
   271  		if err == nil {
   272  			err = e
   273  		}
   274  		size += sz
   275  	}
   276  	update2 := func(nn int, e error) { update3(nn, 0, e) }
   277  
   278  	update3(t.printArray(w, name))
   279  	update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name))
   280  	update3(t.printStruct(w, name))
   281  	update2(fmt.Fprintln(w))
   282  	return
   283  }
   284  
   285  func (ct contractTrieSet) printArray(w io.Writer, name string) (n, size int, err error) {
   286  	p := func(f string, a ...interface{}) {
   287  		nn, e := fmt.Fprintf(w, f, a...)
   288  		n += nn
   289  		if err == nil {
   290  			err = e
   291  		}
   292  	}
   293  	size = len(ct) * 4
   294  	p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size)
   295  	p("var %sCTEntries = [%d]struct{l,h,n,i uint8}{\n", name, len(ct))
   296  	for _, fe := range ct {
   297  		p("\t{0x%X, 0x%X, %d, %d},\n", fe.l, fe.h, fe.n, fe.i)
   298  	}
   299  	p("}\n")
   300  	return
   301  }
   302  
   303  func (ct contractTrieSet) printStruct(w io.Writer, name string) (n, size int, err error) {
   304  	n, err = fmt.Fprintf(w, "contractTrieSet( %sCTEntries[:] )", name)
   305  	size = int(reflect.TypeOf(ct).Size())
   306  	return
   307  }