// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package build

import (
	"fmt"
	"io"
	"reflect"
	"sort"
	"strings"
)

// This file contains code for detecting contractions and generating
// the necessary tables.
// Any Unicode Collation Algorithm (UCA) table entry that has more than
// one rune on the left-hand side is called a contraction.
// See http://www.unicode.org/reports/tr10/#Contractions for more details.
//
// We define the following terms:
//   initial:     a rune that appears as the first rune in a contraction.
//   suffix:      a sequence of runes succeeding the initial rune
//                in a given contraction.
//   non-initial: a rune that appears in a suffix.
//
// A rune may be both an initial and a non-initial and may be so in
// many contractions.  An initial may typically also appear by itself.
// In case of ambiguities, the UCA requires we match the longest
// contraction.
//
// Many contraction rules share the same set of possible suffixes.
// We store sets of suffixes in a trie that associates an index with
// each suffix in the set.  This index can be used to look up a
// collation element associated with the (starter rune, suffix) pair.
//
// The trie is defined on a UTF-8 byte sequence.
// The overall trie is represented as an array of ctEntries.  Each node of the trie
// is represented as a subsequence of ctEntries, where each entry corresponds to
// a possible match of a next character in the search string.  An entry
// also includes the length and offset to the next sequence of entries
// to check in case of a match.
43 44 const ( 45 final = 0 46 noIndex = 0xFF 47 ) 48 49 // ctEntry associates to a matching byte an offset and/or next sequence of 50 // bytes to check. A ctEntry c is called final if a match means that the 51 // longest suffix has been found. An entry c is final if c.n == 0. 52 // A single final entry can match a range of characters to an offset. 53 // A non-final entry always matches a single byte. Note that a non-final 54 // entry might still resemble a completed suffix. 55 // Examples: 56 // The suffix strings "ab" and "ac" can be represented as: 57 // []ctEntry{ 58 // {'a', 1, 1, noIndex}, // 'a' by itself does not match, so i is 0xFF. 59 // {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2 60 // } 61 // 62 // The suffix strings "ab", "abc", "abd", and "abcd" can be represented as: 63 // []ctEntry{ 64 // {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'. 65 // {'b', 1, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'. 66 // {'d', 'd', final, 3}, // "abd" -> 3 67 // {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'. 68 // {'d', 'd', final, 4}, // "abcd" -> 4 69 // } 70 // See genStateTests in contract_test.go for more examples. 71 type ctEntry struct { 72 l uint8 // non-final: byte value to match; final: lowest match in range. 73 h uint8 // non-final: relative index to next block; final: highest match in range. 74 n uint8 // non-final: length of next block; final: final 75 i uint8 // result offset. Will be noIndex if more bytes are needed to complete. 76 } 77 78 // contractTrieSet holds a set of contraction tries. The tries are stored 79 // consecutively in the entry field. 80 type contractTrieSet []struct{ l, h, n, i uint8 } 81 82 // ctHandle is used to identify a trie in the trie set, consisting in an offset 83 // in the array and the size of the first node. 84 type ctHandle struct { 85 index, n int 86 } 87 88 // appendTrie adds a new trie for the given suffixes to the trie set and returns 89 // a handle to it. The handle will be invalid on error. 
90 func (ct *contractTrieSet) appendTrie(suffixes []string) (ctHandle, error) { 91 es := make([]stridx, len(suffixes)) 92 for i, s := range suffixes { 93 es[i].str = s 94 } 95 sort.Sort(offsetSort(es)) 96 for i := range es { 97 es[i].index = i + 1 98 } 99 sort.Sort(genidxSort(es)) 100 i := len(*ct) 101 n, err := ct.genStates(es) 102 if err != nil { 103 *ct = (*ct)[:i] 104 return ctHandle{}, err 105 } 106 return ctHandle{i, n}, nil 107 } 108 109 // genStates generates ctEntries for a given suffix set and returns 110 // the number of entries for the first node. 111 func (ct *contractTrieSet) genStates(sis []stridx) (int, error) { 112 if len(sis) == 0 { 113 return 0, fmt.Errorf("genStates: list of suffices must be non-empty") 114 } 115 start := len(*ct) 116 // create entries for differing first bytes. 117 for _, si := range sis { 118 s := si.str 119 if len(s) == 0 { 120 continue 121 } 122 added := false 123 c := s[0] 124 if len(s) > 1 { 125 for j := len(*ct) - 1; j >= start; j-- { 126 if (*ct)[j].l == c { 127 added = true 128 break 129 } 130 } 131 if !added { 132 *ct = append(*ct, ctEntry{l: c, i: noIndex}) 133 } 134 } else { 135 for j := len(*ct) - 1; j >= start; j-- { 136 // Update the offset for longer suffixes with the same byte. 137 if (*ct)[j].l == c { 138 (*ct)[j].i = uint8(si.index) 139 added = true 140 } 141 // Extend range of final ctEntry, if possible. 142 if (*ct)[j].h+1 == c { 143 (*ct)[j].h = c 144 added = true 145 } 146 } 147 if !added { 148 *ct = append(*ct, ctEntry{l: c, h: c, n: final, i: uint8(si.index)}) 149 } 150 } 151 } 152 n := len(*ct) - start 153 // Append nodes for the remainder of the suffixes for each ctEntry. 
154 sp := 0 155 for i, end := start, len(*ct); i < end; i++ { 156 fe := (*ct)[i] 157 if fe.h == 0 { // uninitialized non-final 158 ln := len(*ct) - start - n 159 if ln > 0xFF { 160 return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln) 161 } 162 fe.h = uint8(ln) 163 // Find first non-final strings with same byte as current entry. 164 for ; sis[sp].str[0] != fe.l; sp++ { 165 } 166 se := sp + 1 167 for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.l; se++ { 168 } 169 sl := sis[sp:se] 170 sp = se 171 for i, si := range sl { 172 sl[i].str = si.str[1:] 173 } 174 nn, err := ct.genStates(sl) 175 if err != nil { 176 return 0, err 177 } 178 fe.n = uint8(nn) 179 (*ct)[i] = fe 180 } 181 } 182 sort.Sort(entrySort((*ct)[start : start+n])) 183 return n, nil 184 } 185 186 // There may be both a final and non-final entry for a byte if the byte 187 // is implied in a range of matches in the final entry. 188 // We need to ensure that the non-final entry comes first in that case. 189 type entrySort contractTrieSet 190 191 func (fe entrySort) Len() int { return len(fe) } 192 func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] } 193 func (fe entrySort) Less(i, j int) bool { 194 return fe[i].l > fe[j].l 195 } 196 197 // stridx is used for sorting suffixes and their associated offsets. 198 type stridx struct { 199 str string 200 index int 201 } 202 203 // For computing the offsets, we first sort by size, and then by string. 204 // This ensures that strings that only differ in the last byte by 1 205 // are sorted consecutively in increasing order such that they can 206 // be packed as a range in a final ctEntry. 
207 type offsetSort []stridx 208 209 func (si offsetSort) Len() int { return len(si) } 210 func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } 211 func (si offsetSort) Less(i, j int) bool { 212 if len(si[i].str) != len(si[j].str) { 213 return len(si[i].str) > len(si[j].str) 214 } 215 return si[i].str < si[j].str 216 } 217 218 // For indexing, we want to ensure that strings are sorted in string order, where 219 // for strings with the same prefix, we put longer strings before shorter ones. 220 type genidxSort []stridx 221 222 func (si genidxSort) Len() int { return len(si) } 223 func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } 224 func (si genidxSort) Less(i, j int) bool { 225 if strings.HasPrefix(si[j].str, si[i].str) { 226 return false 227 } 228 if strings.HasPrefix(si[i].str, si[j].str) { 229 return true 230 } 231 return si[i].str < si[j].str 232 } 233 234 // lookup matches the longest suffix in str and returns the associated offset 235 // and the number of bytes consumed. 236 func (ct *contractTrieSet) lookup(h ctHandle, str []byte) (index, ns int) { 237 states := (*ct)[h.index:] 238 p := 0 239 n := h.n 240 for i := 0; i < n && p < len(str); { 241 e := states[i] 242 c := str[p] 243 if c >= e.l { 244 if e.l == c { 245 p++ 246 if e.i != noIndex { 247 index, ns = int(e.i), p 248 } 249 if e.n != final { 250 // set to new state 251 i, states, n = 0, states[int(e.h)+n:], int(e.n) 252 } else { 253 return 254 } 255 continue 256 } else if e.n == final && c <= e.h { 257 p++ 258 return int(c-e.l) + int(e.i), p 259 } 260 } 261 i++ 262 } 263 return 264 } 265 266 // print writes the contractTrieSet t as compilable Go code to w. It returns 267 // the total number of bytes written and the size of the resulting data structure in bytes. 
268 func (t *contractTrieSet) print(w io.Writer, name string) (n, size int, err error) { 269 update3 := func(nn, sz int, e error) { 270 n += nn 271 if err == nil { 272 err = e 273 } 274 size += sz 275 } 276 update2 := func(nn int, e error) { update3(nn, 0, e) } 277 278 update3(t.printArray(w, name)) 279 update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name)) 280 update3(t.printStruct(w, name)) 281 update2(fmt.Fprintln(w)) 282 return 283 } 284 285 func (ct contractTrieSet) printArray(w io.Writer, name string) (n, size int, err error) { 286 p := func(f string, a ...interface{}) { 287 nn, e := fmt.Fprintf(w, f, a...) 288 n += nn 289 if err == nil { 290 err = e 291 } 292 } 293 size = len(ct) * 4 294 p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size) 295 p("var %sCTEntries = [%d]struct{l,h,n,i uint8}{\n", name, len(ct)) 296 for _, fe := range ct { 297 p("\t{0x%X, 0x%X, %d, %d},\n", fe.l, fe.h, fe.n, fe.i) 298 } 299 p("}\n") 300 return 301 } 302 303 func (ct contractTrieSet) printStruct(w io.Writer, name string) (n, size int, err error) { 304 n, err = fmt.Fprintf(w, "contractTrieSet( %sCTEntries[:] )", name) 305 size = int(reflect.TypeOf(ct).Size()) 306 return 307 }