golang.org/x/text@v0.14.0/collate/build/contract.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package build 6 7 import ( 8 "fmt" 9 "io" 10 "reflect" 11 "sort" 12 "strings" 13 14 "golang.org/x/text/internal/colltab" 15 ) 16 17 // This file contains code for detecting contractions and generating 18 // the necessary tables. 19 // Any Unicode Collation Algorithm (UCA) table entry that has more than 20 // one rune on the left-hand side is called a contraction. 21 // See https://www.unicode.org/reports/tr10/#Contractions for more details. 22 // 23 // We define the following terms: 24 // initial: a rune that appears as the first rune in a contraction. 25 // suffix: a sequence of runes succeeding the initial rune 26 // in a given contraction. 27 // non-initial: a rune that appears in a suffix. 28 // 29 // A rune may be both an initial and a non-initial and may be so in 30 // many contractions. An initial may typically also appear by itself. 31 // In case of ambiguities, the UCA requires we match the longest 32 // contraction. 33 // 34 // Many contraction rules share the same set of possible suffixes. 35 // We store sets of suffixes in a trie that associates an index with 36 // each suffix in the set. This index can be used to look up a 37 // collation element associated with the (starter rune, suffix) pair. 38 // 39 // The trie is defined on a UTF-8 byte sequence. 40 // The overall trie is represented as an array of ctEntries. Each node of the trie 41 // is represented as a subsequence of ctEntries, where each entry corresponds to 42 // a possible match of a next character in the search string. An entry 43 // also includes the length and offset to the next sequence of entries 44 // to check in case of a match.

const (
	// final is the N value that marks a ctEntry as final.
	final = 0
	// noIndex is the I value of an entry that does not complete a suffix
	// by itself.
	noIndex = 0xFF
)

// ctEntry is one entry of a contraction trie. It associates a matching byte
// (or, for a final entry, a range of bytes) with a result offset and/or the
// next sequence of entries to check. An entry c is final if c.N == 0 (final);
// matching a final entry means that the longest suffix has been found.
// A single final entry can map a whole range of bytes to consecutive offsets,
// whereas a non-final entry always matches exactly one byte. A non-final
// entry may still correspond to a completed suffix, in which case its I is
// not noIndex.
//
// For a non-final entry, H is counted from the end of the block that holds
// the entry: a value of 0 means the next block starts right after it.
//
// Examples:
// The suffix strings "ab" and "ac" can be represented as:
//
//	[]ctEntry{
//		{'a', 0, 1, noIndex}, // 'a' by itself does not match, so I is 0xFF.
//		{'b', 'c', final, 1}, // "ab" -> 1, "ac" -> 2
//	}
//
// The suffix strings "ab", "abc", "abd", and "abcd" can be represented as:
//
//	[]ctEntry{
//		{'a', 0, 1, noIndex}, // 'a' must be followed by 'b'.
//		{'b', 0, 2, 1},       // "ab" -> 1, may be followed by 'c' or 'd'.
//		{'d', 'd', final, 3}, // "abd" -> 3
//		{'c', 0, 1, 2},       // "abc" -> 2, may be followed by 'd'.
//		{'d', 'd', final, 4}, // "abcd" -> 4
//	}
//
// See genStateTests in contract_test.go for more examples.
type ctEntry struct {
	// L is the byte value to match for a non-final entry and the lowest
	// byte of the matched range for a final entry.
	L uint8
	// H is the relative index to the next block for a non-final entry and
	// the highest byte of the matched range for a final entry.
	H uint8
	// N is the length of the next block for a non-final entry and final
	// for a final entry.
	N uint8
	// I is the result offset. It is noIndex if more bytes are needed to
	// complete a suffix.
	I uint8
}

// contractTrieSet holds a set of contraction tries. The tries are stored
// consecutively in a single slice of entries.
type contractTrieSet []struct {
	l, h, n, i uint8
}

// ctHandle is used to identify a trie in the trie set. It consists of the
// offset of the trie in the array and the size of the trie's first node.
type ctHandle struct {
	index int
	n     int
}

// appendTrie adds a new trie for the given suffixes to the trie set and returns
// a handle to it.
The handle will be invalid on error. 95 func appendTrie(ct *colltab.ContractTrieSet, suffixes []string) (ctHandle, error) { 96 es := make([]stridx, len(suffixes)) 97 for i, s := range suffixes { 98 es[i].str = s 99 } 100 sort.Sort(offsetSort(es)) 101 for i := range es { 102 es[i].index = i + 1 103 } 104 sort.Sort(genidxSort(es)) 105 i := len(*ct) 106 n, err := genStates(ct, es) 107 if err != nil { 108 *ct = (*ct)[:i] 109 return ctHandle{}, err 110 } 111 return ctHandle{i, n}, nil 112 } 113 114 // genStates generates ctEntries for a given suffix set and returns 115 // the number of entries for the first node. 116 func genStates(ct *colltab.ContractTrieSet, sis []stridx) (int, error) { 117 if len(sis) == 0 { 118 return 0, fmt.Errorf("genStates: list of suffices must be non-empty") 119 } 120 start := len(*ct) 121 // create entries for differing first bytes. 122 for _, si := range sis { 123 s := si.str 124 if len(s) == 0 { 125 continue 126 } 127 added := false 128 c := s[0] 129 if len(s) > 1 { 130 for j := len(*ct) - 1; j >= start; j-- { 131 if (*ct)[j].L == c { 132 added = true 133 break 134 } 135 } 136 if !added { 137 *ct = append(*ct, ctEntry{L: c, I: noIndex}) 138 } 139 } else { 140 for j := len(*ct) - 1; j >= start; j-- { 141 // Update the offset for longer suffixes with the same byte. 142 if (*ct)[j].L == c { 143 (*ct)[j].I = uint8(si.index) 144 added = true 145 } 146 // Extend range of final ctEntry, if possible. 147 if (*ct)[j].H+1 == c { 148 (*ct)[j].H = c 149 added = true 150 } 151 } 152 if !added { 153 *ct = append(*ct, ctEntry{L: c, H: c, N: final, I: uint8(si.index)}) 154 } 155 } 156 } 157 n := len(*ct) - start 158 // Append nodes for the remainder of the suffixes for each ctEntry. 
159 sp := 0 160 for i, end := start, len(*ct); i < end; i++ { 161 fe := (*ct)[i] 162 if fe.H == 0 { // uninitialized non-final 163 ln := len(*ct) - start - n 164 if ln > 0xFF { 165 return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln) 166 } 167 fe.H = uint8(ln) 168 // Find first non-final strings with same byte as current entry. 169 for ; sis[sp].str[0] != fe.L; sp++ { 170 } 171 se := sp + 1 172 for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.L; se++ { 173 } 174 sl := sis[sp:se] 175 sp = se 176 for i, si := range sl { 177 sl[i].str = si.str[1:] 178 } 179 nn, err := genStates(ct, sl) 180 if err != nil { 181 return 0, err 182 } 183 fe.N = uint8(nn) 184 (*ct)[i] = fe 185 } 186 } 187 sort.Sort(entrySort((*ct)[start : start+n])) 188 return n, nil 189 } 190 191 // There may be both a final and non-final entry for a byte if the byte 192 // is implied in a range of matches in the final entry. 193 // We need to ensure that the non-final entry comes first in that case. 194 type entrySort colltab.ContractTrieSet 195 196 func (fe entrySort) Len() int { return len(fe) } 197 func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] } 198 func (fe entrySort) Less(i, j int) bool { 199 return fe[i].L > fe[j].L 200 } 201 202 // stridx is used for sorting suffixes and their associated offsets. 203 type stridx struct { 204 str string 205 index int 206 } 207 208 // For computing the offsets, we first sort by size, and then by string. 209 // This ensures that strings that only differ in the last byte by 1 210 // are sorted consecutively in increasing order such that they can 211 // be packed as a range in a final ctEntry. 
212 type offsetSort []stridx 213 214 func (si offsetSort) Len() int { return len(si) } 215 func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } 216 func (si offsetSort) Less(i, j int) bool { 217 if len(si[i].str) != len(si[j].str) { 218 return len(si[i].str) > len(si[j].str) 219 } 220 return si[i].str < si[j].str 221 } 222 223 // For indexing, we want to ensure that strings are sorted in string order, where 224 // for strings with the same prefix, we put longer strings before shorter ones. 225 type genidxSort []stridx 226 227 func (si genidxSort) Len() int { return len(si) } 228 func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } 229 func (si genidxSort) Less(i, j int) bool { 230 if strings.HasPrefix(si[j].str, si[i].str) { 231 return false 232 } 233 if strings.HasPrefix(si[i].str, si[j].str) { 234 return true 235 } 236 return si[i].str < si[j].str 237 } 238 239 // lookup matches the longest suffix in str and returns the associated offset 240 // and the number of bytes consumed. 241 func lookup(ct *colltab.ContractTrieSet, h ctHandle, str []byte) (index, ns int) { 242 states := (*ct)[h.index:] 243 p := 0 244 n := h.n 245 for i := 0; i < n && p < len(str); { 246 e := states[i] 247 c := str[p] 248 if c >= e.L { 249 if e.L == c { 250 p++ 251 if e.I != noIndex { 252 index, ns = int(e.I), p 253 } 254 if e.N != final { 255 // set to new state 256 i, states, n = 0, states[int(e.H)+n:], int(e.N) 257 } else { 258 return 259 } 260 continue 261 } else if e.N == final && c <= e.H { 262 p++ 263 return int(c-e.L) + int(e.I), p 264 } 265 } 266 i++ 267 } 268 return 269 } 270 271 // print writes the contractTrieSet t as compilable Go code to w. It returns 272 // the total number of bytes written and the size of the resulting data structure in bytes. 
273 func print(t *colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { 274 update3 := func(nn, sz int, e error) { 275 n += nn 276 if err == nil { 277 err = e 278 } 279 size += sz 280 } 281 update2 := func(nn int, e error) { update3(nn, 0, e) } 282 283 update3(printArray(*t, w, name)) 284 update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name)) 285 update3(printStruct(*t, w, name)) 286 update2(fmt.Fprintln(w)) 287 return 288 } 289 290 func printArray(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { 291 p := func(f string, a ...interface{}) { 292 nn, e := fmt.Fprintf(w, f, a...) 293 n += nn 294 if err == nil { 295 err = e 296 } 297 } 298 size = len(ct) * 4 299 p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size) 300 p("var %sCTEntries = [%d]struct{L,H,N,I uint8}{\n", name, len(ct)) 301 for _, fe := range ct { 302 p("\t{0x%X, 0x%X, %d, %d},\n", fe.L, fe.H, fe.N, fe.I) 303 } 304 p("}\n") 305 return 306 } 307 308 func printStruct(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { 309 n, err = fmt.Fprintf(w, "colltab.ContractTrieSet( %sCTEntries[:] )", name) 310 size = int(reflect.TypeOf(ct).Size()) 311 return 312 }