github.com/liquid-dev/text@v0.3.3-liquid/collate/build/contract.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package build 6 7 import ( 8 "fmt" 9 "io" 10 "reflect" 11 "sort" 12 "strings" 13 14 "github.com/liquid-dev/text/internal/colltab" 15 ) 16 17 // This file contains code for detecting contractions and generating 18 // the necessary tables. 19 // Any Unicode Collation Algorithm (UCA) table entry that has more than 20 // one rune one the left-hand side is called a contraction. 21 // See https://www.unicode.org/reports/tr10/#Contractions for more details. 22 // 23 // We define the following terms: 24 // initial: a rune that appears as the first rune in a contraction. 25 // suffix: a sequence of runes succeeding the initial rune 26 // in a given contraction. 27 // non-initial: a rune that appears in a suffix. 28 // 29 // A rune may be both an initial and a non-initial and may be so in 30 // many contractions. An initial may typically also appear by itself. 31 // In case of ambiguities, the UCA requires we match the longest 32 // contraction. 33 // 34 // Many contraction rules share the same set of possible suffixes. 35 // We store sets of suffixes in a trie that associates an index with 36 // each suffix in the set. This index can be used to look up a 37 // collation element associated with the (starter rune, suffix) pair. 38 // 39 // The trie is defined on a UTF-8 byte sequence. 40 // The overall trie is represented as an array of ctEntries. Each node of the trie 41 // is represented as a subsequence of ctEntries, where each entry corresponds to 42 // a possible match of a next character in the search string. An entry 43 // also includes the length and offset to the next sequence of entries 44 // to check in case of a match. 45 46 const ( 47 final = 0 48 noIndex = 0xFF 49 ) 50 51 // ctEntry associates to a matching byte an offset and/or next sequence of 52 // bytes to check. A ctEntry c is called final if a match means that the 53 // longest suffix has been found. An entry c is final if c.N == 0. 54 // A single final entry can match a range of characters to an offset. 55 // A non-final entry always matches a single byte. Note that a non-final 56 // entry might still resemble a completed suffix. 57 // Examples: 58 // The suffix strings "ab" and "ac" can be represented as: 59 // []ctEntry{ 60 // {'a', 1, 1, noIndex}, // 'a' by itself does not match, so i is 0xFF. 61 // {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2 62 // } 63 // 64 // The suffix strings "ab", "abc", "abd", and "abcd" can be represented as: 65 // []ctEntry{ 66 // {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'. 67 // {'b', 1, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'. 68 // {'d', 'd', final, 3}, // "abd" -> 3 69 // {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'. 70 // {'d', 'd', final, 4}, // "abcd" -> 4 71 // } 72 // See genStateTests in contract_test.go for more examples. 73 type ctEntry struct { 74 L uint8 // non-final: byte value to match; final: lowest match in range. 75 H uint8 // non-final: relative index to next block; final: highest match in range. 76 N uint8 // non-final: length of next block; final: final 77 I uint8 // result offset. Will be noIndex if more bytes are needed to complete. 78 } 79 80 // contractTrieSet holds a set of contraction tries. The tries are stored 81 // consecutively in the entry field. 82 type contractTrieSet []struct{ l, h, n, i uint8 } 83 84 // ctHandle is used to identify a trie in the trie set, consisting in an offset 85 // in the array and the size of the first node. 86 type ctHandle struct { 87 index, n int 88 } 89 90 // appendTrie adds a new trie for the given suffixes to the trie set and returns 91 // a handle to it. The handle will be invalid on error. 92 func appendTrie(ct *colltab.ContractTrieSet, suffixes []string) (ctHandle, error) { 93 es := make([]stridx, len(suffixes)) 94 for i, s := range suffixes { 95 es[i].str = s 96 } 97 sort.Sort(offsetSort(es)) 98 for i := range es { 99 es[i].index = i + 1 100 } 101 sort.Sort(genidxSort(es)) 102 i := len(*ct) 103 n, err := genStates(ct, es) 104 if err != nil { 105 *ct = (*ct)[:i] 106 return ctHandle{}, err 107 } 108 return ctHandle{i, n}, nil 109 } 110 111 // genStates generates ctEntries for a given suffix set and returns 112 // the number of entries for the first node. 113 func genStates(ct *colltab.ContractTrieSet, sis []stridx) (int, error) { 114 if len(sis) == 0 { 115 return 0, fmt.Errorf("genStates: list of suffices must be non-empty") 116 } 117 start := len(*ct) 118 // create entries for differing first bytes. 119 for _, si := range sis { 120 s := si.str 121 if len(s) == 0 { 122 continue 123 } 124 added := false 125 c := s[0] 126 if len(s) > 1 { 127 for j := len(*ct) - 1; j >= start; j-- { 128 if (*ct)[j].L == c { 129 added = true 130 break 131 } 132 } 133 if !added { 134 *ct = append(*ct, ctEntry{L: c, I: noIndex}) 135 } 136 } else { 137 for j := len(*ct) - 1; j >= start; j-- { 138 // Update the offset for longer suffixes with the same byte. 139 if (*ct)[j].L == c { 140 (*ct)[j].I = uint8(si.index) 141 added = true 142 } 143 // Extend range of final ctEntry, if possible. 144 if (*ct)[j].H+1 == c { 145 (*ct)[j].H = c 146 added = true 147 } 148 } 149 if !added { 150 *ct = append(*ct, ctEntry{L: c, H: c, N: final, I: uint8(si.index)}) 151 } 152 } 153 } 154 n := len(*ct) - start 155 // Append nodes for the remainder of the suffixes for each ctEntry. 156 sp := 0 157 for i, end := start, len(*ct); i < end; i++ { 158 fe := (*ct)[i] 159 if fe.H == 0 { // uninitialized non-final 160 ln := len(*ct) - start - n 161 if ln > 0xFF { 162 return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln) 163 } 164 fe.H = uint8(ln) 165 // Find first non-final strings with same byte as current entry. 166 for ; sis[sp].str[0] != fe.L; sp++ { 167 } 168 se := sp + 1 169 for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.L; se++ { 170 } 171 sl := sis[sp:se] 172 sp = se 173 for i, si := range sl { 174 sl[i].str = si.str[1:] 175 } 176 nn, err := genStates(ct, sl) 177 if err != nil { 178 return 0, err 179 } 180 fe.N = uint8(nn) 181 (*ct)[i] = fe 182 } 183 } 184 sort.Sort(entrySort((*ct)[start : start+n])) 185 return n, nil 186 } 187 188 // There may be both a final and non-final entry for a byte if the byte 189 // is implied in a range of matches in the final entry. 190 // We need to ensure that the non-final entry comes first in that case. 191 type entrySort colltab.ContractTrieSet 192 193 func (fe entrySort) Len() int { return len(fe) } 194 func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] } 195 func (fe entrySort) Less(i, j int) bool { 196 return fe[i].L > fe[j].L 197 } 198 199 // stridx is used for sorting suffixes and their associated offsets. 200 type stridx struct { 201 str string 202 index int 203 } 204 205 // For computing the offsets, we first sort by size, and then by string. 206 // This ensures that strings that only differ in the last byte by 1 207 // are sorted consecutively in increasing order such that they can 208 // be packed as a range in a final ctEntry. 209 type offsetSort []stridx 210 211 func (si offsetSort) Len() int { return len(si) } 212 func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } 213 func (si offsetSort) Less(i, j int) bool { 214 if len(si[i].str) != len(si[j].str) { 215 return len(si[i].str) > len(si[j].str) 216 } 217 return si[i].str < si[j].str 218 } 219 220 // For indexing, we want to ensure that strings are sorted in string order, where 221 // for strings with the same prefix, we put longer strings before shorter ones. 222 type genidxSort []stridx 223 224 func (si genidxSort) Len() int { return len(si) } 225 func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } 226 func (si genidxSort) Less(i, j int) bool { 227 if strings.HasPrefix(si[j].str, si[i].str) { 228 return false 229 } 230 if strings.HasPrefix(si[i].str, si[j].str) { 231 return true 232 } 233 return si[i].str < si[j].str 234 } 235 236 // lookup matches the longest suffix in str and returns the associated offset 237 // and the number of bytes consumed. 238 func lookup(ct *colltab.ContractTrieSet, h ctHandle, str []byte) (index, ns int) { 239 states := (*ct)[h.index:] 240 p := 0 241 n := h.n 242 for i := 0; i < n && p < len(str); { 243 e := states[i] 244 c := str[p] 245 if c >= e.L { 246 if e.L == c { 247 p++ 248 if e.I != noIndex { 249 index, ns = int(e.I), p 250 } 251 if e.N != final { 252 // set to new state 253 i, states, n = 0, states[int(e.H)+n:], int(e.N) 254 } else { 255 return 256 } 257 continue 258 } else if e.N == final && c <= e.H { 259 p++ 260 return int(c-e.L) + int(e.I), p 261 } 262 } 263 i++ 264 } 265 return 266 } 267 268 // print writes the contractTrieSet t as compilable Go code to w. It returns 269 // the total number of bytes written and the size of the resulting data structure in bytes. 270 func print(t *colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { 271 update3 := func(nn, sz int, e error) { 272 n += nn 273 if err == nil { 274 err = e 275 } 276 size += sz 277 } 278 update2 := func(nn int, e error) { update3(nn, 0, e) } 279 280 update3(printArray(*t, w, name)) 281 update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name)) 282 update3(printStruct(*t, w, name)) 283 update2(fmt.Fprintln(w)) 284 return 285 } 286 287 func printArray(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { 288 p := func(f string, a ...interface{}) { 289 nn, e := fmt.Fprintf(w, f, a...) 290 n += nn 291 if err == nil { 292 err = e 293 } 294 } 295 size = len(ct) * 4 296 p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size) 297 p("var %sCTEntries = [%d]struct{L,H,N,I uint8}{\n", name, len(ct)) 298 for _, fe := range ct { 299 p("\t{0x%X, 0x%X, %d, %d},\n", fe.L, fe.H, fe.N, fe.I) 300 } 301 p("}\n") 302 return 303 } 304 305 func printStruct(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { 306 n, err = fmt.Fprintf(w, "colltab.ContractTrieSet( %sCTEntries[:] )", name) 307 size = int(reflect.TypeOf(ct).Size()) 308 return 309 }