golang.org/x/text@v0.14.0/collate/build/trie.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // The trie in this file is used to associate the first full character 6 // in a UTF-8 string to a collation element. 7 // All but the last byte in a UTF-8 byte sequence are 8 // used to look up offsets in the index table to be used for the next byte. 9 // The last byte is used to index into a table of collation elements. 10 // This file contains the code for the generation of the trie. 11 12 package build 13 14 import ( 15 "fmt" 16 "hash/fnv" 17 "io" 18 "reflect" 19 ) 20 21 const ( 22 blockSize = 64 23 blockOffset = 2 // Subtract 2 blocks to compensate for the 0x80 added to continuation bytes. 24 ) 25 26 type trieHandle struct { 27 lookupStart uint16 // offset in table for first byte 28 valueStart uint16 // offset in table for first byte 29 } 30 31 type trie struct { 32 index []uint16 33 values []uint32 34 } 35 36 // trieNode is the intermediate trie structure used for generating a trie. 37 type trieNode struct { 38 index []*trieNode 39 value []uint32 40 b byte 41 refValue uint16 42 refIndex uint16 43 } 44 45 func newNode() *trieNode { 46 return &trieNode{ 47 index: make([]*trieNode, 64), 48 value: make([]uint32, 128), // root node size is 128 instead of 64 49 } 50 } 51 52 func (n *trieNode) isInternal() bool { 53 return n.value != nil 54 } 55 56 func (n *trieNode) insert(r rune, value uint32) { 57 const maskx = 0x3F // mask out two most-significant bits 58 str := string(r) 59 if len(str) == 1 { 60 n.value[str[0]] = value 61 return 62 } 63 for i := 0; i < len(str)-1; i++ { 64 b := str[i] & maskx 65 if n.index == nil { 66 n.index = make([]*trieNode, blockSize) 67 } 68 nn := n.index[b] 69 if nn == nil { 70 nn = &trieNode{} 71 nn.b = b 72 n.index[b] = nn 73 } 74 n = nn 75 } 76 if n.value == nil { 77 n.value = make([]uint32, blockSize) 78 } 79 b := str[len(str)-1] & maskx 80 n.value[b] = value 81 } 82 83 type trieBuilder struct { 84 t *trie 85 86 roots []*trieHandle 87 88 lookupBlocks []*trieNode 89 valueBlocks []*trieNode 90 91 lookupBlockIdx map[uint32]*trieNode 92 valueBlockIdx map[uint32]*trieNode 93 } 94 95 func newTrieBuilder() *trieBuilder { 96 index := &trieBuilder{} 97 index.lookupBlocks = make([]*trieNode, 0) 98 index.valueBlocks = make([]*trieNode, 0) 99 index.lookupBlockIdx = make(map[uint32]*trieNode) 100 index.valueBlockIdx = make(map[uint32]*trieNode) 101 // The third nil is the default null block. The other two blocks 102 // are used to guarantee an offset of at least 3 for each block. 103 index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil) 104 index.t = &trie{} 105 return index 106 } 107 108 func (b *trieBuilder) computeOffsets(n *trieNode) *trieNode { 109 hasher := fnv.New32() 110 if n.index != nil { 111 for i, nn := range n.index { 112 var vi, vv uint16 113 if nn != nil { 114 nn = b.computeOffsets(nn) 115 n.index[i] = nn 116 vi = nn.refIndex 117 vv = nn.refValue 118 } 119 hasher.Write([]byte{byte(vi >> 8), byte(vi)}) 120 hasher.Write([]byte{byte(vv >> 8), byte(vv)}) 121 } 122 h := hasher.Sum32() 123 nn, ok := b.lookupBlockIdx[h] 124 if !ok { 125 n.refIndex = uint16(len(b.lookupBlocks)) - blockOffset 126 b.lookupBlocks = append(b.lookupBlocks, n) 127 b.lookupBlockIdx[h] = n 128 } else { 129 n = nn 130 } 131 } else { 132 for _, v := range n.value { 133 hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}) 134 } 135 h := hasher.Sum32() 136 nn, ok := b.valueBlockIdx[h] 137 if !ok { 138 n.refValue = uint16(len(b.valueBlocks)) - blockOffset 139 n.refIndex = n.refValue 140 b.valueBlocks = append(b.valueBlocks, n) 141 b.valueBlockIdx[h] = n 142 } else { 143 n = nn 144 } 145 } 146 return n 147 } 148 149 func (b *trieBuilder) addStartValueBlock(n *trieNode) uint16 { 150 hasher := fnv.New32() 151 for _, v := range n.value[:2*blockSize] { 152 hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}) 153 } 154 h := hasher.Sum32() 155 nn, ok := b.valueBlockIdx[h] 156 if !ok { 157 n.refValue = uint16(len(b.valueBlocks)) 158 n.refIndex = n.refValue 159 b.valueBlocks = append(b.valueBlocks, n) 160 // Add a dummy block to accommodate the double block size. 161 b.valueBlocks = append(b.valueBlocks, nil) 162 b.valueBlockIdx[h] = n 163 } else { 164 n = nn 165 } 166 return n.refValue 167 } 168 169 func genValueBlock(t *trie, n *trieNode) { 170 if n != nil { 171 for _, v := range n.value { 172 t.values = append(t.values, v) 173 } 174 } 175 } 176 177 func genLookupBlock(t *trie, n *trieNode) { 178 for _, nn := range n.index { 179 v := uint16(0) 180 if nn != nil { 181 if n.index != nil { 182 v = nn.refIndex 183 } else { 184 v = nn.refValue 185 } 186 } 187 t.index = append(t.index, v) 188 } 189 } 190 191 func (b *trieBuilder) addTrie(n *trieNode) *trieHandle { 192 h := &trieHandle{} 193 b.roots = append(b.roots, h) 194 h.valueStart = b.addStartValueBlock(n) 195 if len(b.roots) == 1 { 196 // We insert a null block after the first start value block. 197 // This ensures that continuation bytes UTF-8 sequences of length 198 // greater than 2 will automatically hit a null block if there 199 // was an undefined entry. 200 b.valueBlocks = append(b.valueBlocks, nil) 201 } 202 n = b.computeOffsets(n) 203 // Offset by one extra block as the first byte starts at 0xC0 instead of 0x80. 204 h.lookupStart = n.refIndex - 1 205 return h 206 } 207 208 // generate generates and returns the trie for n. 209 func (b *trieBuilder) generate() (t *trie, err error) { 210 t = b.t 211 if len(b.valueBlocks) >= 1<<16 { 212 return nil, fmt.Errorf("maximum number of value blocks exceeded (%d > %d)", len(b.valueBlocks), 1<<16) 213 } 214 if len(b.lookupBlocks) >= 1<<16 { 215 return nil, fmt.Errorf("maximum number of lookup blocks exceeded (%d > %d)", len(b.lookupBlocks), 1<<16) 216 } 217 genValueBlock(t, b.valueBlocks[0]) 218 genValueBlock(t, &trieNode{value: make([]uint32, 64)}) 219 for i := 2; i < len(b.valueBlocks); i++ { 220 genValueBlock(t, b.valueBlocks[i]) 221 } 222 n := &trieNode{index: make([]*trieNode, 64)} 223 genLookupBlock(t, n) 224 genLookupBlock(t, n) 225 genLookupBlock(t, n) 226 for i := 3; i < len(b.lookupBlocks); i++ { 227 genLookupBlock(t, b.lookupBlocks[i]) 228 } 229 return b.t, nil 230 } 231 232 func (t *trie) printArrays(w io.Writer, name string) (n, size int, err error) { 233 p := func(f string, a ...interface{}) { 234 nn, e := fmt.Fprintf(w, f, a...) 235 n += nn 236 if err == nil { 237 err = e 238 } 239 } 240 nv := len(t.values) 241 p("// %sValues: %d entries, %d bytes\n", name, nv, nv*4) 242 p("// Block 2 is the null block.\n") 243 p("var %sValues = [%d]uint32 {", name, nv) 244 var printnewline bool 245 for i, v := range t.values { 246 if i%blockSize == 0 { 247 p("\n\t// Block %#x, offset %#x", i/blockSize, i) 248 } 249 if i%4 == 0 { 250 printnewline = true 251 } 252 if v != 0 { 253 if printnewline { 254 p("\n\t") 255 printnewline = false 256 } 257 p("%#04x:%#08x, ", i, v) 258 } 259 } 260 p("\n}\n\n") 261 ni := len(t.index) 262 p("// %sLookup: %d entries, %d bytes\n", name, ni, ni*2) 263 p("// Block 0 is the null block.\n") 264 p("var %sLookup = [%d]uint16 {", name, ni) 265 printnewline = false 266 for i, v := range t.index { 267 if i%blockSize == 0 { 268 p("\n\t// Block %#x, offset %#x", i/blockSize, i) 269 } 270 if i%8 == 0 { 271 printnewline = true 272 } 273 if v != 0 { 274 if printnewline { 275 p("\n\t") 276 printnewline = false 277 } 278 p("%#03x:%#02x, ", i, v) 279 } 280 } 281 p("\n}\n\n") 282 return n, nv*4 + ni*2, err 283 } 284 285 func (t *trie) printStruct(w io.Writer, handle *trieHandle, name string) (n, sz int, err error) { 286 const msg = "trie{ %sLookup[%d:], %sValues[%d:], %sLookup[:], %sValues[:]}" 287 n, err = fmt.Fprintf(w, msg, name, handle.lookupStart*blockSize, name, handle.valueStart*blockSize, name, name) 288 sz += int(reflect.TypeOf(trie{}).Size()) 289 return 290 }