github.com/go-xe2/third@v1.0.3/golang.org/x/text/internal/triegen/triegen.go (about) 1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package triegen implements a code generator for a trie for associating 6 // unsigned integer values with UTF-8 encoded runes. 7 // 8 // Many of the go.text packages use tries for storing per-rune information. A 9 // trie is especially useful if many of the runes have the same value. If this 10 // is the case, many blocks can be expected to be shared allowing for 11 // information on many runes to be stored in little space. 12 // 13 // As most of the lookups are done directly on []byte slices, the tries use the 14 // UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to 15 // runes and contributes a little bit to better performance. It also naturally 16 // provides a fast path for ASCII. 17 // 18 // Space is also an issue. There are many code points defined in Unicode and as 19 // a result tables can get quite large. So every byte counts. The triegen 20 // package automatically chooses the smallest integer values to represent the 21 // tables. Compacters allow further compression of the trie by allowing for 22 // alternative representations of individual trie blocks. 23 // 24 // triegen allows generating multiple tries as a single structure. This is 25 // useful when, for example, one wants to generate tries for several languages 26 // that have a lot of values in common. Some existing libraries for 27 // internationalization store all per-language data as a dynamically loadable 28 // chunk. The go.text packages are designed with the assumption that the user 29 // typically wants to compile in support for all supported languages, in line 30 // with the approach common to Go to create a single standalone binary. The 31 // multi-root trie approach can give significant storage savings in this 32 // scenario. 33 // 34 // triegen generates both tables and code. The code is optimized to use the 35 // automatically chosen data types. The following code is generated for a Trie 36 // or multiple Tries named "foo": 37 // - type fooTrie 38 // The trie type. 39 // 40 // - func newFooTrie(x int) *fooTrie 41 // Trie constructor, where x is the index of the trie passed to Gen. 42 // 43 // - func (t *fooTrie) lookup(s []byte) (v uintX, sz int) 44 // The lookup method, where uintX is automatically chosen. 45 // 46 // - func lookupString, lookupUnsafe and lookupStringUnsafe 47 // Variants of the above. 48 // 49 // - var fooValues and fooIndex and any tables generated by Compacters. 50 // The core trie data. 51 // 52 // - var fooTrieHandles 53 // Indexes of starter blocks in case of multiple trie roots. 54 // 55 // It is recommended that users test the generated trie by checking the returned 56 // value for every rune. Such exhaustive tests are possible as the the number of 57 // runes in Unicode is limited. 58 package triegen // import "github.com/go-xe2/third/golang.org/x/text/internal/triegen" 59 60 // TODO: Arguably, the internally optimized data types would not have to be 61 // exposed in the generated API. We could also investigate not generating the 62 // code, but using it through a package. We would have to investigate the impact 63 // on performance of making such change, though. For packages like unicode/norm, 64 // small changes like this could tank performance. 65 66 import ( 67 "encoding/binary" 68 "fmt" 69 "hash/crc64" 70 "io" 71 "log" 72 "unicode/utf8" 73 ) 74 75 // builder builds a set of tries for associating values with runes. The set of 76 // tries can share common index and value blocks. 77 type builder struct { 78 Name string 79 80 // ValueType is the type of the trie values looked up. 81 ValueType string 82 83 // ValueSize is the byte size of the ValueType. 84 ValueSize int 85 86 // IndexType is the type of trie index values used for all UTF-8 bytes of 87 // a rune except the last one. 88 IndexType string 89 90 // IndexSize is the byte size of the IndexType. 91 IndexSize int 92 93 // SourceType is used when generating the lookup functions. If the user 94 // requests StringSupport, all lookup functions will be generated for 95 // string input as well. 96 SourceType string 97 98 Trie []*Trie 99 100 IndexBlocks []*node 101 ValueBlocks [][]uint64 102 Compactions []compaction 103 Checksum uint64 104 105 ASCIIBlock string 106 StarterBlock string 107 108 indexBlockIdx map[uint64]int 109 valueBlockIdx map[uint64]nodeIndex 110 asciiBlockIdx map[uint64]int 111 112 // Stats are used to fill out the template. 113 Stats struct { 114 NValueEntries int 115 NValueBytes int 116 NIndexEntries int 117 NIndexBytes int 118 NHandleBytes int 119 } 120 121 err error 122 } 123 124 // A nodeIndex encodes the index of a node, which is defined by the compaction 125 // which stores it and an index within the compaction. For internal nodes, the 126 // compaction is always 0. 127 type nodeIndex struct { 128 compaction int 129 index int 130 } 131 132 // compaction keeps track of stats used for the compaction. 133 type compaction struct { 134 c Compacter 135 blocks []*node 136 maxHandle uint32 137 totalSize int 138 139 // Used by template-based generator and thus exported. 140 Cutoff uint32 141 Offset uint32 142 Handler string 143 } 144 145 func (b *builder) setError(err error) { 146 if b.err == nil { 147 b.err = err 148 } 149 } 150 151 // An Option can be passed to Gen. 152 type Option func(b *builder) error 153 154 // Compact configures the trie generator to use the given Compacter. 155 func Compact(c Compacter) Option { 156 return func(b *builder) error { 157 b.Compactions = append(b.Compactions, compaction{ 158 c: c, 159 Handler: c.Handler() + "(n, b)"}) 160 return nil 161 } 162 } 163 164 // Gen writes Go code for a shared trie lookup structure to w for the given 165 // Tries. The generated trie type will be called nameTrie. newNameTrie(x) will 166 // return the *nameTrie for tries[x]. A value can be looked up by using one of 167 // the various lookup methods defined on nameTrie. It returns the table size of 168 // the generated trie. 169 func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) { 170 // The index contains two dummy blocks, followed by the zero block. The zero 171 // block is at offset 0x80, so that the offset for the zero block for 172 // continuation bytes is 0. 173 b := &builder{ 174 Name: name, 175 Trie: tries, 176 IndexBlocks: []*node{{}, {}, {}}, 177 Compactions: []compaction{{ 178 Handler: name + "Values[n<<6+uint32(b)]", 179 }}, 180 // The 0 key in indexBlockIdx and valueBlockIdx is the hash of the zero 181 // block. 182 indexBlockIdx: map[uint64]int{0: 0}, 183 valueBlockIdx: map[uint64]nodeIndex{0: {}}, 184 asciiBlockIdx: map[uint64]int{}, 185 } 186 b.Compactions[0].c = (*simpleCompacter)(b) 187 188 for _, f := range opts { 189 if err := f(b); err != nil { 190 return 0, err 191 } 192 } 193 b.build() 194 if b.err != nil { 195 return 0, b.err 196 } 197 if err = b.print(w); err != nil { 198 return 0, err 199 } 200 return b.Size(), nil 201 } 202 203 // A Trie represents a single root node of a trie. A builder may build several 204 // overlapping tries at once. 205 type Trie struct { 206 root *node 207 208 hiddenTrie 209 } 210 211 // hiddenTrie contains values we want to be visible to the template generator, 212 // but hidden from the API documentation. 213 type hiddenTrie struct { 214 Name string 215 Checksum uint64 216 ASCIIIndex int 217 StarterIndex int 218 } 219 220 // NewTrie returns a new trie root. 221 func NewTrie(name string) *Trie { 222 return &Trie{ 223 &node{ 224 children: make([]*node, blockSize), 225 values: make([]uint64, utf8.RuneSelf), 226 }, 227 hiddenTrie{Name: name}, 228 } 229 } 230 231 // Gen is a convenience wrapper around the Gen func passing t as the only trie 232 // and uses the name passed to NewTrie. It returns the size of the generated 233 // tables. 234 func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) { 235 return Gen(w, t.Name, []*Trie{t}, opts...) 236 } 237 238 // node is a node of the intermediate trie structure. 239 type node struct { 240 // children holds this node's children. It is always of length 64. 241 // A child node may be nil. 242 children []*node 243 244 // values contains the values of this node. If it is non-nil, this node is 245 // either a root or leaf node: 246 // For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F]. 247 // For leaf nodes, len(values) == 64 and it maps the bytes in [0x80, 0xBF]. 248 values []uint64 249 250 index nodeIndex 251 } 252 253 // Insert associates value with the given rune. Insert will panic if a non-zero 254 // value is passed for an invalid rune. 255 func (t *Trie) Insert(r rune, value uint64) { 256 if value == 0 { 257 return 258 } 259 s := string(r) 260 if []rune(s)[0] != r && value != 0 { 261 // Note: The UCD tables will always assign what amounts to a zero value 262 // to a surrogate. Allowing a zero value for an illegal rune allows 263 // users to iterate over [0..MaxRune] without having to explicitly 264 // exclude surrogates, which would be tedious. 265 panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r)) 266 } 267 if len(s) == 1 { 268 // It is a root node value (ASCII). 269 t.root.values[s[0]] = value 270 return 271 } 272 273 n := t.root 274 for ; len(s) > 1; s = s[1:] { 275 if n.children == nil { 276 n.children = make([]*node, blockSize) 277 } 278 p := s[0] % blockSize 279 c := n.children[p] 280 if c == nil { 281 c = &node{} 282 n.children[p] = c 283 } 284 if len(s) > 2 && c.values != nil { 285 log.Fatalf("triegen: insert(%U): found internal node with values", r) 286 } 287 n = c 288 } 289 if n.values == nil { 290 n.values = make([]uint64, blockSize) 291 } 292 if n.children != nil { 293 log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r) 294 } 295 n.values[s[0]-0x80] = value 296 } 297 298 // Size returns the number of bytes the generated trie will take to store. It 299 // needs to be exported as it is used in the templates. 300 func (b *builder) Size() int { 301 // Index blocks. 302 sz := len(b.IndexBlocks) * blockSize * b.IndexSize 303 304 // Skip the first compaction, which represents the normal value blocks, as 305 // its totalSize does not account for the ASCII blocks, which are managed 306 // separately. 307 sz += len(b.ValueBlocks) * blockSize * b.ValueSize 308 for _, c := range b.Compactions[1:] { 309 sz += c.totalSize 310 } 311 312 // TODO: this computation does not account for the fixed overhead of a using 313 // a compaction, either code or data. As for data, though, the typical 314 // overhead of data is in the order of bytes (2 bytes for cases). Further, 315 // the savings of using a compaction should anyway be substantial for it to 316 // be worth it. 317 318 // For multi-root tries, we also need to account for the handles. 319 if len(b.Trie) > 1 { 320 sz += 2 * b.IndexSize * len(b.Trie) 321 } 322 return sz 323 } 324 325 func (b *builder) build() { 326 // Compute the sizes of the values. 327 var vmax uint64 328 for _, t := range b.Trie { 329 vmax = maxValue(t.root, vmax) 330 } 331 b.ValueType, b.ValueSize = getIntType(vmax) 332 333 // Compute all block allocations. 334 // TODO: first compute the ASCII blocks for all tries and then the other 335 // nodes. ASCII blocks are more restricted in placement, as they require two 336 // blocks to be placed consecutively. Processing them first may improve 337 // sharing (at least one zero block can be expected to be saved.) 338 for _, t := range b.Trie { 339 b.Checksum += b.buildTrie(t) 340 } 341 342 // Compute the offsets for all the Compacters. 343 offset := uint32(0) 344 for i := range b.Compactions { 345 c := &b.Compactions[i] 346 c.Offset = offset 347 offset += c.maxHandle + 1 348 c.Cutoff = offset 349 } 350 351 // Compute the sizes of indexes. 352 // TODO: different byte positions could have different sizes. So far we have 353 // not found a case where this is beneficial. 354 imax := uint64(b.Compactions[len(b.Compactions)-1].Cutoff) 355 for _, ib := range b.IndexBlocks { 356 if x := uint64(ib.index.index); x > imax { 357 imax = x 358 } 359 } 360 b.IndexType, b.IndexSize = getIntType(imax) 361 } 362 363 func maxValue(n *node, max uint64) uint64 { 364 if n == nil { 365 return max 366 } 367 for _, c := range n.children { 368 max = maxValue(c, max) 369 } 370 for _, v := range n.values { 371 if max < v { 372 max = v 373 } 374 } 375 return max 376 } 377 378 func getIntType(v uint64) (string, int) { 379 switch { 380 case v < 1<<8: 381 return "uint8", 1 382 case v < 1<<16: 383 return "uint16", 2 384 case v < 1<<32: 385 return "uint32", 4 386 } 387 return "uint64", 8 388 } 389 390 const ( 391 blockSize = 64 392 393 // Subtract two blocks to offset 0x80, the first continuation byte. 394 blockOffset = 2 395 396 // Subtract three blocks to offset 0xC0, the first non-ASCII starter. 397 rootBlockOffset = 3 398 ) 399 400 var crcTable = crc64.MakeTable(crc64.ISO) 401 402 func (b *builder) buildTrie(t *Trie) uint64 { 403 n := t.root 404 405 // Get the ASCII offset. For the first trie, the ASCII block will be at 406 // position 0. 407 hasher := crc64.New(crcTable) 408 binary.Write(hasher, binary.BigEndian, n.values) 409 hash := hasher.Sum64() 410 411 v, ok := b.asciiBlockIdx[hash] 412 if !ok { 413 v = len(b.ValueBlocks) 414 b.asciiBlockIdx[hash] = v 415 416 b.ValueBlocks = append(b.ValueBlocks, n.values[:blockSize], n.values[blockSize:]) 417 if v == 0 { 418 // Add the zero block at position 2 so that it will be assigned a 419 // zero reference in the lookup blocks. 420 // TODO: always do this? This would allow us to remove a check from 421 // the trie lookup, but at the expense of extra space. Analyze 422 // performance for unicode/norm. 423 b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize)) 424 } 425 } 426 t.ASCIIIndex = v 427 428 // Compute remaining offsets. 429 t.Checksum = b.computeOffsets(n, true) 430 // We already subtracted the normal blockOffset from the index. Subtract the 431 // difference for starter bytes. 432 t.StarterIndex = n.index.index - (rootBlockOffset - blockOffset) 433 return t.Checksum 434 } 435 436 func (b *builder) computeOffsets(n *node, root bool) uint64 { 437 // For the first trie, the root lookup block will be at position 3, which is 438 // the offset for UTF-8 non-ASCII starter bytes. 439 first := len(b.IndexBlocks) == rootBlockOffset 440 if first { 441 b.IndexBlocks = append(b.IndexBlocks, n) 442 } 443 444 // We special-case the cases where all values recursively are 0. This allows 445 // for the use of a zero block to which all such values can be directed. 446 hash := uint64(0) 447 if n.children != nil || n.values != nil { 448 hasher := crc64.New(crcTable) 449 for _, c := range n.children { 450 var v uint64 451 if c != nil { 452 v = b.computeOffsets(c, false) 453 } 454 binary.Write(hasher, binary.BigEndian, v) 455 } 456 binary.Write(hasher, binary.BigEndian, n.values) 457 hash = hasher.Sum64() 458 } 459 460 if first { 461 b.indexBlockIdx[hash] = rootBlockOffset - blockOffset 462 } 463 464 // Compacters don't apply to internal nodes. 465 if n.children != nil { 466 v, ok := b.indexBlockIdx[hash] 467 if !ok { 468 v = len(b.IndexBlocks) - blockOffset 469 b.IndexBlocks = append(b.IndexBlocks, n) 470 b.indexBlockIdx[hash] = v 471 } 472 n.index = nodeIndex{0, v} 473 } else { 474 h, ok := b.valueBlockIdx[hash] 475 if !ok { 476 bestI, bestSize := 0, blockSize*b.ValueSize 477 for i, c := range b.Compactions[1:] { 478 if sz, ok := c.c.Size(n.values); ok && bestSize > sz { 479 bestI, bestSize = i+1, sz 480 } 481 } 482 c := &b.Compactions[bestI] 483 c.totalSize += bestSize 484 v := c.c.Store(n.values) 485 if c.maxHandle < v { 486 c.maxHandle = v 487 } 488 h = nodeIndex{bestI, int(v)} 489 b.valueBlockIdx[hash] = h 490 } 491 n.index = h 492 } 493 return hash 494 }