github.com/go-xe2/third@v1.0.3/golang.org/x/text/internal/triegen/triegen.go (about)

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package triegen implements a code generator for a trie for associating
     6  // unsigned integer values with UTF-8 encoded runes.
     7  //
     8  // Many of the go.text packages use tries for storing per-rune information.  A
     9  // trie is especially useful if many of the runes have the same value. If this
    10  // is the case, many blocks can be expected to be shared allowing for
    11  // information on many runes to be stored in little space.
    12  //
    13  // As most of the lookups are done directly on []byte slices, the tries use the
    14  // UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to
    15  // runes and contributes a little bit to better performance. It also naturally
    16  // provides a fast path for ASCII.
    17  //
    18  // Space is also an issue. There are many code points defined in Unicode and as
    19  // a result tables can get quite large. So every byte counts. The triegen
    20  // package automatically chooses the smallest integer values to represent the
    21  // tables. Compacters allow further compression of the trie by allowing for
    22  // alternative representations of individual trie blocks.
    23  //
    24  // triegen allows generating multiple tries as a single structure. This is
    25  // useful when, for example, one wants to generate tries for several languages
    26  // that have a lot of values in common. Some existing libraries for
    27  // internationalization store all per-language data as a dynamically loadable
    28  // chunk. The go.text packages are designed with the assumption that the user
    29  // typically wants to compile in support for all supported languages, in line
    30  // with the approach common to Go to create a single standalone binary. The
    31  // multi-root trie approach can give significant storage savings in this
    32  // scenario.
    33  //
    34  // triegen generates both tables and code. The code is optimized to use the
    35  // automatically chosen data types. The following code is generated for a Trie
    36  // or multiple Tries named "foo":
    37  //	- type fooTrie
    38  //		The trie type.
    39  //
    40  //	- func newFooTrie(x int) *fooTrie
    41  //		Trie constructor, where x is the index of the trie passed to Gen.
    42  //
    43  //	- func (t *fooTrie) lookup(s []byte) (v uintX, sz int)
    44  //		The lookup method, where uintX is automatically chosen.
    45  //
    46  //	- func lookupString, lookupUnsafe and lookupStringUnsafe
    47  //		Variants of the above.
    48  //
    49  //	- var fooValues and fooIndex and any tables generated by Compacters.
    50  //		The core trie data.
    51  //
    52  //	- var fooTrieHandles
    53  //		Indexes of starter blocks in case of multiple trie roots.
    54  //
    55  // It is recommended that users test the generated trie by checking the returned
    56  // value for every rune. Such exhaustive tests are possible as the number of
    57  // runes in Unicode is limited.
    58  package triegen // import "github.com/go-xe2/third/golang.org/x/text/internal/triegen"
    59  
    60  // TODO: Arguably, the internally optimized data types would not have to be
    61  // exposed in the generated API. We could also investigate not generating the
    62  // code, but using it through a package. We would have to investigate the impact
    63  // on performance of making such change, though. For packages like unicode/norm,
    64  // small changes like this could tank performance.
    65  
    66  import (
    67  	"encoding/binary"
    68  	"fmt"
    69  	"hash/crc64"
    70  	"io"
    71  	"log"
    72  	"unicode/utf8"
    73  )
    74  
// builder builds a set of tries for associating values with runes. The set of
// tries can share common index and value blocks.
//
// A builder is created by Gen, configured by the Options passed to Gen, and
// then filled in by build. Exported fields are visible to the template-based
// code generator.
type builder struct {
	// Name is the name passed to Gen. It is used as a prefix of generated
	// identifiers (for example the Name+"Values" table).
	Name string

	// ValueType is the type of the trie values looked up.
	ValueType string

	// ValueSize is the byte size of the ValueType.
	ValueSize int

	// IndexType is the type of trie index values used for all UTF-8 bytes of
	// a rune except the last one.
	IndexType string

	// IndexSize is the byte size of the IndexType.
	IndexSize int

	// SourceType is used when generating the lookup functions. If the user
	// requests StringSupport, all lookup functions will be generated for
	// string input as well.
	SourceType string

	// Trie holds the trie roots passed to Gen.
	Trie []*Trie

	// IndexBlocks holds the internal (index) nodes in allocation order; Gen
	// seeds it with three empty nodes. ValueBlocks holds the plain value
	// blocks, starting with the ASCII blocks of the tries. Compactions holds
	// one entry per Compacter, where entry 0 is the default, uncompacted
	// representation. Checksum is the sum of the checksums of all tries.
	IndexBlocks []*node
	ValueBlocks [][]uint64
	Compactions []compaction
	Checksum    uint64

	// NOTE(review): ASCIIBlock and StarterBlock are not assigned anywhere in
	// the code visible here; presumably they are filled in for the template
	// elsewhere — confirm.
	ASCIIBlock   string
	StarterBlock string

	// These maps deduplicate blocks. Each maps the hash of a block to the
	// position that was assigned to the first block with that hash.
	indexBlockIdx map[uint64]int
	valueBlockIdx map[uint64]nodeIndex
	asciiBlockIdx map[uint64]int

	// Stats are used to fill out the template.
	Stats struct {
		NValueEntries int
		NValueBytes   int
		NIndexEntries int
		NIndexBytes   int
		NHandleBytes  int
	}

	// err is the first error recorded through setError.
	err error
}
   123  
// A nodeIndex encodes the index of a node, which is defined by the compaction
// which stores it and an index within the compaction. For internal nodes, the
// compaction is always 0.
type nodeIndex struct {
	// compaction is the position of the storing compaction in
	// builder.Compactions; index is the position within that compaction.
	compaction int
	index      int
}
   131  
// compaction keeps track of stats used for the compaction.
type compaction struct {
	// c is the Compacter that sizes and stores blocks for this compaction.
	// maxHandle is the largest handle c.Store has returned so far and
	// totalSize accumulates the sizes of the blocks stored through c.
	// NOTE(review): blocks is not written by any code visible here — confirm
	// whether it is still used.
	c         Compacter
	blocks    []*node
	maxHandle uint32
	totalSize int

	// Used by template-based generator and thus exported.
	// Offset and Cutoff are assigned in build: Offset is the first handle of
	// this compaction in the combined handle space and Cutoff is one past its
	// last handle. Handler is the Go expression emitted to look up a value.
	Cutoff  uint32
	Offset  uint32
	Handler string
}
   144  
   145  func (b *builder) setError(err error) {
   146  	if b.err == nil {
   147  		b.err = err
   148  	}
   149  }
   150  
// An Option can be passed to Gen. It is applied to the builder before any
// trie is built; a non-nil error makes Gen return that error.
type Option func(b *builder) error
   153  
   154  // Compact configures the trie generator to use the given Compacter.
   155  func Compact(c Compacter) Option {
   156  	return func(b *builder) error {
   157  		b.Compactions = append(b.Compactions, compaction{
   158  			c:       c,
   159  			Handler: c.Handler() + "(n, b)"})
   160  		return nil
   161  	}
   162  }
   163  
   164  // Gen writes Go code for a shared trie lookup structure to w for the given
   165  // Tries. The generated trie type will be called nameTrie. newNameTrie(x) will
   166  // return the *nameTrie for tries[x]. A value can be looked up by using one of
   167  // the various lookup methods defined on nameTrie. It returns the table size of
   168  // the generated trie.
   169  func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) {
   170  	// The index contains two dummy blocks, followed by the zero block. The zero
   171  	// block is at offset 0x80, so that the offset for the zero block for
   172  	// continuation bytes is 0.
   173  	b := &builder{
   174  		Name:        name,
   175  		Trie:        tries,
   176  		IndexBlocks: []*node{{}, {}, {}},
   177  		Compactions: []compaction{{
   178  			Handler: name + "Values[n<<6+uint32(b)]",
   179  		}},
   180  		// The 0 key in indexBlockIdx and valueBlockIdx is the hash of the zero
   181  		// block.
   182  		indexBlockIdx: map[uint64]int{0: 0},
   183  		valueBlockIdx: map[uint64]nodeIndex{0: {}},
   184  		asciiBlockIdx: map[uint64]int{},
   185  	}
   186  	b.Compactions[0].c = (*simpleCompacter)(b)
   187  
   188  	for _, f := range opts {
   189  		if err := f(b); err != nil {
   190  			return 0, err
   191  		}
   192  	}
   193  	b.build()
   194  	if b.err != nil {
   195  		return 0, b.err
   196  	}
   197  	if err = b.print(w); err != nil {
   198  		return 0, err
   199  	}
   200  	return b.Size(), nil
   201  }
   202  
// A Trie represents a single root node of a trie. A builder may build several
// overlapping tries at once.
type Trie struct {
	// root is the root node; NewTrie creates it with the ASCII value table
	// and an empty child table.
	root *node

	hiddenTrie
}
   210  
// hiddenTrie contains values we want to be visible to the template generator,
// but hidden from the API documentation.
type hiddenTrie struct {
	// Name is the name passed to NewTrie. Checksum, ASCIIIndex and
	// StarterIndex are filled in by buildTrie: Checksum is the hash of the
	// trie contents, ASCIIIndex is the position of the trie's first ASCII
	// block in the value blocks, and StarterIndex is the root block index
	// adjusted for use with UTF-8 starter bytes.
	Name         string
	Checksum     uint64
	ASCIIIndex   int
	StarterIndex int
}
   219  
   220  // NewTrie returns a new trie root.
   221  func NewTrie(name string) *Trie {
   222  	return &Trie{
   223  		&node{
   224  			children: make([]*node, blockSize),
   225  			values:   make([]uint64, utf8.RuneSelf),
   226  		},
   227  		hiddenTrie{Name: name},
   228  	}
   229  }
   230  
   231  // Gen is a convenience wrapper around the Gen func passing t as the only trie
   232  // and uses the name passed to NewTrie. It returns the size of the generated
   233  // tables.
   234  func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) {
   235  	return Gen(w, t.Name, []*Trie{t}, opts...)
   236  }
   237  
// node is a node of the intermediate trie structure.
type node struct {
	// children holds this node's children. It is always of length 64.
	// A child node may be nil.
	children []*node

	// values contains the values of this node. If it is non-nil, this node is
	// either a root or leaf node:
	// For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F].
	// For leaf nodes, len(values) ==  64 and it maps the bytes in [0x80, 0xBF].
	values []uint64

	// index is the block position assigned to this node by computeOffsets.
	index nodeIndex
}
   252  
   253  // Insert associates value with the given rune. Insert will panic if a non-zero
   254  // value is passed for an invalid rune.
   255  func (t *Trie) Insert(r rune, value uint64) {
   256  	if value == 0 {
   257  		return
   258  	}
   259  	s := string(r)
   260  	if []rune(s)[0] != r && value != 0 {
   261  		// Note: The UCD tables will always assign what amounts to a zero value
   262  		// to a surrogate. Allowing a zero value for an illegal rune allows
   263  		// users to iterate over [0..MaxRune] without having to explicitly
   264  		// exclude surrogates, which would be tedious.
   265  		panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r))
   266  	}
   267  	if len(s) == 1 {
   268  		// It is a root node value (ASCII).
   269  		t.root.values[s[0]] = value
   270  		return
   271  	}
   272  
   273  	n := t.root
   274  	for ; len(s) > 1; s = s[1:] {
   275  		if n.children == nil {
   276  			n.children = make([]*node, blockSize)
   277  		}
   278  		p := s[0] % blockSize
   279  		c := n.children[p]
   280  		if c == nil {
   281  			c = &node{}
   282  			n.children[p] = c
   283  		}
   284  		if len(s) > 2 && c.values != nil {
   285  			log.Fatalf("triegen: insert(%U): found internal node with values", r)
   286  		}
   287  		n = c
   288  	}
   289  	if n.values == nil {
   290  		n.values = make([]uint64, blockSize)
   291  	}
   292  	if n.children != nil {
   293  		log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r)
   294  	}
   295  	n.values[s[0]-0x80] = value
   296  }
   297  
   298  // Size returns the number of bytes the generated trie will take to store. It
   299  // needs to be exported as it is used in the templates.
   300  func (b *builder) Size() int {
   301  	// Index blocks.
   302  	sz := len(b.IndexBlocks) * blockSize * b.IndexSize
   303  
   304  	// Skip the first compaction, which represents the normal value blocks, as
   305  	// its totalSize does not account for the ASCII blocks, which are managed
   306  	// separately.
   307  	sz += len(b.ValueBlocks) * blockSize * b.ValueSize
   308  	for _, c := range b.Compactions[1:] {
   309  		sz += c.totalSize
   310  	}
   311  
   312  	// TODO: this computation does not account for the fixed overhead of a using
   313  	// a compaction, either code or data. As for data, though, the typical
   314  	// overhead of data is in the order of bytes (2 bytes for cases). Further,
   315  	// the savings of using a compaction should anyway be substantial for it to
   316  	// be worth it.
   317  
   318  	// For multi-root tries, we also need to account for the handles.
   319  	if len(b.Trie) > 1 {
   320  		sz += 2 * b.IndexSize * len(b.Trie)
   321  	}
   322  	return sz
   323  }
   324  
   325  func (b *builder) build() {
   326  	// Compute the sizes of the values.
   327  	var vmax uint64
   328  	for _, t := range b.Trie {
   329  		vmax = maxValue(t.root, vmax)
   330  	}
   331  	b.ValueType, b.ValueSize = getIntType(vmax)
   332  
   333  	// Compute all block allocations.
   334  	// TODO: first compute the ASCII blocks for all tries and then the other
   335  	// nodes. ASCII blocks are more restricted in placement, as they require two
   336  	// blocks to be placed consecutively. Processing them first may improve
   337  	// sharing (at least one zero block can be expected to be saved.)
   338  	for _, t := range b.Trie {
   339  		b.Checksum += b.buildTrie(t)
   340  	}
   341  
   342  	// Compute the offsets for all the Compacters.
   343  	offset := uint32(0)
   344  	for i := range b.Compactions {
   345  		c := &b.Compactions[i]
   346  		c.Offset = offset
   347  		offset += c.maxHandle + 1
   348  		c.Cutoff = offset
   349  	}
   350  
   351  	// Compute the sizes of indexes.
   352  	// TODO: different byte positions could have different sizes. So far we have
   353  	// not found a case where this is beneficial.
   354  	imax := uint64(b.Compactions[len(b.Compactions)-1].Cutoff)
   355  	for _, ib := range b.IndexBlocks {
   356  		if x := uint64(ib.index.index); x > imax {
   357  			imax = x
   358  		}
   359  	}
   360  	b.IndexType, b.IndexSize = getIntType(imax)
   361  }
   362  
   363  func maxValue(n *node, max uint64) uint64 {
   364  	if n == nil {
   365  		return max
   366  	}
   367  	for _, c := range n.children {
   368  		max = maxValue(c, max)
   369  	}
   370  	for _, v := range n.values {
   371  		if max < v {
   372  			max = v
   373  		}
   374  	}
   375  	return max
   376  }
   377  
   378  func getIntType(v uint64) (string, int) {
   379  	switch {
   380  	case v < 1<<8:
   381  		return "uint8", 1
   382  	case v < 1<<16:
   383  		return "uint16", 2
   384  	case v < 1<<32:
   385  		return "uint32", 4
   386  	}
   387  	return "uint64", 8
   388  }
   389  
const (
	// blockSize is the number of entries in a single index or value block.
	blockSize = 64

	// Subtract two blocks to offset 0x80, the first continuation byte.
	// Index block references are stored with this offset already applied.
	blockOffset = 2

	// Subtract three blocks to offset 0xC0, the first non-ASCII starter.
	// This is also the position of the first trie's root block in the index.
	rootBlockOffset = 3
)
   399  
// crcTable is the CRC-64 table (ISO polynomial) used to hash blocks so that
// identical blocks can be shared.
var crcTable = crc64.MakeTable(crc64.ISO)
   401  
   402  func (b *builder) buildTrie(t *Trie) uint64 {
   403  	n := t.root
   404  
   405  	// Get the ASCII offset. For the first trie, the ASCII block will be at
   406  	// position 0.
   407  	hasher := crc64.New(crcTable)
   408  	binary.Write(hasher, binary.BigEndian, n.values)
   409  	hash := hasher.Sum64()
   410  
   411  	v, ok := b.asciiBlockIdx[hash]
   412  	if !ok {
   413  		v = len(b.ValueBlocks)
   414  		b.asciiBlockIdx[hash] = v
   415  
   416  		b.ValueBlocks = append(b.ValueBlocks, n.values[:blockSize], n.values[blockSize:])
   417  		if v == 0 {
   418  			// Add the zero block at position 2 so that it will be assigned a
   419  			// zero reference in the lookup blocks.
   420  			// TODO: always do this? This would allow us to remove a check from
   421  			// the trie lookup, but at the expense of extra space. Analyze
   422  			// performance for unicode/norm.
   423  			b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize))
   424  		}
   425  	}
   426  	t.ASCIIIndex = v
   427  
   428  	// Compute remaining offsets.
   429  	t.Checksum = b.computeOffsets(n, true)
   430  	// We already subtracted the normal blockOffset from the index. Subtract the
   431  	// difference for starter bytes.
   432  	t.StarterIndex = n.index.index - (rootBlockOffset - blockOffset)
   433  	return t.Checksum
   434  }
   435  
// computeOffsets assigns a block position to n and, recursively, to all of its
// descendants, and stores the result in each node's index field. It returns a
// hash of the subtree rooted at n, which is 0 for a node that has neither
// children nor values. Nodes with equal hashes share a single block.
//
// Nodes with children are allocated as index blocks. All other nodes are
// stored through whichever compaction reports the smallest size for their
// values, falling back to the plain value blocks of compaction 0.
//
// The root parameter is not used by the body.
func (b *builder) computeOffsets(n *node, root bool) uint64 {
	// For the first trie, the root lookup block will be at position 3, which is
	// the offset for UTF-8 non-ASCII starter bytes.
	first := len(b.IndexBlocks) == rootBlockOffset
	if first {
		b.IndexBlocks = append(b.IndexBlocks, n)
	}

	// We special-case the cases where all values recursively are 0. This allows
	// for the use of a zero block to which all such values can be directed.
	hash := uint64(0)
	if n.children != nil || n.values != nil {
		hasher := crc64.New(crcTable)
		// The hash covers the hashes of all children, with 0 standing in for
		// a missing child, followed by the node's own values.
		for _, c := range n.children {
			var v uint64
			if c != nil {
				v = b.computeOffsets(c, false)
			}
			binary.Write(hasher, binary.BigEndian, v)
		}
		binary.Write(hasher, binary.BigEndian, n.values)
		hash = hasher.Sum64()
	}

	if first {
		// The first root was already appended above; register its position so
		// that the lookup below finds it instead of allocating it again.
		b.indexBlockIdx[hash] = rootBlockOffset - blockOffset
	}

	// Compacters don't apply to internal nodes.
	if n.children != nil {
		v, ok := b.indexBlockIdx[hash]
		if !ok {
			// Index references are stored relative to blockOffset.
			v = len(b.IndexBlocks) - blockOffset
			b.IndexBlocks = append(b.IndexBlocks, n)
			b.indexBlockIdx[hash] = v
		}
		n.index = nodeIndex{0, v}
	} else {
		h, ok := b.valueBlockIdx[hash]
		if !ok {
			// Start from the cost of a plain block (compaction 0) and switch
			// to a Compacter only if it is strictly smaller.
			bestI, bestSize := 0, blockSize*b.ValueSize
			for i, c := range b.Compactions[1:] {
				if sz, ok := c.c.Size(n.values); ok && bestSize > sz {
					// i indexes Compactions[1:], hence the +1.
					bestI, bestSize = i+1, sz
				}
			}
			c := &b.Compactions[bestI]
			c.totalSize += bestSize
			v := c.c.Store(n.values)
			if c.maxHandle < v {
				c.maxHandle = v
			}
			h = nodeIndex{bestI, int(v)}
			b.valueBlockIdx[hash] = h
		}
		n.index = h
	}
	return hash
}