github.com/liquid-dev/text@v0.3.3-liquid/internal/cldrtree/cldrtree.go (about)

     1  // Copyright 2017 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package cldrtree builds and generates a CLDR index file, including all
     6  // inheritance.
     7  //
     8  package cldrtree
     9  
    10  //go:generate go test -gen
    11  
    12  // cldrtree stores CLDR data in a tree-like structure called Tree. In the CLDR
    13  // data each branch in the tree is indicated by either an element name or an
    14  // attribute value. A Tree does not distinguish between these two cases, but
    15  // rather assumes that all branches can be accessed by an enum with a compact
    16  // range of positive integer values starting from 0.
    17  //
    18  // Each Tree consists of three parts:
    19  //    - a slice mapping compact language identifiers to an offset into a set of
    20  //      indices,
    21  //    - a set of indices, stored as a large blob of uint16 values that encode
    22  //      the actual tree structure of data, and
    23  //    - a set of buckets that each holds a collection of strings.
    24  // each of which is explained in more detail below.
    25  //
    26  //
    27  // Tree lookup
    28  // A tree lookup is done by providing a locale and a "path", which is a
    29  // sequence of enum values. The search starts with getting the index for the
    30  // given locale and then incrementally jumping into the index using the path
    31  // values. If an element cannot be found in the index, the search starts anew
    32  // for the locale's parent locale. The path may change during lookup by means
    33  // of aliasing, described below.
    34  //
    35  // Buckets
    36  // Buckets hold the actual string data of the leaf values of the CLDR tree.
    37  // This data is stored in buckets, rather than one large string, for multiple
    38  // reasons:
    39  //   - it allows representing leaf values more compactly, by storing all leaf
    40  //     values in a single bucket and then needing only a uint16 to index
    41  //     into this bucket for all leaf values,
    42  //   - (TBD) allow multiple trees to share subsets of buckets, mostly to allow
    43  //     linking in a smaller amount of data if only a subset of the buckets is
    44  //     needed,
    45  //   - to be nice to go fmt and the compiler.
    46  //
    47  // indices
    48  // An index is a slice of uint16 for which the values are interpreted in one of
    49  // two ways: as a node or a set of leaf values.
    50  // A set of leaf values has the following form:
    51  //      <max_size>, <bucket>, <offset>...
    52  // max_size indicates the maximum enum value for which an offset is defined.
    53  // An offset value of 0xFFFF (missingValue) also indicates an undefined value.
    54  // If defined, offset indicates the offset within the given bucket of the string.
    55  // A node value has the following form:
    56  //      <max_size>, <offset_or_alias>...
    57  // max_size indicates the maximum value for which an offset is defined.
    58  // A missing offset may also be indicated with 0. If the high bit (0x8000, or
    59  // inheritMask) is not set, the offset points to the offset within the index
    60  // for the current locale.
    61  // An offset with high bit set is an alias. In this case the uint16 has the form
    62  //       bits:
    63  //         15: 1
    64  //      14-12: negative offset into path relative to current position
    65  //       0-11: new enum value for path element.
    66  // On encountering an alias, the path is modified accordingly and the lookup is
    67  // restarted for the given locale.
    68  
    69  import (
    70  	"fmt"
    71  	"reflect"
    72  	"regexp"
    73  	"strings"
    74  	"unicode/utf8"
    75  
    76  	"github.com/liquid-dev/text/internal/gen"
    77  	"github.com/liquid-dev/text/language"
    78  	"github.com/liquid-dev/text/unicode/cldr"
    79  )
    80  
    81  // TODO:
    82  // - allow two Trees to share the same set of buckets.
    83  
// A Builder allows storing CLDR data in compact form.
//
// Strings handed to the Builder are length-prefixed, deduplicated and packed
// into buckets; the per-locale tree is collected through the Index values
// returned by Locale.
type Builder struct {
	// NOTE(review): table is not referenced in this part of the file.
	table []string

	// rootMeta is the metaData given to the root Index of every locale.
	rootMeta    *metaData
	// locales holds one entry per call to Locale, in call order.
	locales     []locale
	// strToBucket maps a length-prefixed string to the place it was most
	// recently stored.
	strToBucket map[string]stringInfo
	// buckets is the string storage; each bucket is a concatenation of
	// length-prefixed strings.
	buckets     [][]byte
	enums       []*enum
	// err is the first error recorded through setError.
	err         error

	// Stats
	// size counts the bytes actually appended to buckets.
	size        int
	// sizeAll counts the bytes of every string passed through makeString,
	// including duplicates that were not stored again.
	sizeAll     int
	// bucketWaste counts the bytes stored a second time because a string
	// was needed in a bucket other than the one it already lived in.
	bucketWaste int
}
   100  
const (
	// maxBucketSize is the limit addString checks before it appends to the
	// last bucket; a string that would reach it starts a new bucket instead.
	maxBucketSize = 8 * 1024 // 8K
	// maxStrlen is the longest string, in bytes, that makeString keeps
	// without truncating. The length is stored in a single prefix byte.
	maxStrlen     = 254      // allow 0xFF sentinel
)
   105  
   106  func (b *Builder) setError(err error) {
   107  	if b.err == nil {
   108  		b.err = err
   109  	}
   110  }
   111  
   112  func (b *Builder) addString(data string) stringInfo {
   113  	data = b.makeString(data)
   114  	info, ok := b.strToBucket[data]
   115  	if !ok {
   116  		b.size += len(data)
   117  		x := len(b.buckets) - 1
   118  		bucket := b.buckets[x]
   119  		if len(bucket)+len(data) < maxBucketSize {
   120  			info.bucket = uint16(x)
   121  			info.bucketPos = uint16(len(bucket))
   122  			b.buckets[x] = append(bucket, data...)
   123  		} else {
   124  			info.bucket = uint16(len(b.buckets))
   125  			info.bucketPos = 0
   126  			b.buckets = append(b.buckets, []byte(data))
   127  		}
   128  		b.strToBucket[data] = info
   129  	}
   130  	return info
   131  }
   132  
   133  func (b *Builder) addStringToBucket(data string, bucket uint16) stringInfo {
   134  	data = b.makeString(data)
   135  	info, ok := b.strToBucket[data]
   136  	if !ok || info.bucket != bucket {
   137  		if ok {
   138  			b.bucketWaste += len(data)
   139  		}
   140  		b.size += len(data)
   141  		bk := b.buckets[bucket]
   142  		info.bucket = bucket
   143  		info.bucketPos = uint16(len(bk))
   144  		b.buckets[bucket] = append(bk, data...)
   145  		b.strToBucket[data] = info
   146  	}
   147  	return info
   148  }
   149  
   150  func (b *Builder) makeString(data string) string {
   151  	if len(data) > maxStrlen {
   152  		b.setError(fmt.Errorf("string %q exceeds maximum length of %d", data, maxStrlen))
   153  		data = data[:maxStrlen]
   154  		for i := len(data) - 1; i > len(data)-4; i-- {
   155  			if utf8.RuneStart(data[i]) {
   156  				data = data[:i]
   157  				break
   158  			}
   159  		}
   160  	}
   161  	data = string([]byte{byte(len(data))}) + data
   162  	b.sizeAll += len(data)
   163  	return data
   164  }
   165  
// stringInfo records where a length-prefixed string was stored.
type stringInfo struct {
	// NOTE(review): bufferPos is never assigned in this part of the file.
	bufferPos uint32
	// bucket is the index of the bucket that holds the string.
	bucket    uint16
	// bucketPos is the byte offset of the string's length prefix within
	// that bucket.
	bucketPos uint16
}
   171  
   172  // New creates a new Builder.
   173  func New(tableName string) *Builder {
   174  	b := &Builder{
   175  		strToBucket: map[string]stringInfo{},
   176  		buckets:     [][]byte{nil}, // initialize with first bucket.
   177  	}
   178  	b.rootMeta = &metaData{
   179  		b:        b,
   180  		typeInfo: &typeInfo{},
   181  	}
   182  	return b
   183  }
   184  
   185  // Gen writes all the tables and types for the collected data.
   186  func (b *Builder) Gen(w *gen.CodeWriter) error {
   187  	t, err := build(b)
   188  	if err != nil {
   189  		return err
   190  	}
   191  	return generate(b, t, w)
   192  }
   193  
   194  // GenTestData generates tables useful for testing data generated with Gen.
   195  func (b *Builder) GenTestData(w *gen.CodeWriter) error {
   196  	return generateTestData(b, w)
   197  }
   198  
// locale pairs a language tag with the root Index that was created for it by
// Builder.Locale.
type locale struct {
	tag  language.Tag
	root *Index
}
   203  
   204  // Locale creates an index for the given locale.
   205  func (b *Builder) Locale(t language.Tag) *Index {
   206  	index := &Index{
   207  		meta: b.rootMeta,
   208  	}
   209  	b.locales = append(b.locales, locale{tag: t, root: index})
   210  	return index
   211  }
   212  
// An Index holds a map of either leaf values or other indices.
//
// The two are mutually exclusive: adding a sub-Index to an Index that has
// values, or a value to an Index that has sub-indices, panics.
type Index struct {
	// meta describes this Index and links back to the owning Builder.
	meta *metaData

	// subIndex holds the child indices, in the order they were created.
	subIndex []*Index
	// values holds the leaf values, in the order they were set.
	values   []keyValue
}
   220  
   221  func (i *Index) setError(err error) { i.meta.b.setError(err) }
   222  
// keyValue is a single leaf of an Index: the enum index of its key together
// with the location of its string data.
type keyValue struct {
	key   enumIndex
	value stringInfo
}
   227  
// Element is a CLDR XML element.
type Element interface {
	// GetCommon returns the cldr.Common of the element. In this file it is
	// read for the element name, the type and alt attributes, the alias
	// and the data.
	GetCommon() *cldr.Common
}
   232  
   233  // Index creates a subindex where the type and enum values are not shared
   234  // with siblings by default. The name is derived from the elem. If elem is
   235  // an alias reference, the alias will be resolved and linked. If elem is nil
   236  // Index returns nil.
   237  func (i *Index) Index(elem Element, opt ...Option) *Index {
   238  	if elem == nil || reflect.ValueOf(elem).IsNil() {
   239  		return nil
   240  	}
   241  	c := elem.GetCommon()
   242  	o := &options{
   243  		parent: i,
   244  		name:   c.GetCommon().Element(),
   245  	}
   246  	o.fill(opt)
   247  	o.setAlias(elem)
   248  	return i.subIndexForKey(o)
   249  }
   250  
   251  // IndexWithName is like Section but derives the name from the given name.
   252  func (i *Index) IndexWithName(name string, opt ...Option) *Index {
   253  	o := &options{parent: i, name: name}
   254  	o.fill(opt)
   255  	return i.subIndexForKey(o)
   256  }
   257  
   258  // IndexFromType creates a subindex the value of tye type attribute as key. It
   259  // will also configure the Index to share the enumeration values with all
   260  // sibling values. If elem is an alias, it will be resolved and linked.
   261  func (i *Index) IndexFromType(elem Element, opts ...Option) *Index {
   262  	o := &options{
   263  		parent: i,
   264  		name:   elem.GetCommon().Type,
   265  	}
   266  	o.fill(opts)
   267  	o.setAlias(elem)
   268  	useSharedType()(o)
   269  	return i.subIndexForKey(o)
   270  }
   271  
   272  // IndexFromAlt creates a subindex the value of tye alt attribute as key. It
   273  // will also configure the Index to share the enumeration values with all
   274  // sibling values. If elem is an alias, it will be resolved and linked.
   275  func (i *Index) IndexFromAlt(elem Element, opts ...Option) *Index {
   276  	o := &options{
   277  		parent: i,
   278  		name:   elem.GetCommon().Alt,
   279  	}
   280  	o.fill(opts)
   281  	o.setAlias(elem)
   282  	useSharedType()(o)
   283  	return i.subIndexForKey(o)
   284  }
   285  
   286  func (i *Index) subIndexForKey(opts *options) *Index {
   287  	key := opts.name
   288  	if len(i.values) > 0 {
   289  		panic(fmt.Errorf("cldrtree: adding Index for %q when value already exists", key))
   290  	}
   291  	meta := i.meta.sub(key, opts)
   292  	for _, x := range i.subIndex {
   293  		if x.meta == meta {
   294  			return x
   295  		}
   296  	}
   297  	if alias := opts.alias; alias != nil {
   298  		if a := alias.GetCommon().Alias; a != nil {
   299  			if a.Source != "locale" {
   300  				i.setError(fmt.Errorf("cldrtree: non-locale alias not supported %v", a.Path))
   301  			}
   302  			if meta.inheritOffset < 0 {
   303  				i.setError(fmt.Errorf("cldrtree: alias was already set %v", a.Path))
   304  			}
   305  			path := a.Path
   306  			for ; strings.HasPrefix(path, "../"); path = path[len("../"):] {
   307  				meta.inheritOffset--
   308  			}
   309  			m := aliasRe.FindStringSubmatch(path)
   310  			if m == nil {
   311  				i.setError(fmt.Errorf("cldrtree: could not parse alias %q", a.Path))
   312  			} else {
   313  				key := m[4]
   314  				if key == "" {
   315  					key = m[1]
   316  				}
   317  				meta.inheritIndex = key
   318  			}
   319  		}
   320  	}
   321  	x := &Index{meta: meta}
   322  	i.subIndex = append(i.subIndex, x)
   323  	return x
   324  }
   325  
// aliasRe matches the first step of an alias path once any leading "../"
// have been stripped: an element name made of letters, optionally followed by
// one [@attr='value'] predicate. Submatch 1 is the element name, submatch 3
// the attribute name and submatch 4 the attribute value.
var aliasRe = regexp.MustCompile(`^([a-zA-Z]+)(\[@([a-zA-Z-]+)='([a-zA-Z-]+)'\])?`)
   327  
   328  // SetValue sets the value, the data from a CLDR XML element, for the given key.
   329  func (i *Index) SetValue(key string, value Element, opt ...Option) {
   330  	if len(i.subIndex) > 0 {
   331  		panic(fmt.Errorf("adding value for key %q when index already exists", key))
   332  	}
   333  	o := &options{parent: i}
   334  	o.fill(opt)
   335  	c := value.GetCommon()
   336  	if c.Alias != nil {
   337  		i.setError(fmt.Errorf("cldrtree: alias not supported for SetValue %v", c.Alias.Path))
   338  	}
   339  	i.setValue(key, c.Data(), o)
   340  }
   341  
   342  func (i *Index) setValue(key, data string, o *options) {
   343  	index, _ := i.meta.typeInfo.lookupSubtype(key, o)
   344  	kv := keyValue{key: index}
   345  	if len(i.values) > 0 {
   346  		// Add string to the same bucket as the other values.
   347  		bucket := i.values[0].value.bucket
   348  		kv.value = i.meta.b.addStringToBucket(data, bucket)
   349  	} else {
   350  		kv.value = i.meta.b.addString(data)
   351  	}
   352  	i.values = append(i.values, kv)
   353  }