github.com/go-xe2/third@v1.0.3/golang.org/x/text/internal/cldrtree/cldrtree.go (about)

     1  // Copyright 2017 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package cldrtree builds and generates a CLDR index file, including all
     6  // inheritance.
     7  //
     8  package cldrtree
     9  
    10  // cldrtree stores CLDR data in a tree-like structure called Tree. In the CLDR
    11  // data each branch in the tree is indicated by either an element name or an
    12  // attribute value. A Tree does not distinguish between these two cases, but
    13  // rather assumes that all branches can be accessed by an enum with a compact
    14  // range of positive integer values starting from 0.
    15  //
    16  // Each Tree consists of three parts:
    17  //    - a slice mapping compact language identifiers to an offset into a set of
    18  //      indices,
    19  //    - a set of indices, stored as a large blob of uint16 values that encode
    20  //      the actual tree structure of data, and
    21  //    - a set of buckets that each holds a collection of strings.
    22  // each of which is explained in more detail below.
    23  //
    24  //
    25  // Tree lookup
    26  // A tree lookup is done by providing a locale and a "path", which is a
    27  // sequence of enum values. The search starts with getting the index for the
    28  // given locale and then incrementally jumping into the index using the path
    29  // values. If an element cannot be found in the index, the search starts anew
    30  // for the locale's parent locale. The path may change during lookup by means
    31  // of aliasing, described below.
    32  //
    33  // Buckets
    34  // Buckets hold the actual string data of the leaf values of the CLDR tree.
    35  // This data is stored in buckets, rather than one large string, for multiple
    36  // reasons:
    37  //   - it allows representing leaf values more compactly, by storing all leaf
    38  //     values in a single bucket and then needing only a uint16 to index
    39  //     into this bucket for all leaf values,
    40  //   - (TBD) allow multiple trees to share subsets of buckets, mostly to allow
    41  //     linking in a smaller amount of data if only a subset of the buckets is
    42  //     needed,
    43  //   - to be nice to go fmt and the compiler.
    44  //
    45  // indices
    46  // An index is a slice of uint16 for which the values are interpreted in one of
    47  // two ways: as a node or a set of leaf values.
    48  // A set of leaf values has the following form:
    49  //      <max_size>, <bucket>, <offset>...
    50  // max_size indicates the maximum enum value for which an offset is defined.
    51  // An offset value of 0xFFFF (missingValue) also indicates an undefined value.
    52  // If defined, offset indicates the offset of the string within the given bucket.
    53  // A node value has the following form:
    54  //      <max_size>, <offset_or_alias>...
    55  // max_size indicates the maximum value for which an offset is defined.
    56  // A missing offset may also be indicated with 0. If the high bit (0x8000, or
    57  // inheritMask) is not set, the offset points to the offset within the index
    58  // for the current locale.
    59  // An offset with high bit set is an alias. In this case the uint16 has the form
    60  //       bits:
    61  //         15: 1
    62  //      14-12: negative offset into path relative to current position
    63  //       0-11: new enum value for path element.
    64  // On encountering an alias, the path is modified accordingly and the lookup is
    65  // restarted for the given locale.
    66  
    67  import (
    68  	"fmt"
    69  	"reflect"
    70  	"regexp"
    71  	"strings"
    72  	"unicode/utf8"
    73  
    74  	"github.com/go-xe2/third/golang.org/x/text/internal/gen"
    75  	"github.com/go-xe2/third/golang.org/x/text/language"
    76  	"github.com/go-xe2/third/golang.org/x/text/unicode/cldr"
    77  )
    78  
    79  // TODO:
    80  // - allow two Trees to share the same set of buckets.
    81  
// A Builder allows storing CLDR data in compact form.
//
// Data is added per locale through the Index returned by Locale; strings are
// de-duplicated and packed into byte buckets as they are added.
type Builder struct {
	// table is not referenced by the code in this part of the file.
	table []string

	// rootMeta is the metaData shared by the root Index of every locale.
	rootMeta    *metaData
	// locales holds one entry for each call to Locale, in call order.
	locales     []locale
	// strToBucket maps a length-prefixed string to the location where it was
	// most recently stored.
	strToBucket map[string]stringInfo
	// buckets holds the string data; each bucket is a concatenation of
	// length-prefixed strings.
	buckets     [][]byte
	// enums is not referenced by the code in this part of the file.
	enums       []*enum
	// err is the first error recorded with setError, if any.
	err         error

	// Stats
	// size is the total number of bytes appended to buckets.
	size        int
	// sizeAll is the total size of all strings passed through makeString,
	// including strings that turned out to be duplicates.
	sizeAll     int
	// bucketWaste counts bytes stored again because the string already
	// existed, but in a different bucket than the one requested.
	bucketWaste int
}
    98  
// Limits on the stored string data.
//
// maxBucketSize bounds buckets filled by addString: a new string is appended
// to the current bucket only if the bucket stays below this size, otherwise
// a new bucket is started.
//
// maxStrlen is the longest string, in bytes, that makeString stores without
// truncation. The length is stored as a single byte in front of the string.
const (
	maxBucketSize = 8 * 1024 // 8K
	maxStrlen     = 254      // allow 0xFF sentinel
)
   103  
   104  func (b *Builder) setError(err error) {
   105  	if b.err == nil {
   106  		b.err = err
   107  	}
   108  }
   109  
   110  func (b *Builder) addString(data string) stringInfo {
   111  	data = b.makeString(data)
   112  	info, ok := b.strToBucket[data]
   113  	if !ok {
   114  		b.size += len(data)
   115  		x := len(b.buckets) - 1
   116  		bucket := b.buckets[x]
   117  		if len(bucket)+len(data) < maxBucketSize {
   118  			info.bucket = uint16(x)
   119  			info.bucketPos = uint16(len(bucket))
   120  			b.buckets[x] = append(bucket, data...)
   121  		} else {
   122  			info.bucket = uint16(len(b.buckets))
   123  			info.bucketPos = 0
   124  			b.buckets = append(b.buckets, []byte(data))
   125  		}
   126  		b.strToBucket[data] = info
   127  	}
   128  	return info
   129  }
   130  
   131  func (b *Builder) addStringToBucket(data string, bucket uint16) stringInfo {
   132  	data = b.makeString(data)
   133  	info, ok := b.strToBucket[data]
   134  	if !ok || info.bucket != bucket {
   135  		if ok {
   136  			b.bucketWaste += len(data)
   137  		}
   138  		b.size += len(data)
   139  		bk := b.buckets[bucket]
   140  		info.bucket = bucket
   141  		info.bucketPos = uint16(len(bk))
   142  		b.buckets[bucket] = append(bk, data...)
   143  		b.strToBucket[data] = info
   144  	}
   145  	return info
   146  }
   147  
   148  func (b *Builder) makeString(data string) string {
   149  	if len(data) > maxStrlen {
   150  		b.setError(fmt.Errorf("string %q exceeds maximum length of %d", data, maxStrlen))
   151  		data = data[:maxStrlen]
   152  		for i := len(data) - 1; i > len(data)-4; i-- {
   153  			if utf8.RuneStart(data[i]) {
   154  				data = data[:i]
   155  				break
   156  			}
   157  		}
   158  	}
   159  	data = string([]byte{byte(len(data))}) + data
   160  	b.sizeAll += len(data)
   161  	return data
   162  }
   163  
// stringInfo records where a length-prefixed string is stored.
type stringInfo struct {
	// bufferPos is not set by any code in this part of the file.
	bufferPos uint32
	// bucket is the index into Builder.buckets of the bucket holding the
	// string.
	bucket    uint16
	// bucketPos is the byte offset within that bucket at which the
	// length-prefixed string starts.
	bucketPos uint16
}
   169  
   170  // New creates a new Builder.
   171  func New(tableName string) *Builder {
   172  	b := &Builder{
   173  		strToBucket: map[string]stringInfo{},
   174  		buckets:     [][]byte{nil}, // initialize with first bucket.
   175  	}
   176  	b.rootMeta = &metaData{
   177  		b:        b,
   178  		typeInfo: &typeInfo{},
   179  	}
   180  	return b
   181  }
   182  
   183  // Gen writes all the tables and types for the collected data.
   184  func (b *Builder) Gen(w *gen.CodeWriter) error {
   185  	t, err := build(b)
   186  	if err != nil {
   187  		return err
   188  	}
   189  	return generate(b, t, w)
   190  }
   191  
// GenTestData generates tables useful for testing data generated with Gen.
// It writes to w and returns the error, if any, reported by
// generateTestData.
func (b *Builder) GenTestData(w *gen.CodeWriter) error {
	return generateTestData(b, w)
}
   196  
// locale pairs a language tag with the root Index that holds the data added
// for it.
type locale struct {
	tag  language.Tag // tag passed to Builder.Locale
	root *Index       // root Index returned by Builder.Locale for tag
}
   201  
   202  // Locale creates an index for the given locale.
   203  func (b *Builder) Locale(t language.Tag) *Index {
   204  	index := &Index{
   205  		meta: b.rootMeta,
   206  	}
   207  	b.locales = append(b.locales, locale{tag: t, root: index})
   208  	return index
   209  }
   210  
// An Index holds a map of either leaf values or other indices.
//
// The two are mutually exclusive: adding a subindex panics once values exist,
// and SetValue panics once subindices exist.
type Index struct {
	// meta is the metadata for this position in the tree; it also gives
	// access to the owning Builder.
	meta *metaData

	// subIndex holds the child indices, in order of creation.
	subIndex []*Index
	// values holds the leaf values, in the order they were set.
	values   []keyValue
}
   218  
   219  func (i *Index) setError(err error) { i.meta.b.setError(err) }
   220  
// keyValue is a single leaf entry of an Index.
type keyValue struct {
	key   enumIndex  // enum index the key was mapped to
	value stringInfo // location of the stored string
}
   225  
// Element is a CLDR XML element.
//
// The Index methods read the element name, the type and alt attributes, the
// alias, and the character data through the returned cldr.Common.
type Element interface {
	// GetCommon returns the data common to all CLDR elements.
	GetCommon() *cldr.Common
}
   230  
   231  // Index creates a subindex where the type and enum values are not shared
   232  // with siblings by default. The name is derived from the elem. If elem is
   233  // an alias reference, the alias will be resolved and linked. If elem is nil
   234  // Index returns nil.
   235  func (i *Index) Index(elem Element, opt ...Option) *Index {
   236  	if elem == nil || reflect.ValueOf(elem).IsNil() {
   237  		return nil
   238  	}
   239  	c := elem.GetCommon()
   240  	o := &options{
   241  		parent: i,
   242  		name:   c.GetCommon().Element(),
   243  	}
   244  	o.fill(opt)
   245  	o.setAlias(elem)
   246  	return i.subIndexForKey(o)
   247  }
   248  
   249  // IndexWithName is like Section but derives the name from the given name.
   250  func (i *Index) IndexWithName(name string, opt ...Option) *Index {
   251  	o := &options{parent: i, name: name}
   252  	o.fill(opt)
   253  	return i.subIndexForKey(o)
   254  }
   255  
   256  // IndexFromType creates a subindex the value of tye type attribute as key. It
   257  // will also configure the Index to share the enumeration values with all
   258  // sibling values. If elem is an alias, it will be resolved and linked.
   259  func (i *Index) IndexFromType(elem Element, opts ...Option) *Index {
   260  	o := &options{
   261  		parent: i,
   262  		name:   elem.GetCommon().Type,
   263  	}
   264  	o.fill(opts)
   265  	o.setAlias(elem)
   266  	useSharedType()(o)
   267  	return i.subIndexForKey(o)
   268  }
   269  
   270  // IndexFromAlt creates a subindex the value of tye alt attribute as key. It
   271  // will also configure the Index to share the enumeration values with all
   272  // sibling values. If elem is an alias, it will be resolved and linked.
   273  func (i *Index) IndexFromAlt(elem Element, opts ...Option) *Index {
   274  	o := &options{
   275  		parent: i,
   276  		name:   elem.GetCommon().Alt,
   277  	}
   278  	o.fill(opts)
   279  	o.setAlias(elem)
   280  	useSharedType()(o)
   281  	return i.subIndexForKey(o)
   282  }
   283  
   284  func (i *Index) subIndexForKey(opts *options) *Index {
   285  	key := opts.name
   286  	if len(i.values) > 0 {
   287  		panic(fmt.Errorf("cldrtree: adding Index for %q when value already exists", key))
   288  	}
   289  	meta := i.meta.sub(key, opts)
   290  	for _, x := range i.subIndex {
   291  		if x.meta == meta {
   292  			return x
   293  		}
   294  	}
   295  	if alias := opts.alias; alias != nil {
   296  		if a := alias.GetCommon().Alias; a != nil {
   297  			if a.Source != "locale" {
   298  				i.setError(fmt.Errorf("cldrtree: non-locale alias not supported %v", a.Path))
   299  			}
   300  			if meta.inheritOffset < 0 {
   301  				i.setError(fmt.Errorf("cldrtree: alias was already set %v", a.Path))
   302  			}
   303  			path := a.Path
   304  			for ; strings.HasPrefix(path, "../"); path = path[len("../"):] {
   305  				meta.inheritOffset--
   306  			}
   307  			m := aliasRe.FindStringSubmatch(path)
   308  			if m == nil {
   309  				i.setError(fmt.Errorf("cldrtree: could not parse alias %q", a.Path))
   310  			} else {
   311  				key := m[4]
   312  				if key == "" {
   313  					key = m[1]
   314  				}
   315  				meta.inheritIndex = key
   316  			}
   317  		}
   318  	}
   319  	x := &Index{meta: meta}
   320  	i.subIndex = append(i.subIndex, x)
   321  	return x
   322  }
   323  
// aliasRe matches the first step of an alias path: an element name made of
// ASCII letters, optionally followed by an attribute selector of the form
// [@name='value']. Submatch 1 is the element name, submatch 3 the attribute
// name and submatch 4 the attribute value; the latter two are empty when
// there is no selector.
var aliasRe = regexp.MustCompile(`^([a-zA-Z]+)(\[@([a-zA-Z-]+)='([a-zA-Z-]+)'\])?`)
   325  
   326  // SetValue sets the value, the data from a CLDR XML element, for the given key.
   327  func (i *Index) SetValue(key string, value Element, opt ...Option) {
   328  	if len(i.subIndex) > 0 {
   329  		panic(fmt.Errorf("adding value for key %q when index already exists", key))
   330  	}
   331  	o := &options{parent: i}
   332  	o.fill(opt)
   333  	c := value.GetCommon()
   334  	if c.Alias != nil {
   335  		i.setError(fmt.Errorf("cldrtree: alias not supported for SetValue %v", c.Alias.Path))
   336  	}
   337  	i.setValue(key, c.Data(), o)
   338  }
   339  
   340  func (i *Index) setValue(key, data string, o *options) {
   341  	index, _ := i.meta.typeInfo.lookupSubtype(key, o)
   342  	kv := keyValue{key: index}
   343  	if len(i.values) > 0 {
   344  		// Add string to the same bucket as the other values.
   345  		bucket := i.values[0].value.bucket
   346  		kv.value = i.meta.b.addStringToBucket(data, bucket)
   347  	} else {
   348  		kv.value = i.meta.b.addString(data)
   349  	}
   350  	i.values = append(i.values, kv)
   351  }