github.com/liquid-dev/text@v0.3.3-liquid/internal/cldrtree/cldrtree.go (about) 1 // Copyright 2017 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package cldrtree builds and generates a CLDR index file, including all 6 // inheritance. 7 // 8 package cldrtree 9 10 //go:generate go test -gen 11 12 // cldrtree stores CLDR data in a tree-like structure called Tree. In the CLDR 13 // data each branch in the tree is indicated by either an element name or an 14 // attribute value. A Tree does not distinguish between these two cases, but 15 // rather assumes that all branches can be accessed by an enum with a compact 16 // range of positive integer values starting from 0. 17 // 18 // Each Tree consists of three parts: 19 // - a slice mapping compact language identifiers to an offset into a set of 20 // indices, 21 // - a set of indices, stored as a large blob of uint16 values that encode 22 // the actual tree structure of data, and 23 // - a set of buckets that each holds a collection of strings. 24 // each of which is explained in more detail below. 25 // 26 // 27 // Tree lookup 28 // A tree lookup is done by providing a locale and a "path", which is a 29 // sequence of enum values. The search starts with getting the index for the 30 // given locale and then incrementally jumping into the index using the path 31 // values. If an element cannot be found in the index, the search starts anew 32 // for the locale's parent locale. The path may change during lookup by means 33 // of aliasing, described below. 34 // 35 // Buckets 36 // Buckets hold the actual string data of the leaf values of the CLDR tree. 
37 // This data is stored in buckets, rather than one large string, for multiple 38 // reasons: 39 // - it allows representing leaf values more compactly, by storing all leaf 40 // values in a single bucket and then needing only a uint16 to index 41 // into this bucket for all leaf values, 42 // - (TBD) allow multiple trees to share subsets of buckets, mostly to allow 43 // linking in a smaller amount of data if only a subset of the buckets is 44 // needed, 45 // - to be nice to go fmt and the compiler. 46 // 47 // indices 48 // An index is a slice of uint16 for which the values are interpreted in one of 49 // two ways: as a node or a set of leaf values. 50 // A set of leaf values has the following form: 51 // <max_size>, <bucket>, <offset>... 52 // max_size indicates the maximum enum value for which an offset is defined. 53 // An offset value of 0xFFFF (missingValue) also indicates an undefined value. 54 // If defined, offset indicates the offset within the given bucket of the string. 55 // A node value has the following form: 56 // <max_size>, <offset_or_alias>... 57 // max_size indicates the maximum value for which an offset is defined. 58 // A missing offset may also be indicated with 0. If the high bit (0x8000, or 59 // inheritMask) is not set, the offset points to the offset within the index 60 // for the current locale. 61 // An offset with high bit set is an alias. In this case the uint16 has the form 62 // bits: 63 // 15: 1 64 // 14-12: negative offset into path relative to current position 65 // 0-11: new enum value for path element. 66 // On encountering an alias, the path is modified accordingly and the lookup is 67 // restarted for the given locale. 68 69 import ( 70 "fmt" 71 "reflect" 72 "regexp" 73 "strings" 74 "unicode/utf8" 75 76 "github.com/liquid-dev/text/internal/gen" 77 "github.com/liquid-dev/text/language" 78 "github.com/liquid-dev/text/unicode/cldr" 79 ) 80 81 // TODO: 82 // - allow two Trees to share the same set of buckets. 
83 84 // A Builder allows storing CLDR data in compact form. 85 type Builder struct { 86 table []string 87 88 rootMeta *metaData 89 locales []locale 90 strToBucket map[string]stringInfo 91 buckets [][]byte 92 enums []*enum 93 err error 94 95 // Stats 96 size int 97 sizeAll int 98 bucketWaste int 99 } 100 101 const ( 102 maxBucketSize = 8 * 1024 // 8K 103 maxStrlen = 254 // allow 0xFF sentinel 104 ) 105 106 func (b *Builder) setError(err error) { 107 if b.err == nil { 108 b.err = err 109 } 110 } 111 112 func (b *Builder) addString(data string) stringInfo { 113 data = b.makeString(data) 114 info, ok := b.strToBucket[data] 115 if !ok { 116 b.size += len(data) 117 x := len(b.buckets) - 1 118 bucket := b.buckets[x] 119 if len(bucket)+len(data) < maxBucketSize { 120 info.bucket = uint16(x) 121 info.bucketPos = uint16(len(bucket)) 122 b.buckets[x] = append(bucket, data...) 123 } else { 124 info.bucket = uint16(len(b.buckets)) 125 info.bucketPos = 0 126 b.buckets = append(b.buckets, []byte(data)) 127 } 128 b.strToBucket[data] = info 129 } 130 return info 131 } 132 133 func (b *Builder) addStringToBucket(data string, bucket uint16) stringInfo { 134 data = b.makeString(data) 135 info, ok := b.strToBucket[data] 136 if !ok || info.bucket != bucket { 137 if ok { 138 b.bucketWaste += len(data) 139 } 140 b.size += len(data) 141 bk := b.buckets[bucket] 142 info.bucket = bucket 143 info.bucketPos = uint16(len(bk)) 144 b.buckets[bucket] = append(bk, data...) 
145 b.strToBucket[data] = info 146 } 147 return info 148 } 149 150 func (b *Builder) makeString(data string) string { 151 if len(data) > maxStrlen { 152 b.setError(fmt.Errorf("string %q exceeds maximum length of %d", data, maxStrlen)) 153 data = data[:maxStrlen] 154 for i := len(data) - 1; i > len(data)-4; i-- { 155 if utf8.RuneStart(data[i]) { 156 data = data[:i] 157 break 158 } 159 } 160 } 161 data = string([]byte{byte(len(data))}) + data 162 b.sizeAll += len(data) 163 return data 164 } 165 166 type stringInfo struct { 167 bufferPos uint32 168 bucket uint16 169 bucketPos uint16 170 } 171 172 // New creates a new Builder. 173 func New(tableName string) *Builder { 174 b := &Builder{ 175 strToBucket: map[string]stringInfo{}, 176 buckets: [][]byte{nil}, // initialize with first bucket. 177 } 178 b.rootMeta = &metaData{ 179 b: b, 180 typeInfo: &typeInfo{}, 181 } 182 return b 183 } 184 185 // Gen writes all the tables and types for the collected data. 186 func (b *Builder) Gen(w *gen.CodeWriter) error { 187 t, err := build(b) 188 if err != nil { 189 return err 190 } 191 return generate(b, t, w) 192 } 193 194 // GenTestData generates tables useful for testing data generated with Gen. 195 func (b *Builder) GenTestData(w *gen.CodeWriter) error { 196 return generateTestData(b, w) 197 } 198 199 type locale struct { 200 tag language.Tag 201 root *Index 202 } 203 204 // Locale creates an index for the given locale. 205 func (b *Builder) Locale(t language.Tag) *Index { 206 index := &Index{ 207 meta: b.rootMeta, 208 } 209 b.locales = append(b.locales, locale{tag: t, root: index}) 210 return index 211 } 212 213 // An Index holds a map of either leaf values or other indices. 214 type Index struct { 215 meta *metaData 216 217 subIndex []*Index 218 values []keyValue 219 } 220 221 func (i *Index) setError(err error) { i.meta.b.setError(err) } 222 223 type keyValue struct { 224 key enumIndex 225 value stringInfo 226 } 227 228 // Element is a CLDR XML element. 
229 type Element interface { 230 GetCommon() *cldr.Common 231 } 232 233 // Index creates a subindex where the type and enum values are not shared 234 // with siblings by default. The name is derived from the elem. If elem is 235 // an alias reference, the alias will be resolved and linked. If elem is nil 236 // Index returns nil. 237 func (i *Index) Index(elem Element, opt ...Option) *Index { 238 if elem == nil || reflect.ValueOf(elem).IsNil() { 239 return nil 240 } 241 c := elem.GetCommon() 242 o := &options{ 243 parent: i, 244 name: c.GetCommon().Element(), 245 } 246 o.fill(opt) 247 o.setAlias(elem) 248 return i.subIndexForKey(o) 249 } 250 251 // IndexWithName is like Section but derives the name from the given name. 252 func (i *Index) IndexWithName(name string, opt ...Option) *Index { 253 o := &options{parent: i, name: name} 254 o.fill(opt) 255 return i.subIndexForKey(o) 256 } 257 258 // IndexFromType creates a subindex the value of tye type attribute as key. It 259 // will also configure the Index to share the enumeration values with all 260 // sibling values. If elem is an alias, it will be resolved and linked. 261 func (i *Index) IndexFromType(elem Element, opts ...Option) *Index { 262 o := &options{ 263 parent: i, 264 name: elem.GetCommon().Type, 265 } 266 o.fill(opts) 267 o.setAlias(elem) 268 useSharedType()(o) 269 return i.subIndexForKey(o) 270 } 271 272 // IndexFromAlt creates a subindex the value of tye alt attribute as key. It 273 // will also configure the Index to share the enumeration values with all 274 // sibling values. If elem is an alias, it will be resolved and linked. 
275 func (i *Index) IndexFromAlt(elem Element, opts ...Option) *Index { 276 o := &options{ 277 parent: i, 278 name: elem.GetCommon().Alt, 279 } 280 o.fill(opts) 281 o.setAlias(elem) 282 useSharedType()(o) 283 return i.subIndexForKey(o) 284 } 285 286 func (i *Index) subIndexForKey(opts *options) *Index { 287 key := opts.name 288 if len(i.values) > 0 { 289 panic(fmt.Errorf("cldrtree: adding Index for %q when value already exists", key)) 290 } 291 meta := i.meta.sub(key, opts) 292 for _, x := range i.subIndex { 293 if x.meta == meta { 294 return x 295 } 296 } 297 if alias := opts.alias; alias != nil { 298 if a := alias.GetCommon().Alias; a != nil { 299 if a.Source != "locale" { 300 i.setError(fmt.Errorf("cldrtree: non-locale alias not supported %v", a.Path)) 301 } 302 if meta.inheritOffset < 0 { 303 i.setError(fmt.Errorf("cldrtree: alias was already set %v", a.Path)) 304 } 305 path := a.Path 306 for ; strings.HasPrefix(path, "../"); path = path[len("../"):] { 307 meta.inheritOffset-- 308 } 309 m := aliasRe.FindStringSubmatch(path) 310 if m == nil { 311 i.setError(fmt.Errorf("cldrtree: could not parse alias %q", a.Path)) 312 } else { 313 key := m[4] 314 if key == "" { 315 key = m[1] 316 } 317 meta.inheritIndex = key 318 } 319 } 320 } 321 x := &Index{meta: meta} 322 i.subIndex = append(i.subIndex, x) 323 return x 324 } 325 326 var aliasRe = regexp.MustCompile(`^([a-zA-Z]+)(\[@([a-zA-Z-]+)='([a-zA-Z-]+)'\])?`) 327 328 // SetValue sets the value, the data from a CLDR XML element, for the given key. 
329 func (i *Index) SetValue(key string, value Element, opt ...Option) { 330 if len(i.subIndex) > 0 { 331 panic(fmt.Errorf("adding value for key %q when index already exists", key)) 332 } 333 o := &options{parent: i} 334 o.fill(opt) 335 c := value.GetCommon() 336 if c.Alias != nil { 337 i.setError(fmt.Errorf("cldrtree: alias not supported for SetValue %v", c.Alias.Path)) 338 } 339 i.setValue(key, c.Data(), o) 340 } 341 342 func (i *Index) setValue(key, data string, o *options) { 343 index, _ := i.meta.typeInfo.lookupSubtype(key, o) 344 kv := keyValue{key: index} 345 if len(i.values) > 0 { 346 // Add string to the same bucket as the other values. 347 bucket := i.values[0].value.bucket 348 kv.value = i.meta.b.addStringToBucket(data, bucket) 349 } else { 350 kv.value = i.meta.b.addString(data) 351 } 352 i.values = append(i.values, kv) 353 }