golang.org/x/text@v0.14.0/internal/cldrtree/cldrtree.go (about) 1 // Copyright 2017 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package cldrtree builds and generates a CLDR index file, including all 6 // inheritance. 7 package cldrtree 8 9 //go:generate go test -gen 10 11 // cldrtree stores CLDR data in a tree-like structure called Tree. In the CLDR 12 // data each branch in the tree is indicated by either an element name or an 13 // attribute value. A Tree does not distinguish between these two cases, but 14 // rather assumes that all branches can be accessed by an enum with a compact 15 // range of positive integer values starting from 0. 16 // 17 // Each Tree consists of three parts: 18 // - a slice mapping compact language identifiers to an offset into a set of 19 // indices, 20 // - a set of indices, stored as a large blob of uint16 values that encode 21 // the actual tree structure of data, and 22 // - a set of buckets that each holds a collection of strings. 23 // each of which is explained in more detail below. 24 // 25 // 26 // Tree lookup 27 // A tree lookup is done by providing a locale and a "path", which is a 28 // sequence of enum values. The search starts with getting the index for the 29 // given locale and then incrementally jumping into the index using the path 30 // values. If an element cannot be found in the index, the search starts anew 31 // for the locale's parent locale. The path may change during lookup by means 32 // of aliasing, described below. 33 // 34 // Buckets 35 // Buckets hold the actual string data of the leaf values of the CLDR tree. 
36 // This data is stored in buckets, rather than one large string, for multiple 37 // reasons: 38 // - it allows representing leaf values more compactly, by storing all leaf 39 // values in a single bucket and then needing only a uint16 to index 40 // into this bucket for all leaf values, 41 // - (TBD) allow multiple trees to share subsets of buckets, mostly to allow 42 // linking in a smaller amount of data if only a subset of the buckets is 43 // needed, 44 // - to be nice to go fmt and the compiler. 45 // 46 // indices 47 // An index is a slice of uint16 for which the values are interpreted in one of 48 // two ways: as a node or a set of leaf values. 49 // A set of leaf values has the following form: 50 // <max_size>, <bucket>, <offset>... 51 // max_size indicates the maximum enum value for which an offset is defined. 52 // An offset value of 0xFFFF (missingValue) also indicates an undefined value. 53 // If defined, offset indicates the offset of the string within the given bucket. 54 // A node value has the following form: 55 // <max_size>, <offset_or_alias>... 56 // max_size indicates the maximum value for which an offset is defined. 57 // A missing offset may also be indicated with 0. If the high bit (0x8000, or 58 // inheritMask) is not set, the offset points to the offset within the index 59 // for the current locale. 60 // An offset with high bit set is an alias. In this case the uint16 has the form 61 // bits: 62 // 15: 1 63 // 14-12: negative offset into path relative to current position 64 // 0-11: new enum value for path element. 65 // On encountering an alias, the path is modified accordingly and the lookup is 66 // restarted for the given locale. 67 68 import ( 69 "fmt" 70 "reflect" 71 "regexp" 72 "strings" 73 "unicode/utf8" 74 75 "golang.org/x/text/internal/gen" 76 "golang.org/x/text/language" 77 "golang.org/x/text/unicode/cldr" 78 ) 79 80 // TODO: 81 // - allow two Trees to share the same set of buckets. 
82 83 // A Builder allows storing CLDR data in compact form. 84 type Builder struct { 85 table []string 86 87 rootMeta *metaData 88 locales []locale 89 strToBucket map[string]stringInfo 90 buckets [][]byte 91 enums []*enum 92 err error 93 94 // Stats 95 size int 96 sizeAll int 97 bucketWaste int 98 } 99 100 const ( 101 maxBucketSize = 8 * 1024 // 8K 102 maxStrlen = 254 // allow 0xFF sentinel 103 ) 104 105 func (b *Builder) setError(err error) { 106 if b.err == nil { 107 b.err = err 108 } 109 } 110 111 func (b *Builder) addString(data string) stringInfo { 112 data = b.makeString(data) 113 info, ok := b.strToBucket[data] 114 if !ok { 115 b.size += len(data) 116 x := len(b.buckets) - 1 117 bucket := b.buckets[x] 118 if len(bucket)+len(data) < maxBucketSize { 119 info.bucket = uint16(x) 120 info.bucketPos = uint16(len(bucket)) 121 b.buckets[x] = append(bucket, data...) 122 } else { 123 info.bucket = uint16(len(b.buckets)) 124 info.bucketPos = 0 125 b.buckets = append(b.buckets, []byte(data)) 126 } 127 b.strToBucket[data] = info 128 } 129 return info 130 } 131 132 func (b *Builder) addStringToBucket(data string, bucket uint16) stringInfo { 133 data = b.makeString(data) 134 info, ok := b.strToBucket[data] 135 if !ok || info.bucket != bucket { 136 if ok { 137 b.bucketWaste += len(data) 138 } 139 b.size += len(data) 140 bk := b.buckets[bucket] 141 info.bucket = bucket 142 info.bucketPos = uint16(len(bk)) 143 b.buckets[bucket] = append(bk, data...) 
144 b.strToBucket[data] = info 145 } 146 return info 147 } 148 149 func (b *Builder) makeString(data string) string { 150 if len(data) > maxStrlen { 151 b.setError(fmt.Errorf("string %q exceeds maximum length of %d", data, maxStrlen)) 152 data = data[:maxStrlen] 153 for i := len(data) - 1; i > len(data)-4; i-- { 154 if utf8.RuneStart(data[i]) { 155 data = data[:i] 156 break 157 } 158 } 159 } 160 data = string([]byte{byte(len(data))}) + data 161 b.sizeAll += len(data) 162 return data 163 } 164 165 type stringInfo struct { 166 bufferPos uint32 167 bucket uint16 168 bucketPos uint16 169 } 170 171 // New creates a new Builder. 172 func New(tableName string) *Builder { 173 b := &Builder{ 174 strToBucket: map[string]stringInfo{}, 175 buckets: [][]byte{nil}, // initialize with first bucket. 176 } 177 b.rootMeta = &metaData{ 178 b: b, 179 typeInfo: &typeInfo{}, 180 } 181 return b 182 } 183 184 // Gen writes all the tables and types for the collected data. 185 func (b *Builder) Gen(w *gen.CodeWriter) error { 186 t, err := build(b) 187 if err != nil { 188 return err 189 } 190 return generate(b, t, w) 191 } 192 193 // GenTestData generates tables useful for testing data generated with Gen. 194 func (b *Builder) GenTestData(w *gen.CodeWriter) error { 195 return generateTestData(b, w) 196 } 197 198 type locale struct { 199 tag language.Tag 200 root *Index 201 } 202 203 // Locale creates an index for the given locale. 204 func (b *Builder) Locale(t language.Tag) *Index { 205 index := &Index{ 206 meta: b.rootMeta, 207 } 208 b.locales = append(b.locales, locale{tag: t, root: index}) 209 return index 210 } 211 212 // An Index holds a map of either leaf values or other indices. 213 type Index struct { 214 meta *metaData 215 216 subIndex []*Index 217 values []keyValue 218 } 219 220 func (i *Index) setError(err error) { i.meta.b.setError(err) } 221 222 type keyValue struct { 223 key enumIndex 224 value stringInfo 225 } 226 227 // Element is a CLDR XML element. 
228 type Element interface { 229 GetCommon() *cldr.Common 230 } 231 232 // Index creates a subindex where the type and enum values are not shared 233 // with siblings by default. The name is derived from the elem. If elem is 234 // an alias reference, the alias will be resolved and linked. If elem is nil 235 // Index returns nil. 236 func (i *Index) Index(elem Element, opt ...Option) *Index { 237 if elem == nil || reflect.ValueOf(elem).IsNil() { 238 return nil 239 } 240 c := elem.GetCommon() 241 o := &options{ 242 parent: i, 243 name: c.GetCommon().Element(), 244 } 245 o.fill(opt) 246 o.setAlias(elem) 247 return i.subIndexForKey(o) 248 } 249 250 // IndexWithName is like Section but derives the name from the given name. 251 func (i *Index) IndexWithName(name string, opt ...Option) *Index { 252 o := &options{parent: i, name: name} 253 o.fill(opt) 254 return i.subIndexForKey(o) 255 } 256 257 // IndexFromType creates a subindex the value of tye type attribute as key. It 258 // will also configure the Index to share the enumeration values with all 259 // sibling values. If elem is an alias, it will be resolved and linked. 260 func (i *Index) IndexFromType(elem Element, opts ...Option) *Index { 261 o := &options{ 262 parent: i, 263 name: elem.GetCommon().Type, 264 } 265 o.fill(opts) 266 o.setAlias(elem) 267 useSharedType()(o) 268 return i.subIndexForKey(o) 269 } 270 271 // IndexFromAlt creates a subindex the value of tye alt attribute as key. It 272 // will also configure the Index to share the enumeration values with all 273 // sibling values. If elem is an alias, it will be resolved and linked. 
274 func (i *Index) IndexFromAlt(elem Element, opts ...Option) *Index { 275 o := &options{ 276 parent: i, 277 name: elem.GetCommon().Alt, 278 } 279 o.fill(opts) 280 o.setAlias(elem) 281 useSharedType()(o) 282 return i.subIndexForKey(o) 283 } 284 285 func (i *Index) subIndexForKey(opts *options) *Index { 286 key := opts.name 287 if len(i.values) > 0 { 288 panic(fmt.Errorf("cldrtree: adding Index for %q when value already exists", key)) 289 } 290 meta := i.meta.sub(key, opts) 291 for _, x := range i.subIndex { 292 if x.meta == meta { 293 return x 294 } 295 } 296 if alias := opts.alias; alias != nil { 297 if a := alias.GetCommon().Alias; a != nil { 298 if a.Source != "locale" { 299 i.setError(fmt.Errorf("cldrtree: non-locale alias not supported %v", a.Path)) 300 } 301 if meta.inheritOffset < 0 { 302 i.setError(fmt.Errorf("cldrtree: alias was already set %v", a.Path)) 303 } 304 path := a.Path 305 for ; strings.HasPrefix(path, "../"); path = path[len("../"):] { 306 meta.inheritOffset-- 307 } 308 m := aliasRe.FindStringSubmatch(path) 309 if m == nil { 310 i.setError(fmt.Errorf("cldrtree: could not parse alias %q", a.Path)) 311 } else { 312 key := m[4] 313 if key == "" { 314 key = m[1] 315 } 316 meta.inheritIndex = key 317 } 318 } 319 } 320 x := &Index{meta: meta} 321 i.subIndex = append(i.subIndex, x) 322 return x 323 } 324 325 var aliasRe = regexp.MustCompile(`^([a-zA-Z]+)(\[@([a-zA-Z-]+)='([a-zA-Z-]+)'\])?`) 326 327 // SetValue sets the value, the data from a CLDR XML element, for the given key. 
328 func (i *Index) SetValue(key string, value Element, opt ...Option) { 329 if len(i.subIndex) > 0 { 330 panic(fmt.Errorf("adding value for key %q when index already exists", key)) 331 } 332 o := &options{parent: i} 333 o.fill(opt) 334 c := value.GetCommon() 335 if c.Alias != nil { 336 i.setError(fmt.Errorf("cldrtree: alias not supported for SetValue %v", c.Alias.Path)) 337 } 338 i.setValue(key, c.Data(), o) 339 } 340 341 func (i *Index) setValue(key, data string, o *options) { 342 index, _ := i.meta.typeInfo.lookupSubtype(key, o) 343 kv := keyValue{key: index} 344 if len(i.values) > 0 { 345 // Add string to the same bucket as the other values. 346 bucket := i.values[0].value.bucket 347 kv.value = i.meta.b.addStringToBucket(data, bucket) 348 } else { 349 kv.value = i.meta.b.addString(data) 350 } 351 i.values = append(i.values, kv) 352 }