github.com/go-xe2/third@v1.0.3/golang.org/x/text/internal/cldrtree/cldrtree.go (about) 1 // Copyright 2017 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package cldrtree builds and generates a CLDR index file, including all 6 // inheritance. 7 // 8 package cldrtree 9 10 // cldrtree stores CLDR data in a tree-like structure called Tree. In the CLDR 11 // data each branch in the tree is indicated by either an element name or an 12 // attribute value. A Tree does not distinguish between these two cases, but 13 // rather assumes that all branches can be accessed by an enum with a compact 14 // range of positive integer values starting from 0. 15 // 16 // Each Tree consists of three parts: 17 // - a slice mapping compact language identifiers to an offset into a set of 18 // indices, 19 // - a set of indices, stored as a large blob of uint16 values that encode 20 // the actual tree structure of data, and 21 // - a set of buckets that each holds a collection of strings. 22 // each of which is explained in more detail below. 23 // 24 // 25 // Tree lookup 26 // A tree lookup is done by providing a locale and a "path", which is a 27 // sequence of enum values. The search starts with getting the index for the 28 // given locale and then incrementally jumping into the index using the path 29 // values. If an element cannot be found in the index, the search starts anew 30 // for the locale's parent locale. The path may change during lookup by means 31 // of aliasing, described below. 32 // 33 // Buckets 34 // Buckets hold the actual string data of the leaf values of the CLDR tree. 
35 // This data is stored in buckets, rather than one large string, for multiple 36 // reasons: 37 // - it allows representing leaf values more compactly, by storing all leaf 38 // values in a single bucket and then needing only a uint16 to index 39 // into this bucket for all leaf values, 40 // - (TBD) allow multiple trees to share subsets of buckets, mostly to allow 41 // linking in a smaller amount of data if only a subset of the buckets is 42 // needed, 43 // - to be nice to go fmt and the compiler. 44 // 45 // indices 46 // An index is a slice of uint16 for which the values are interpreted in one of 47 // two ways: as a node or a set of leaf values. 48 // A set of leaf values has the following form: 49 // <max_size>, <bucket>, <offset>... 50 // max_size indicates the maximum enum value for which an offset is defined. 51 // An offset value of 0xFFFF (missingValue) also indicates an undefined value. 52 // If defined, offset indicates the offset within the given bucket of the string. 53 // A node value has the following form: 54 // <max_size>, <offset_or_alias>... 55 // max_size indicates the maximum value for which an offset is defined. 56 // A missing offset may also be indicated with 0. If the high bit (0x8000, or 57 // inheritMask) is not set, the offset points to the offset within the index 58 // for the current locale. 59 // An offset with high bit set is an alias. In this case the uint16 has the form 60 // bits: 61 // 15: 1 62 // 14-12: negative offset into path relative to current position 63 // 0-11: new enum value for path element. 64 // On encountering an alias, the path is modified accordingly and the lookup is 65 // restarted for the given locale.
66 67 import ( 68 "fmt" 69 "reflect" 70 "regexp" 71 "strings" 72 "unicode/utf8" 73 74 "github.com/go-xe2/third/golang.org/x/text/internal/gen" 75 "github.com/go-xe2/third/golang.org/x/text/language" 76 "github.com/go-xe2/third/golang.org/x/text/unicode/cldr" 77 ) 78 79 // TODO: 80 // - allow two Trees to share the same set of buckets. 81 82 // A Builder allows storing CLDR data in compact form. 83 type Builder struct { 84 table []string 85 86 rootMeta *metaData 87 locales []locale 88 strToBucket map[string]stringInfo 89 buckets [][]byte 90 enums []*enum 91 err error 92 93 // Stats 94 size int 95 sizeAll int 96 bucketWaste int 97 } 98 99 const ( 100 maxBucketSize = 8 * 1024 // 8K 101 maxStrlen = 254 // allow 0xFF sentinel 102 ) 103 104 func (b *Builder) setError(err error) { 105 if b.err == nil { 106 b.err = err 107 } 108 } 109 110 func (b *Builder) addString(data string) stringInfo { 111 data = b.makeString(data) 112 info, ok := b.strToBucket[data] 113 if !ok { 114 b.size += len(data) 115 x := len(b.buckets) - 1 116 bucket := b.buckets[x] 117 if len(bucket)+len(data) < maxBucketSize { 118 info.bucket = uint16(x) 119 info.bucketPos = uint16(len(bucket)) 120 b.buckets[x] = append(bucket, data...) 121 } else { 122 info.bucket = uint16(len(b.buckets)) 123 info.bucketPos = 0 124 b.buckets = append(b.buckets, []byte(data)) 125 } 126 b.strToBucket[data] = info 127 } 128 return info 129 } 130 131 func (b *Builder) addStringToBucket(data string, bucket uint16) stringInfo { 132 data = b.makeString(data) 133 info, ok := b.strToBucket[data] 134 if !ok || info.bucket != bucket { 135 if ok { 136 b.bucketWaste += len(data) 137 } 138 b.size += len(data) 139 bk := b.buckets[bucket] 140 info.bucket = bucket 141 info.bucketPos = uint16(len(bk)) 142 b.buckets[bucket] = append(bk, data...) 
143 b.strToBucket[data] = info 144 } 145 return info 146 } 147 148 func (b *Builder) makeString(data string) string { 149 if len(data) > maxStrlen { 150 b.setError(fmt.Errorf("string %q exceeds maximum length of %d", data, maxStrlen)) 151 data = data[:maxStrlen] 152 for i := len(data) - 1; i > len(data)-4; i-- { 153 if utf8.RuneStart(data[i]) { 154 data = data[:i] 155 break 156 } 157 } 158 } 159 data = string([]byte{byte(len(data))}) + data 160 b.sizeAll += len(data) 161 return data 162 } 163 164 type stringInfo struct { 165 bufferPos uint32 166 bucket uint16 167 bucketPos uint16 168 } 169 170 // New creates a new Builder. 171 func New(tableName string) *Builder { 172 b := &Builder{ 173 strToBucket: map[string]stringInfo{}, 174 buckets: [][]byte{nil}, // initialize with first bucket. 175 } 176 b.rootMeta = &metaData{ 177 b: b, 178 typeInfo: &typeInfo{}, 179 } 180 return b 181 } 182 183 // Gen writes all the tables and types for the collected data. 184 func (b *Builder) Gen(w *gen.CodeWriter) error { 185 t, err := build(b) 186 if err != nil { 187 return err 188 } 189 return generate(b, t, w) 190 } 191 192 // GenTestData generates tables useful for testing data generated with Gen. 193 func (b *Builder) GenTestData(w *gen.CodeWriter) error { 194 return generateTestData(b, w) 195 } 196 197 type locale struct { 198 tag language.Tag 199 root *Index 200 } 201 202 // Locale creates an index for the given locale. 203 func (b *Builder) Locale(t language.Tag) *Index { 204 index := &Index{ 205 meta: b.rootMeta, 206 } 207 b.locales = append(b.locales, locale{tag: t, root: index}) 208 return index 209 } 210 211 // An Index holds a map of either leaf values or other indices. 212 type Index struct { 213 meta *metaData 214 215 subIndex []*Index 216 values []keyValue 217 } 218 219 func (i *Index) setError(err error) { i.meta.b.setError(err) } 220 221 type keyValue struct { 222 key enumIndex 223 value stringInfo 224 } 225 226 // Element is a CLDR XML element. 
227 type Element interface { 228 GetCommon() *cldr.Common 229 } 230 231 // Index creates a subindex where the type and enum values are not shared 232 // with siblings by default. The name is derived from the elem. If elem is 233 // an alias reference, the alias will be resolved and linked. If elem is nil 234 // Index returns nil. 235 func (i *Index) Index(elem Element, opt ...Option) *Index { 236 if elem == nil || reflect.ValueOf(elem).IsNil() { 237 return nil 238 } 239 c := elem.GetCommon() 240 o := &options{ 241 parent: i, 242 name: c.GetCommon().Element(), 243 } 244 o.fill(opt) 245 o.setAlias(elem) 246 return i.subIndexForKey(o) 247 } 248 249 // IndexWithName is like Section but derives the name from the given name. 250 func (i *Index) IndexWithName(name string, opt ...Option) *Index { 251 o := &options{parent: i, name: name} 252 o.fill(opt) 253 return i.subIndexForKey(o) 254 } 255 256 // IndexFromType creates a subindex the value of tye type attribute as key. It 257 // will also configure the Index to share the enumeration values with all 258 // sibling values. If elem is an alias, it will be resolved and linked. 259 func (i *Index) IndexFromType(elem Element, opts ...Option) *Index { 260 o := &options{ 261 parent: i, 262 name: elem.GetCommon().Type, 263 } 264 o.fill(opts) 265 o.setAlias(elem) 266 useSharedType()(o) 267 return i.subIndexForKey(o) 268 } 269 270 // IndexFromAlt creates a subindex the value of tye alt attribute as key. It 271 // will also configure the Index to share the enumeration values with all 272 // sibling values. If elem is an alias, it will be resolved and linked. 
273 func (i *Index) IndexFromAlt(elem Element, opts ...Option) *Index { 274 o := &options{ 275 parent: i, 276 name: elem.GetCommon().Alt, 277 } 278 o.fill(opts) 279 o.setAlias(elem) 280 useSharedType()(o) 281 return i.subIndexForKey(o) 282 } 283 284 func (i *Index) subIndexForKey(opts *options) *Index { 285 key := opts.name 286 if len(i.values) > 0 { 287 panic(fmt.Errorf("cldrtree: adding Index for %q when value already exists", key)) 288 } 289 meta := i.meta.sub(key, opts) 290 for _, x := range i.subIndex { 291 if x.meta == meta { 292 return x 293 } 294 } 295 if alias := opts.alias; alias != nil { 296 if a := alias.GetCommon().Alias; a != nil { 297 if a.Source != "locale" { 298 i.setError(fmt.Errorf("cldrtree: non-locale alias not supported %v", a.Path)) 299 } 300 if meta.inheritOffset < 0 { 301 i.setError(fmt.Errorf("cldrtree: alias was already set %v", a.Path)) 302 } 303 path := a.Path 304 for ; strings.HasPrefix(path, "../"); path = path[len("../"):] { 305 meta.inheritOffset-- 306 } 307 m := aliasRe.FindStringSubmatch(path) 308 if m == nil { 309 i.setError(fmt.Errorf("cldrtree: could not parse alias %q", a.Path)) 310 } else { 311 key := m[4] 312 if key == "" { 313 key = m[1] 314 } 315 meta.inheritIndex = key 316 } 317 } 318 } 319 x := &Index{meta: meta} 320 i.subIndex = append(i.subIndex, x) 321 return x 322 } 323 324 var aliasRe = regexp.MustCompile(`^([a-zA-Z]+)(\[@([a-zA-Z-]+)='([a-zA-Z-]+)'\])?`) 325 326 // SetValue sets the value, the data from a CLDR XML element, for the given key. 
327 func (i *Index) SetValue(key string, value Element, opt ...Option) { 328 if len(i.subIndex) > 0 { 329 panic(fmt.Errorf("adding value for key %q when index already exists", key)) 330 } 331 o := &options{parent: i} 332 o.fill(opt) 333 c := value.GetCommon() 334 if c.Alias != nil { 335 i.setError(fmt.Errorf("cldrtree: alias not supported for SetValue %v", c.Alias.Path)) 336 } 337 i.setValue(key, c.Data(), o) 338 } 339 340 func (i *Index) setValue(key, data string, o *options) { 341 index, _ := i.meta.typeInfo.lookupSubtype(key, o) 342 kv := keyValue{key: index} 343 if len(i.values) > 0 { 344 // Add string to the same bucket as the other values. 345 bucket := i.values[0].value.bucket 346 kv.value = i.meta.b.addStringToBucket(data, bucket) 347 } else { 348 kv.value = i.meta.b.addString(data) 349 } 350 i.values = append(i.values, kv) 351 }