golang.org/x/text@v0.14.0/unicode/cldr/collate.go (about) 1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package cldr 6 7 import ( 8 "bufio" 9 "encoding/xml" 10 "errors" 11 "fmt" 12 "strconv" 13 "strings" 14 "unicode" 15 "unicode/utf8" 16 ) 17 18 // RuleProcessor can be passed to Collator's Process method, which 19 // parses the rules and calls the respective method for each rule found. 20 type RuleProcessor interface { 21 Reset(anchor string, before int) error 22 Insert(level int, str, context, extend string) error 23 Index(id string) 24 } 25 26 const ( 27 // cldrIndex is a Unicode-reserved sentinel value used to mark the start 28 // of a grouping within an index. 29 // We ignore any rule that starts with this rune. 30 // See https://unicode.org/reports/tr35/#Collation_Elements for details. 31 cldrIndex = "\uFDD0" 32 33 // specialAnchor is the format in which to represent logical reset positions, 34 // such as "first tertiary ignorable". 35 specialAnchor = "<%s/>" 36 ) 37 38 // Process parses the rules for the tailorings of this collation 39 // and calls the respective methods of p for each rule found. 40 func (c Collation) Process(p RuleProcessor) (err error) { 41 if len(c.Cr) > 0 { 42 if len(c.Cr) > 1 { 43 return fmt.Errorf("multiple cr elements, want 0 or 1") 44 } 45 return processRules(p, c.Cr[0].Data()) 46 } 47 if c.Rules.Any != nil { 48 return c.processXML(p) 49 } 50 return errors.New("no tailoring data") 51 } 52 53 // processRules parses rules in the Collation Rule Syntax defined in 54 // https://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings. 55 func processRules(p RuleProcessor, s string) (err error) { 56 chk := func(s string, e error) string { 57 if err == nil { 58 err = e 59 } 60 return s 61 } 62 i := 0 // Save the line number for use after the loop. 63 scanner := bufio.NewScanner(strings.NewReader(s)) 64 for ; scanner.Scan() && err == nil; i++ { 65 for s := skipSpace(scanner.Text()); s != "" && s[0] != '#'; s = skipSpace(s) { 66 level := 5 67 var ch byte 68 switch ch, s = s[0], s[1:]; ch { 69 case '&': // followed by <anchor> or '[' <key> ']' 70 if s = skipSpace(s); consume(&s, '[') { 71 s = chk(parseSpecialAnchor(p, s)) 72 } else { 73 s = chk(parseAnchor(p, 0, s)) 74 } 75 case '<': // sort relation '<'{1,4}, optionally followed by '*'. 76 for level = 1; consume(&s, '<'); level++ { 77 } 78 if level > 4 { 79 err = fmt.Errorf("level %d > 4", level) 80 } 81 fallthrough 82 case '=': // identity relation, optionally followed by *. 83 if consume(&s, '*') { 84 s = chk(parseSequence(p, level, s)) 85 } else { 86 s = chk(parseOrder(p, level, s)) 87 } 88 default: 89 chk("", fmt.Errorf("illegal operator %q", ch)) 90 break 91 } 92 } 93 } 94 if chk("", scanner.Err()); err != nil { 95 return fmt.Errorf("%d: %v", i, err) 96 } 97 return nil 98 } 99 100 // parseSpecialAnchor parses the anchor syntax which is either of the form 101 // 102 // ['before' <level>] <anchor> 103 // 104 // or 105 // 106 // [<label>] 107 // 108 // The starting should already be consumed. 109 func parseSpecialAnchor(p RuleProcessor, s string) (tail string, err error) { 110 i := strings.IndexByte(s, ']') 111 if i == -1 { 112 return "", errors.New("unmatched bracket") 113 } 114 a := strings.TrimSpace(s[:i]) 115 s = s[i+1:] 116 if strings.HasPrefix(a, "before ") { 117 l, err := strconv.ParseUint(skipSpace(a[len("before "):]), 10, 3) 118 if err != nil { 119 return s, err 120 } 121 return parseAnchor(p, int(l), s) 122 } 123 return s, p.Reset(fmt.Sprintf(specialAnchor, a), 0) 124 } 125 126 func parseAnchor(p RuleProcessor, level int, s string) (tail string, err error) { 127 anchor, s, err := scanString(s) 128 if err != nil { 129 return s, err 130 } 131 return s, p.Reset(anchor, level) 132 } 133 134 func parseOrder(p RuleProcessor, level int, s string) (tail string, err error) { 135 var value, context, extend string 136 if value, s, err = scanString(s); err != nil { 137 return s, err 138 } 139 if strings.HasPrefix(value, cldrIndex) { 140 p.Index(value[len(cldrIndex):]) 141 return 142 } 143 if consume(&s, '|') { 144 if context, s, err = scanString(s); err != nil { 145 return s, errors.New("missing string after context") 146 } 147 } 148 if consume(&s, '/') { 149 if extend, s, err = scanString(s); err != nil { 150 return s, errors.New("missing string after extension") 151 } 152 } 153 return s, p.Insert(level, value, context, extend) 154 } 155 156 // scanString scans a single input string. 157 func scanString(s string) (str, tail string, err error) { 158 if s = skipSpace(s); s == "" { 159 return s, s, errors.New("missing string") 160 } 161 buf := [16]byte{} // small but enough to hold most cases. 162 value := buf[:0] 163 for s != "" { 164 if consume(&s, '\'') { 165 i := strings.IndexByte(s, '\'') 166 if i == -1 { 167 return "", "", errors.New(`unmatched single quote`) 168 } 169 if i == 0 { 170 value = append(value, '\'') 171 } else { 172 value = append(value, s[:i]...) 173 } 174 s = s[i+1:] 175 continue 176 } 177 r, sz := utf8.DecodeRuneInString(s) 178 if unicode.IsSpace(r) || strings.ContainsRune("&<=#", r) { 179 break 180 } 181 value = append(value, s[:sz]...) 182 s = s[sz:] 183 } 184 return string(value), skipSpace(s), nil 185 } 186 187 func parseSequence(p RuleProcessor, level int, s string) (tail string, err error) { 188 if s = skipSpace(s); s == "" { 189 return s, errors.New("empty sequence") 190 } 191 last := rune(0) 192 for s != "" { 193 r, sz := utf8.DecodeRuneInString(s) 194 s = s[sz:] 195 196 if r == '-' { 197 // We have a range. The first element was already written. 198 if last == 0 { 199 return s, errors.New("range without starter value") 200 } 201 r, sz = utf8.DecodeRuneInString(s) 202 s = s[sz:] 203 if r == utf8.RuneError || r < last { 204 return s, fmt.Errorf("invalid range %q-%q", last, r) 205 } 206 for i := last + 1; i <= r; i++ { 207 if err := p.Insert(level, string(i), "", ""); err != nil { 208 return s, err 209 } 210 } 211 last = 0 212 continue 213 } 214 215 if unicode.IsSpace(r) || unicode.IsPunct(r) { 216 break 217 } 218 219 // normal case 220 if err := p.Insert(level, string(r), "", ""); err != nil { 221 return s, err 222 } 223 last = r 224 } 225 return s, nil 226 } 227 228 func skipSpace(s string) string { 229 return strings.TrimLeftFunc(s, unicode.IsSpace) 230 } 231 232 // consume returns whether the next byte is ch. If so, it gobbles it by 233 // updating s. 234 func consume(s *string, ch byte) (ok bool) { 235 if *s == "" || (*s)[0] != ch { 236 return false 237 } 238 *s = (*s)[1:] 239 return true 240 } 241 242 // The following code parses Collation rules of CLDR version 24 and before. 243 244 var lmap = map[byte]int{ 245 'p': 1, 246 's': 2, 247 't': 3, 248 'i': 5, 249 } 250 251 type rulesElem struct { 252 Rules struct { 253 Common 254 Any []*struct { 255 XMLName xml.Name 256 rule 257 } `xml:",any"` 258 } `xml:"rules"` 259 } 260 261 type rule struct { 262 Value string `xml:",chardata"` 263 Before string `xml:"before,attr"` 264 Any []*struct { 265 XMLName xml.Name 266 rule 267 } `xml:",any"` 268 } 269 270 var emptyValueError = errors.New("cldr: empty rule value") 271 272 func (r *rule) value() (string, error) { 273 // Convert hexadecimal Unicode codepoint notation to a string. 274 s := charRe.ReplaceAllStringFunc(r.Value, replaceUnicode) 275 r.Value = s 276 if s == "" { 277 if len(r.Any) != 1 { 278 return "", emptyValueError 279 } 280 r.Value = fmt.Sprintf(specialAnchor, r.Any[0].XMLName.Local) 281 r.Any = nil 282 } else if len(r.Any) != 0 { 283 return "", fmt.Errorf("cldr: XML elements found in collation rule: %v", r.Any) 284 } 285 return r.Value, nil 286 } 287 288 func (r rule) process(p RuleProcessor, name, context, extend string) error { 289 v, err := r.value() 290 if err != nil { 291 return err 292 } 293 switch name { 294 case "p", "s", "t", "i": 295 if strings.HasPrefix(v, cldrIndex) { 296 p.Index(v[len(cldrIndex):]) 297 return nil 298 } 299 if err := p.Insert(lmap[name[0]], v, context, extend); err != nil { 300 return err 301 } 302 case "pc", "sc", "tc", "ic": 303 level := lmap[name[0]] 304 for _, s := range v { 305 if err := p.Insert(level, string(s), context, extend); err != nil { 306 return err 307 } 308 } 309 default: 310 return fmt.Errorf("cldr: unsupported tag: %q", name) 311 } 312 return nil 313 } 314 315 // processXML parses the format of CLDR versions 24 and older. 316 func (c Collation) processXML(p RuleProcessor) (err error) { 317 // Collation is generated and defined in xml.go. 318 var v string 319 for _, r := range c.Rules.Any { 320 switch r.XMLName.Local { 321 case "reset": 322 level := 0 323 switch r.Before { 324 case "primary", "1": 325 level = 1 326 case "secondary", "2": 327 level = 2 328 case "tertiary", "3": 329 level = 3 330 case "": 331 default: 332 return fmt.Errorf("cldr: unknown level %q", r.Before) 333 } 334 v, err = r.value() 335 if err == nil { 336 err = p.Reset(v, level) 337 } 338 case "x": 339 var context, extend string 340 for _, r1 := range r.Any { 341 v, err = r1.value() 342 switch r1.XMLName.Local { 343 case "context": 344 context = v 345 case "extend": 346 extend = v 347 } 348 } 349 for _, r1 := range r.Any { 350 if t := r1.XMLName.Local; t == "context" || t == "extend" { 351 continue 352 } 353 r1.rule.process(p, r1.XMLName.Local, context, extend) 354 } 355 default: 356 err = r.rule.process(p, r.XMLName.Local, "", "") 357 } 358 if err != nil { 359 return err 360 } 361 } 362 return nil 363 }