github.com/liquid-dev/text@v0.3.3-liquid/internal/language/parse.go (about) 1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package language 6 7 import ( 8 "bytes" 9 "errors" 10 "fmt" 11 "sort" 12 13 "github.com/liquid-dev/text/internal/tag" 14 ) 15 16 // isAlpha returns true if the byte is not a digit. 17 // b must be an ASCII letter or digit. 18 func isAlpha(b byte) bool { 19 return b > '9' 20 } 21 22 // isAlphaNum returns true if the string contains only ASCII letters or digits. 23 func isAlphaNum(s []byte) bool { 24 for _, c := range s { 25 if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') { 26 return false 27 } 28 } 29 return true 30 } 31 32 // ErrSyntax is returned by any of the parsing functions when the 33 // input is not well-formed, according to BCP 47. 34 // TODO: return the position at which the syntax error occurred? 35 var ErrSyntax = errors.New("language: tag is not well-formed") 36 37 // ErrDuplicateKey is returned when a tag contains the same key twice with 38 // different values in the -u section. 39 var ErrDuplicateKey = errors.New("language: different values for same key in -u extension") 40 41 // ValueError is returned by any of the parsing functions when the 42 // input is well-formed but the respective subtag is not recognized 43 // as a valid value. 44 type ValueError struct { 45 v [8]byte 46 } 47 48 // NewValueError creates a new ValueError. 49 func NewValueError(tag []byte) ValueError { 50 var e ValueError 51 copy(e.v[:], tag) 52 return e 53 } 54 55 func (e ValueError) tag() []byte { 56 n := bytes.IndexByte(e.v[:], 0) 57 if n == -1 { 58 n = 8 59 } 60 return e.v[:n] 61 } 62 63 // Error implements the error interface. 
64 func (e ValueError) Error() string { 65 return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag()) 66 } 67 68 // Subtag returns the subtag for which the error occurred. 69 func (e ValueError) Subtag() string { 70 return string(e.tag()) 71 } 72 73 // scanner is used to scan BCP 47 tokens, which are separated by _ or -. 74 type scanner struct { 75 b []byte 76 bytes [max99thPercentileSize]byte 77 token []byte 78 start int // start position of the current token 79 end int // end position of the current token 80 next int // next point for scan 81 err error 82 done bool 83 } 84 85 func makeScannerString(s string) scanner { 86 scan := scanner{} 87 if len(s) <= len(scan.bytes) { 88 scan.b = scan.bytes[:copy(scan.bytes[:], s)] 89 } else { 90 scan.b = []byte(s) 91 } 92 scan.init() 93 return scan 94 } 95 96 // makeScanner returns a scanner using b as the input buffer. 97 // b is not copied and may be modified by the scanner routines. 98 func makeScanner(b []byte) scanner { 99 scan := scanner{b: b} 100 scan.init() 101 return scan 102 } 103 104 func (s *scanner) init() { 105 for i, c := range s.b { 106 if c == '_' { 107 s.b[i] = '-' 108 } 109 } 110 s.scan() 111 } 112 113 // restToLower converts the string between start and end to lower case. 114 func (s *scanner) toLower(start, end int) { 115 for i := start; i < end; i++ { 116 c := s.b[i] 117 if 'A' <= c && c <= 'Z' { 118 s.b[i] += 'a' - 'A' 119 } 120 } 121 } 122 123 func (s *scanner) setError(e error) { 124 if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) { 125 s.err = e 126 } 127 } 128 129 // resizeRange shrinks or grows the array at position oldStart such that 130 // a new string of size newSize can fit between oldStart and oldEnd. 131 // Sets the scan point to after the resized range. 
132 func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) { 133 s.start = oldStart 134 if end := oldStart + newSize; end != oldEnd { 135 diff := end - oldEnd 136 if end < cap(s.b) { 137 b := make([]byte, len(s.b)+diff) 138 copy(b, s.b[:oldStart]) 139 copy(b[end:], s.b[oldEnd:]) 140 s.b = b 141 } else { 142 s.b = append(s.b[end:], s.b[oldEnd:]...) 143 } 144 s.next = end + (s.next - s.end) 145 s.end = end 146 } 147 } 148 149 // replace replaces the current token with repl. 150 func (s *scanner) replace(repl string) { 151 s.resizeRange(s.start, s.end, len(repl)) 152 copy(s.b[s.start:], repl) 153 } 154 155 // gobble removes the current token from the input. 156 // Caller must call scan after calling gobble. 157 func (s *scanner) gobble(e error) { 158 s.setError(e) 159 if s.start == 0 { 160 s.b = s.b[:+copy(s.b, s.b[s.next:])] 161 s.end = 0 162 } else { 163 s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])] 164 s.end = s.start - 1 165 } 166 s.next = s.start 167 } 168 169 // deleteRange removes the given range from s.b before the current token. 170 func (s *scanner) deleteRange(start, end int) { 171 s.b = s.b[:start+copy(s.b[start:], s.b[end:])] 172 diff := end - start 173 s.next -= diff 174 s.start -= diff 175 s.end -= diff 176 } 177 178 // scan parses the next token of a BCP 47 string. Tokens that are larger 179 // than 8 characters or include non-alphanumeric characters result in an error 180 // and are gobbled and removed from the output. 181 // It returns the end position of the last token consumed. 
182 func (s *scanner) scan() (end int) { 183 end = s.end 184 s.token = nil 185 for s.start = s.next; s.next < len(s.b); { 186 i := bytes.IndexByte(s.b[s.next:], '-') 187 if i == -1 { 188 s.end = len(s.b) 189 s.next = len(s.b) 190 i = s.end - s.start 191 } else { 192 s.end = s.next + i 193 s.next = s.end + 1 194 } 195 token := s.b[s.start:s.end] 196 if i < 1 || i > 8 || !isAlphaNum(token) { 197 s.gobble(ErrSyntax) 198 continue 199 } 200 s.token = token 201 return end 202 } 203 if n := len(s.b); n > 0 && s.b[n-1] == '-' { 204 s.setError(ErrSyntax) 205 s.b = s.b[:len(s.b)-1] 206 } 207 s.done = true 208 return end 209 } 210 211 // acceptMinSize parses multiple tokens of the given size or greater. 212 // It returns the end position of the last token consumed. 213 func (s *scanner) acceptMinSize(min int) (end int) { 214 end = s.end 215 s.scan() 216 for ; len(s.token) >= min; s.scan() { 217 end = s.end 218 } 219 return end 220 } 221 222 // Parse parses the given BCP 47 string and returns a valid Tag. If parsing 223 // failed it returns an error and any part of the tag that could be parsed. 224 // If parsing succeeded but an unknown value was found, it returns 225 // ValueError. The Tag returned in this case is just stripped of the unknown 226 // value. All other values are preserved. It accepts tags in the BCP 47 format 227 // and extensions to this standard defined in 228 // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. 229 func Parse(s string) (t Tag, err error) { 230 // TODO: consider supporting old-style locale key-value pairs. 231 if s == "" { 232 return Und, ErrSyntax 233 } 234 if len(s) <= maxAltTaglen { 235 b := [maxAltTaglen]byte{} 236 for i, c := range s { 237 // Generating invalid UTF-8 is okay as it won't match. 
238 if 'A' <= c && c <= 'Z' { 239 c += 'a' - 'A' 240 } else if c == '_' { 241 c = '-' 242 } 243 b[i] = byte(c) 244 } 245 if t, ok := grandfathered(b); ok { 246 return t, nil 247 } 248 } 249 scan := makeScannerString(s) 250 return parse(&scan, s) 251 } 252 253 func parse(scan *scanner, s string) (t Tag, err error) { 254 t = Und 255 var end int 256 if n := len(scan.token); n <= 1 { 257 scan.toLower(0, len(scan.b)) 258 if n == 0 || scan.token[0] != 'x' { 259 return t, ErrSyntax 260 } 261 end = parseExtensions(scan) 262 } else if n >= 4 { 263 return Und, ErrSyntax 264 } else { // the usual case 265 t, end = parseTag(scan) 266 if n := len(scan.token); n == 1 { 267 t.pExt = uint16(end) 268 end = parseExtensions(scan) 269 } else if end < len(scan.b) { 270 scan.setError(ErrSyntax) 271 scan.b = scan.b[:end] 272 } 273 } 274 if int(t.pVariant) < len(scan.b) { 275 if end < len(s) { 276 s = s[:end] 277 } 278 if len(s) > 0 && tag.Compare(s, scan.b) == 0 { 279 t.str = s 280 } else { 281 t.str = string(scan.b) 282 } 283 } else { 284 t.pVariant, t.pExt = 0, 0 285 } 286 return t, scan.err 287 } 288 289 // parseTag parses language, script, region and variants. 290 // It returns a Tag and the end position in the input that was parsed. 291 func parseTag(scan *scanner) (t Tag, end int) { 292 var e error 293 // TODO: set an error if an unknown lang, script or region is encountered. 294 t.LangID, e = getLangID(scan.token) 295 scan.setError(e) 296 scan.replace(t.LangID.String()) 297 langStart := scan.start 298 end = scan.scan() 299 for len(scan.token) == 3 && isAlpha(scan.token[0]) { 300 // From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent 301 // to a tag of the form <extlang>. 
302 lang, e := getLangID(scan.token) 303 if lang != 0 { 304 t.LangID = lang 305 copy(scan.b[langStart:], lang.String()) 306 scan.b[langStart+3] = '-' 307 scan.start = langStart + 4 308 } 309 scan.gobble(e) 310 end = scan.scan() 311 } 312 if len(scan.token) == 4 && isAlpha(scan.token[0]) { 313 t.ScriptID, e = getScriptID(script, scan.token) 314 if t.ScriptID == 0 { 315 scan.gobble(e) 316 } 317 end = scan.scan() 318 } 319 if n := len(scan.token); n >= 2 && n <= 3 { 320 t.RegionID, e = getRegionID(scan.token) 321 if t.RegionID == 0 { 322 scan.gobble(e) 323 } else { 324 scan.replace(t.RegionID.String()) 325 } 326 end = scan.scan() 327 } 328 scan.toLower(scan.start, len(scan.b)) 329 t.pVariant = byte(end) 330 end = parseVariants(scan, end, t) 331 t.pExt = uint16(end) 332 return t, end 333 } 334 335 var separator = []byte{'-'} 336 337 // parseVariants scans tokens as long as each token is a valid variant string. 338 // Duplicate variants are removed. 339 func parseVariants(scan *scanner, end int, t Tag) int { 340 start := scan.start 341 varIDBuf := [4]uint8{} 342 variantBuf := [4][]byte{} 343 varID := varIDBuf[:0] 344 variant := variantBuf[:0] 345 last := -1 346 needSort := false 347 for ; len(scan.token) >= 4; scan.scan() { 348 // TODO: measure the impact of needing this conversion and redesign 349 // the data structure if there is an issue. 350 v, ok := variantIndex[string(scan.token)] 351 if !ok { 352 // unknown variant 353 // TODO: allow user-defined variants? 354 scan.gobble(NewValueError(scan.token)) 355 continue 356 } 357 varID = append(varID, v) 358 variant = append(variant, scan.token) 359 if !needSort { 360 if last < int(v) { 361 last = int(v) 362 } else { 363 needSort = true 364 // There is no legal combinations of more than 7 variants 365 // (and this is by no means a useful sequence). 
366 const maxVariants = 8 367 if len(varID) > maxVariants { 368 break 369 } 370 } 371 } 372 end = scan.end 373 } 374 if needSort { 375 sort.Sort(variantsSort{varID, variant}) 376 k, l := 0, -1 377 for i, v := range varID { 378 w := int(v) 379 if l == w { 380 // Remove duplicates. 381 continue 382 } 383 varID[k] = varID[i] 384 variant[k] = variant[i] 385 k++ 386 l = w 387 } 388 if str := bytes.Join(variant[:k], separator); len(str) == 0 { 389 end = start - 1 390 } else { 391 scan.resizeRange(start, end, len(str)) 392 copy(scan.b[scan.start:], str) 393 end = scan.end 394 } 395 } 396 return end 397 } 398 399 type variantsSort struct { 400 i []uint8 401 v [][]byte 402 } 403 404 func (s variantsSort) Len() int { 405 return len(s.i) 406 } 407 408 func (s variantsSort) Swap(i, j int) { 409 s.i[i], s.i[j] = s.i[j], s.i[i] 410 s.v[i], s.v[j] = s.v[j], s.v[i] 411 } 412 413 func (s variantsSort) Less(i, j int) bool { 414 return s.i[i] < s.i[j] 415 } 416 417 type bytesSort struct { 418 b [][]byte 419 n int // first n bytes to compare 420 } 421 422 func (b bytesSort) Len() int { 423 return len(b.b) 424 } 425 426 func (b bytesSort) Swap(i, j int) { 427 b.b[i], b.b[j] = b.b[j], b.b[i] 428 } 429 430 func (b bytesSort) Less(i, j int) bool { 431 for k := 0; k < b.n; k++ { 432 if b.b[i][k] == b.b[j][k] { 433 continue 434 } 435 return b.b[i][k] < b.b[j][k] 436 } 437 return false 438 } 439 440 // parseExtensions parses and normalizes the extensions in the buffer. 441 // It returns the last position of scan.b that is part of any extension. 442 // It also trims scan.b to remove excess parts accordingly. 
func parseExtensions(scan *scanner) int {
	start := scan.start
	exts := [][]byte{}
	private := []byte{}
	end := scan.end
	// Each extension starts with a single-character token (its singleton).
	for len(scan.token) == 1 {
		extStart := scan.start
		ext := scan.token[0]
		end = parseExtension(scan)
		extension := scan.b[extStart:end]
		if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
			// Too short to carry any content after the singleton: record a
			// syntax error and leave this extension out of the result.
			scan.setError(ErrSyntax)
			end = extStart
			continue
		} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
			// The first extension is private use or reaches the end of the
			// input: nothing to reorder, so cut the buffer here and return.
			scan.b = scan.b[:end]
			return end
		} else if ext == 'x' {
			// Private use is kept aside, appended last, and ends the loop.
			private = extension
			break
		}
		exts = append(exts, extension)
	}
	// Order the extensions by singleton (their first byte).
	sort.Sort(bytesSort{exts, 1})
	if len(private) > 0 {
		exts = append(exts, private)
	}
	// Rebuild the extension part of the buffer from the ordered pieces.
	scan.b = scan.b[:start]
	if len(exts) > 0 {
		scan.b = append(scan.b, bytes.Join(exts, separator)...)
	} else if start > 0 {
		// Strip trailing '-'.
		scan.b = scan.b[:start-1]
	}
	return end
}

// parseExtension parses a single extension and returns the position of
// the extension end.
func parseExtension(scan *scanner) int {
	start, end := scan.start, scan.end
	switch scan.token[0] {
	case 'u':
		// Attributes (tokens longer than 2 bytes) come first.
		attrStart := end
		scan.scan()
		for last := []byte{}; len(scan.token) > 2; scan.scan() {
			// NOTE(review): this fires whenever the token does not compare
			// less than last; as last starts out empty, the first attribute
			// always takes this branch - confirm that this is intended.
			if bytes.Compare(scan.token, last) != -1 {
				// Attributes are unsorted. Start over from scratch.
				p := attrStart + 1
				scan.next = p
				attrs := [][]byte{}
				for scan.scan(); len(scan.token) > 2; scan.scan() {
					attrs = append(attrs, scan.token)
					end = scan.end
				}
				sort.Sort(bytesSort{attrs, 3})
				// bytes.Join copies, so the sorted attributes can be written
				// back over the region they were read from.
				copy(scan.b[p:], bytes.Join(attrs, separator))
				break
			}
			last = scan.token
			end = scan.end
		}
		// Keys are 2-byte tokens, each followed by tokens of 3 or more bytes.
		var last, key []byte
		for attrEnd := end; len(scan.token) == 2; last = key {
			key = scan.token
			keyEnd := scan.end
			end = scan.acceptMinSize(3)
			// TODO: check key value validity
			if keyEnd == end || bytes.Compare(key, last) != 1 {
				// We have an invalid key or the keys are not sorted.
				// Start scanning keys from scratch and reorder.
				p := attrEnd + 1
				scan.next = p
				keys := [][]byte{}
				for scan.scan(); len(scan.token) == 2; {
					keyStart, keyEnd := scan.start, scan.end
					end = scan.acceptMinSize(3)
					if keyEnd != end {
						keys = append(keys, scan.b[keyStart:end])
					} else {
						// A key with no value after it is a syntax error and
						// is not collected.
						scan.setError(ErrSyntax)
						end = keyStart
					}
				}
				// Stable sort on the 2-byte key keeps the first occurrence of
				// a repeated key ahead of later ones.
				sort.Stable(bytesSort{keys, 2})
				if n := len(keys); n > 0 {
					// Compact in place, keeping one entry per key. A repeated
					// key with a different value is reported as an error.
					k := 0
					for i := 1; i < n; i++ {
						if !bytes.Equal(keys[k][:2], keys[i][:2]) {
							k++
							keys[k] = keys[i]
						} else if !bytes.Equal(keys[k], keys[i]) {
							scan.setError(ErrDuplicateKey)
						}
					}
					keys = keys[:k+1]
				}
				reordered := bytes.Join(keys, separator)
				if e := p + len(reordered); e < end {
					// The reordered keys are shorter than what was scanned:
					// close the gap before writing them back.
					scan.deleteRange(e, end)
					end = e
				}
				copy(scan.b[p:], reordered)
				break
			}
		}
	case 't':
		scan.scan()
		// An optional embedded language tag: a 2- or 3-byte token whose
		// second byte is a letter.
		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
			_, end = parseTag(scan)
			scan.toLower(start, end)
		}
		// Fields: a 2-byte token whose second byte is a digit, followed by
		// tokens of 3 or more bytes.
		for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
			end = scan.acceptMinSize(3)
		}
	case 'x':
		// Private use: accept every following token.
		end = scan.acceptMinSize(1)
	default:
		// Any other singleton: accept tokens of 2 or more bytes.
		end = scan.acceptMinSize(2)
	}
	return end
}

// getExtension returns the end position and the text of the extension of s
// that starts at position p. A '-' at p is skipped first.
func getExtension(s string, p int) (end int, ext string) {
	// Step over the separator in front of the extension, if present.
	if s[p] == '-' {
		p++
	}
	if s[p] != 'x' {
		end = nextExtension(s, p)
		return end, s[p:end]
	}
	// A private-use extension always runs to the end of the string.
	return len(s), s[p:]
}

// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p. It returns the position of the
// leading '-' of that pattern, or len(s) if there is none.
// In the vast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
	limit := len(s) - 3
	for p < limit {
		switch {
		case s[p] != '-':
			p++
		case s[p+2] == '-':
			return p
		default:
			// A separator that does not introduce a singleton; the subtag
			// behind it is at least two bytes long, so skip past it.
			p += 3
		}
	}
	return len(s)
}