github.com/liquid-dev/text@v0.3.3-liquid/internal/export/idna/idna10.0.0.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build go1.10 6 //go:generate go run gen.go gen_trieval.go gen_common.go 7 8 // Package idna implements IDNA2008 using the compatibility processing 9 // defined by UTS (Unicode Technical Standard) #46, which defines a standard to 10 // deal with the transition from IDNA2003. 11 // 12 // IDNA2008 (Internationalized Domain Names for Applications), is defined in RFC 13 // 5890, RFC 5891, RFC 5892, RFC 5893 and RFC 5894. 14 // UTS #46 is defined in https://www.unicode.org/reports/tr46. 15 // See https://unicode.org/cldr/utility/idna.jsp for a visualization of the 16 // differences between these two standards. 17 package idna // import "github.com/liquid-dev/text/internal/export/idna" 18 19 import ( 20 "fmt" 21 "strings" 22 "unicode/utf8" 23 24 "github.com/liquid-dev/text/secure/bidirule" 25 "github.com/liquid-dev/text/unicode/bidi" 26 "github.com/liquid-dev/text/unicode/norm" 27 ) 28 29 // NOTE: Unlike common practice in Go APIs, the functions will return a 30 // sanitized domain name in case of errors. Browsers sometimes use a partially 31 // evaluated string as lookup. 32 // TODO: the current error handling is, in my opinion, the least opinionated. 33 // Other strategies are also viable, though: 34 // Option 1) Return an empty string in case of error, but allow the user to 35 // specify explicitly which errors to ignore. 36 // Option 2) Return the partially evaluated string if it is itself a valid 37 // string, otherwise return the empty string in case of error. 38 // Option 3) Option 1 and 2. 39 // Option 4) Always return an empty string for now and implement Option 1 as 40 // needed, and document that the return string may not be empty in case of 41 // error in the future. 
42 // I think Option 1 is best, but it is quite opinionated. 43 44 // ToASCII is a wrapper for Punycode.ToASCII. 45 func ToASCII(s string) (string, error) { 46 return Punycode.process(s, true) 47 } 48 49 // ToUnicode is a wrapper for Punycode.ToUnicode. 50 func ToUnicode(s string) (string, error) { 51 return Punycode.process(s, false) 52 } 53 54 // An Option configures a Profile at creation time. 55 type Option func(*options) 56 57 // Transitional sets a Profile to use the Transitional mapping as defined in UTS 58 // #46. This will cause, for example, "ß" to be mapped to "ss". Using the 59 // transitional mapping provides a compromise between IDNA2003 and IDNA2008 60 // compatibility. It is used by most browsers when resolving domain names. This 61 // option is only meaningful if combined with MapForLookup. 62 func Transitional(transitional bool) Option { 63 return func(o *options) { o.transitional = true } 64 } 65 66 // VerifyDNSLength sets whether a Profile should fail if any of the IDN parts 67 // are longer than allowed by the RFC. 68 func VerifyDNSLength(verify bool) Option { 69 return func(o *options) { o.verifyDNSLength = verify } 70 } 71 72 // RemoveLeadingDots removes leading label separators. Leading runes that map to 73 // dots, such as U+3002 IDEOGRAPHIC FULL STOP, are removed as well. 74 // 75 // This is the behavior suggested by the UTS #46 and is adopted by some 76 // browsers. 77 func RemoveLeadingDots(remove bool) Option { 78 return func(o *options) { o.removeLeadingDots = remove } 79 } 80 81 // ValidateLabels sets whether to check the mandatory label validation criteria 82 // as defined in Section 5.4 of RFC 5891. This includes testing for correct use 83 // of hyphens ('-'), normalization, validity of runes, and the context rules. 84 func ValidateLabels(enable bool) Option { 85 return func(o *options) { 86 // Don't override existing mappings, but set one that at least checks 87 // normalization if it is not set. 
88 if o.mapping == nil && enable { 89 o.mapping = normalize 90 } 91 o.trie = trie 92 o.validateLabels = enable 93 o.fromPuny = validateFromPunycode 94 } 95 } 96 97 // StrictDomainName limits the set of permissible ASCII characters to those 98 // allowed in domain names as defined in RFC 1034 (A-Z, a-z, 0-9 and the 99 // hyphen). This is set by default for MapForLookup and ValidateForRegistration. 100 // 101 // This option is useful, for instance, for browsers that allow characters 102 // outside this range, for example a '_' (U+005F LOW LINE). See 103 // http://www.rfc-editor.org/std/std3.txt for more details This option 104 // corresponds to the UseSTD3ASCIIRules option in UTS #46. 105 func StrictDomainName(use bool) Option { 106 return func(o *options) { 107 o.trie = trie 108 o.useSTD3Rules = use 109 o.fromPuny = validateFromPunycode 110 } 111 } 112 113 // NOTE: the following options pull in tables. The tables should not be linked 114 // in as long as the options are not used. 115 116 // BidiRule enables the Bidi rule as defined in RFC 5893. Any application 117 // that relies on proper validation of labels should include this rule. 118 func BidiRule() Option { 119 return func(o *options) { o.bidirule = bidirule.ValidString } 120 } 121 122 // ValidateForRegistration sets validation options to verify that a given IDN is 123 // properly formatted for registration as defined by Section 4 of RFC 5891. 124 func ValidateForRegistration() Option { 125 return func(o *options) { 126 o.mapping = validateRegistration 127 StrictDomainName(true)(o) 128 ValidateLabels(true)(o) 129 VerifyDNSLength(true)(o) 130 BidiRule()(o) 131 } 132 } 133 134 // MapForLookup sets validation and mapping options such that a given IDN is 135 // transformed for domain name lookup according to the requirements set out in 136 // Section 5 of RFC 5891. The mappings follow the recommendations of RFC 5894, 137 // RFC 5895 and UTS 46. It does not add the Bidi Rule. 
Use the BidiRule option 138 // to add this check. 139 // 140 // The mappings include normalization and mapping case, width and other 141 // compatibility mappings. 142 func MapForLookup() Option { 143 return func(o *options) { 144 o.mapping = validateAndMap 145 StrictDomainName(true)(o) 146 ValidateLabels(true)(o) 147 } 148 } 149 150 type options struct { 151 transitional bool 152 useSTD3Rules bool 153 validateLabels bool 154 verifyDNSLength bool 155 removeLeadingDots bool 156 157 trie *idnaTrie 158 159 // fromPuny calls validation rules when converting A-labels to U-labels. 160 fromPuny func(p *Profile, s string) error 161 162 // mapping implements a validation and mapping step as defined in RFC 5895 163 // or UTS 46, tailored to, for example, domain registration or lookup. 164 mapping func(p *Profile, s string) (mapped string, isBidi bool, err error) 165 166 // bidirule, if specified, checks whether s conforms to the Bidi Rule 167 // defined in RFC 5893. 168 bidirule func(s string) bool 169 } 170 171 // A Profile defines the configuration of an IDNA mapper. 172 type Profile struct { 173 options 174 } 175 176 func apply(o *options, opts []Option) { 177 for _, f := range opts { 178 f(o) 179 } 180 } 181 182 // New creates a new Profile. 183 // 184 // With no options, the returned Profile is the most permissive and equals the 185 // Punycode Profile. Options can be passed to further restrict the Profile. The 186 // MapForLookup and ValidateForRegistration options set a collection of options, 187 // for lookup and registration purposes respectively, which can be tailored by 188 // adding more fine-grained options, where later options override earlier 189 // options. 190 func New(o ...Option) *Profile { 191 p := &Profile{} 192 apply(&p.options, o) 193 return p 194 } 195 196 // ToASCII converts a domain or domain label to its ASCII form. For example, 197 // ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and 198 // ToASCII("golang") is "golang". 
// If an error is encountered it will return
// an error and a (partially) processed result.
func (p *Profile) ToASCII(s string) (string, error) {
	return p.process(s, true)
}

// ToUnicode converts a domain or domain label to its Unicode form. For example,
// ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and
// ToUnicode("golang") is "golang". If an error is encountered it will return
// an error and a (partially) processed result.
//
// The conversion runs on a copy of the profile with the transitional flag
// cleared; p itself is not modified.
func (p *Profile) ToUnicode(s string) (string, error) {
	pp := *p
	pp.transitional = false
	return pp.process(s, false)
}

// String reports a string with a description of the profile for debugging
// purposes. The string format may change with different versions.
//
// The result starts with "Transitional" or "NonTransitional" and appends a
// ":"-prefixed tag for each of useSTD3Rules, validateLabels and
// verifyDNSLength that is enabled.
func (p *Profile) String() string {
	s := ""
	if p.transitional {
		s = "Transitional"
	} else {
		s = "NonTransitional"
	}
	if p.useSTD3Rules {
		s += ":UseSTD3Rules"
	}
	if p.validateLabels {
		s += ":ValidateLabels"
	}
	if p.verifyDNSLength {
		s += ":VerifyDNSLength"
	}
	return s
}

var (
	// Punycode is a Profile that does raw punycode processing with a minimum
	// of validation.
	Punycode *Profile = punycode

	// Lookup is the recommended profile for looking up domain names, according
	// to Section 5 of RFC 5891. The exact configuration of this profile may
	// change over time.
	Lookup *Profile = lookup

	// Display is the recommended profile for displaying domain names.
	// The configuration of this profile may change over time.
	Display *Profile = display

	// Registration is the recommended profile for checking whether a given
	// IDN is valid for registration, according to Section 4 of RFC 5891.
	Registration *Profile = registration

	// punycode has no options set: no mapping, no validation.
	punycode = &Profile{}
	// lookup maps and validates with the transitional flag set.
	lookup = &Profile{options{
		transitional:   true,
		useSTD3Rules:   true,
		validateLabels: true,
		trie:           trie,
		fromPuny:       validateFromPunycode,
		mapping:        validateAndMap,
		bidirule:       bidirule.ValidString,
	}}
	// display is configured like lookup but leaves the transitional flag unset.
	display = &Profile{options{
		useSTD3Rules:   true,
		validateLabels: true,
		trie:           trie,
		fromPuny:       validateFromPunycode,
		mapping:        validateAndMap,
		bidirule:       bidirule.ValidString,
	}}
	// registration validates without mapping (validateRegistration) and also
	// verifies DNS length limits.
	registration = &Profile{options{
		useSTD3Rules:    true,
		validateLabels:  true,
		verifyDNSLength: true,
		trie:            trie,
		fromPuny:        validateFromPunycode,
		mapping:         validateRegistration,
		bidirule:        bidirule.ValidString,
	}}

	// TODO: profiles
	// Register: recommended for approving domain names: don't do any mappings
	// but rather reject on invalid input. Bundle or block deviation characters.
)

// labelError reports an invalid label together with a short code naming the
// rule that failed (codes used in this file include "A4", "B", "C", "V1",
// "V2", "V3", "V5" and "V6").
type labelError struct{ label, code_ string }

// code returns the rule code stored in the error.
func (e labelError) code() string { return e.code_ }

// Error implements the error interface; only the label is included in the
// message, not the code.
func (e labelError) Error() string {
	return fmt.Sprintf("idna: invalid label %q", e.label)
}

// runeError reports a rune that is not allowed in the input.
type runeError rune

// code returns the fixed code "P1" used for every runeError.
func (e runeError) code() string { return "P1" }

// Error implements the error interface, formatting the rune as U+XXXX.
func (e runeError) Error() string {
	return fmt.Sprintf("idna: disallowed rune %U", e)
}

// process implements the algorithm described in section 4 of UTS #46,
// see https://www.unicode.org/reports/tr46.
//
// It applies the profile's mapping (if any), optionally strips leading dots,
// then walks the labels: labels carrying the ACE prefix are decoded and
// validated, other labels are validated as is. If any label is bidirectional
// and a bidi rule is configured, every label is checked against it. When
// toASCII is true, non-ASCII labels are encoded with the ACE prefix and, if
// verifyDNSLength is set, label and overall length limits are enforced.
//
// Only the first error encountered is kept; processing continues after an
// error so that the returned string is the (partially) processed result.
func (p *Profile) process(s string, toASCII bool) (string, error) {
	var err error
	var isBidi bool
	if p.mapping != nil {
		s, isBidi, err = p.mapping(p, s)
	}
	// Remove leading empty labels.
	if p.removeLeadingDots {
		for ; len(s) > 0 && s[0] == '.'; s = s[1:] {
		}
	}
	// TODO: allow for a quick check of the tables data.
	// It seems like we should only create this error on ToASCII, but the
	// UTS 46 conformance tests suggests we should always check this.
	if err == nil && p.verifyDNSLength && s == "" {
		err = &labelError{s, "A4"}
	}
	labels := labelIter{orig: s}
	for ; !labels.done(); labels.next() {
		label := labels.label()
		if label == "" {
			// Empty labels are not okay. The label iterator skips the last
			// label if it is empty.
			if err == nil && p.verifyDNSLength {
				err = &labelError{s, "A4"}
			}
			continue
		}
		if strings.HasPrefix(label, acePrefix) {
			u, err2 := decode(label[len(acePrefix):])
			if err2 != nil {
				if err == nil {
					err = err2
				}
				// Spec says keep the old label.
				continue
			}
			// A decoded label that is not purely left-to-right makes the whole
			// name subject to the bidi rule below.
			isBidi = isBidi || bidirule.DirectionString(u) != bidi.LeftToRight
			labels.set(u)
			if err == nil && p.validateLabels {
				err = p.fromPuny(p, u)
			}
			if err == nil {
				// This should be called on NonTransitional, according to the
				// spec, but that currently does not have any effect. Use the
				// original profile to preserve options.
				err = p.validateLabel(u)
			}
		} else if err == nil {
			err = p.validateLabel(label)
		}
	}
	// The bidi rule is only evaluated when no earlier error was recorded.
	if isBidi && p.bidirule != nil && err == nil {
		for labels.reset(); !labels.done(); labels.next() {
			if !p.bidirule(labels.label()) {
				err = &labelError{s, "B"}
				break
			}
		}
	}
	if toASCII {
		for labels.reset(); !labels.done(); labels.next() {
			label := labels.label()
			if !ascii(label) {
				a, err2 := encode(acePrefix, label)
				if err == nil {
					err = err2
				}
				// The encoded form replaces the label even if encode failed.
				label = a
				labels.set(a)
			}
			n := len(label)
			if p.verifyDNSLength && err == nil && (n == 0 || n > 63) {
				err = &labelError{label, "A4"}
			}
		}
	}
	s = labels.result()
	if toASCII && p.verifyDNSLength && err == nil {
		// Compute the length of the domain name minus the root label and its dot.
		n := len(s)
		if n > 0 && s[n-1] == '.' {
			n--
		}
		if len(s) < 1 || n > 253 {
			err = &labelError{s, "A4"}
		}
	}
	return s, err
}

// normalize is the minimal mapping step: it returns s in NFC form and reports
// whether the normalized string's direction is right-to-left. It never returns
// an error and does not use p.
func normalize(p *Profile, s string) (mapped string, isBidi bool, err error) {
	// TODO: consider first doing a quick check to see if any of these checks
	// need to be done. This will make it slower in the general case, but
	// faster in the common case.
	mapped = norm.NFC.String(s)
	isBidi = bidirule.DirectionString(mapped) == bidi.RightToLeft
	return mapped, isBidi, nil
}

// validateRegistration is the mapping step used for registration. It performs
// no mapping: s is always returned unchanged. It fails with a "V1" labelError
// if s is not in NFC, and with a runeError for the first rune whose simplified
// category is anything other than valid or deviation. bidi reports whether any
// rune examined before returning was bidirectional.
func validateRegistration(p *Profile, s string) (idem string, bidi bool, err error) {
	// TODO: filter need for normalization in loop below.
	if !norm.NFC.IsNormalString(s) {
		return s, false, &labelError{s, "V1"}
	}
	for i := 0; i < len(s); {
		v, sz := trie.lookupString(s[i:])
		if sz == 0 {
			// The trie consumed no bytes at this position; report U+FFFD.
			return s, bidi, runeError(utf8.RuneError)
		}
		bidi = bidi || info(v).isBidi(s[i:])
		// Classify the rune; nothing is copied or rewritten here.
		switch p.simplify(info(v).category()) {
		// TODO: handle the NV8 defined in the Unicode idna data set to allow
		// for strict conformance to IDNA2008.
		case valid, deviation:
		case disallowed, mapped, unknown, ignored:
			r, _ := utf8.DecodeRuneInString(s[i:])
			return s, bidi, runeError(r)
		}
		i += sz
	}
	return s, bidi, nil
}

// isBidi reports whether the rune at the start of s, whose trie value is c,
// counts as right-to-left. For unmapped runes the answer is read from c's
// attribute bits; for mapped runes it is looked up in the bidi tables, where
// classes R, AL and AN count as bidirectional.
func (c info) isBidi(s string) bool {
	if !c.isMapped() {
		return c&attributesMask == rtl
	}
	// TODO: also store bidi info for mapped data. This is possible, but a bit
	// cumbersome and not for the common case.
	p, _ := bidi.LookupString(s)
	switch p.Class() {
	case bidi.R, bidi.AL, bidi.AN:
		return true
	}
	return false
}

// validateAndMap is the mapping step used for lookup and display. It walks s
// rune by rune, replacing mapped and deviation runes with their mapping,
// dropping ignored runes, substituting U+FFFD for unknown runes and recording
// a runeError for the first disallowed rune (which is left in place). The
// result is NFC-normalized when needed. bidi reports whether any rune seen was
// bidirectional; err is the first error encountered, and the mapped string is
// returned even when err is non-nil.
func validateAndMap(p *Profile, s string) (vm string, bidi bool, err error) {
	var (
		b []byte // output buffer; stays nil while the input needs no rewriting
		k int    // index in s up to which input has been copied into b
	)
	// combinedInfoBits contains the or-ed bits of all runes. We use this
	// to derive the mayNeedNorm bit later. This may trigger normalization
	// overeagerly, but it will not do so in the common case. The end result
	// is another 10% saving on BenchmarkProfile for the common case.
	var combinedInfoBits info
	for i := 0; i < len(s); {
		v, sz := trie.lookupString(s[i:])
		if sz == 0 {
			// The trie consumed no bytes: flush what precedes, emit U+FFFD and
			// discard the remainder of the input.
			b = append(b, s[k:i]...)
			b = append(b, "\ufffd"...)
			k = len(s)
			if err == nil {
				err = runeError(utf8.RuneError)
			}
			break
		}
		combinedInfoBits |= info(v)
		bidi = bidi || info(v).isBidi(s[i:])
		start := i
		i += sz
		// Cases that rewrite the rune first copy the pending input s[k:start];
		// valid and disallowed runes leave k untouched so they are copied later.
		switch p.simplify(info(v).category()) {
		case valid:
			continue
		case disallowed:
			if err == nil {
				r, _ := utf8.DecodeRuneInString(s[start:])
				err = runeError(r)
			}
			continue
		case mapped, deviation:
			b = append(b, s[k:start]...)
			b = info(v).appendMapping(b, s[start:i])
		case ignored:
			b = append(b, s[k:start]...)
			// drop the rune
		case unknown:
			b = append(b, s[k:start]...)
			b = append(b, "\ufffd"...)
		}
		k = i
	}
	if k == 0 {
		// No changes so far.
		if combinedInfoBits&mayNeedNorm != 0 {
			s = norm.NFC.String(s)
		}
	} else {
		b = append(b, s[k:]...)
		if norm.NFC.QuickSpan(b) != len(b) {
			b = norm.NFC.Bytes(b)
		}
		// TODO: the punycode converters require strings as input.
		s = string(b)
	}
	return s, bidi, err
}

// A labelIter allows iterating over domain name labels.
type labelIter struct {
	orig     string   // the input domain name
	slice    []string // labels of orig; populated lazily by the first call to set
	curStart int      // byte offset in orig where the current label starts
	curEnd   int      // byte offset in orig just past the current label
	i        int      // index of the current label
}

// reset rewinds the iterator to the first label. Labels replaced through set
// are kept.
func (it *labelIter) reset() {
	it.curStart, it.curEnd, it.i = 0, 0, 0
}

// done reports whether the iterator has moved past the last label.
func (it *labelIter) done() bool {
	return len(it.orig) <= it.curStart
}

// result returns the domain name, rebuilt from the labels if any of them has
// been replaced and the original string otherwise.
func (it *labelIter) result() string {
	if it.slice == nil {
		return it.orig
	}
	return strings.Join(it.slice, ".")
}

// label returns the current label. While no label has been replaced it is
// sliced out of the original string, updating curEnd as a side effect.
func (it *labelIter) label() string {
	if it.slice != nil {
		return it.slice[it.i]
	}
	rest := it.orig[it.curStart:]
	if dot := strings.IndexByte(rest, '.'); dot != -1 {
		it.curEnd = it.curStart + dot
	} else {
		it.curEnd = len(it.orig)
	}
	return it.orig[it.curStart:it.curEnd]
}

// next sets the value to the next label. It skips the last label if it is empty.
func (it *labelIter) next() {
	it.i++
	if it.slice == nil {
		it.curStart = it.curEnd + 1
		if it.curStart == len(it.orig)-1 && it.orig[it.curStart] == '.' {
			it.curStart = len(it.orig)
		}
		return
	}
	count := len(it.slice)
	if it.i >= count || (it.i == count-1 && it.slice[it.i] == "") {
		// Mark the iterator as done; done only looks at curStart.
		it.curStart = len(it.orig)
	}
}

// set replaces the current label with s, splitting the original string into
// labels on first use.
func (it *labelIter) set(s string) {
	if it.slice == nil {
		it.slice = strings.Split(it.orig, ".")
	}
	it.slice[it.i] = s
}

// acePrefix is the ASCII Compatible Encoding prefix.
565 const acePrefix = "xn--" 566 567 func (p *Profile) simplify(cat category) category { 568 switch cat { 569 case disallowedSTD3Mapped: 570 if p.useSTD3Rules { 571 cat = disallowed 572 } else { 573 cat = mapped 574 } 575 case disallowedSTD3Valid: 576 if p.useSTD3Rules { 577 cat = disallowed 578 } else { 579 cat = valid 580 } 581 case deviation: 582 if !p.transitional { 583 cat = valid 584 } 585 case validNV8, validXV8: 586 // TODO: handle V2008 587 cat = valid 588 } 589 return cat 590 } 591 592 func validateFromPunycode(p *Profile, s string) error { 593 if !norm.NFC.IsNormalString(s) { 594 return &labelError{s, "V1"} 595 } 596 // TODO: detect whether string may have to be normalized in the following 597 // loop. 598 for i := 0; i < len(s); { 599 v, sz := trie.lookupString(s[i:]) 600 if sz == 0 { 601 return runeError(utf8.RuneError) 602 } 603 if c := p.simplify(info(v).category()); c != valid && c != deviation { 604 return &labelError{s, "V6"} 605 } 606 i += sz 607 } 608 return nil 609 } 610 611 const ( 612 zwnj = "\u200c" 613 zwj = "\u200d" 614 ) 615 616 type joinState int8 617 618 const ( 619 stateStart joinState = iota 620 stateVirama 621 stateBefore 622 stateBeforeVirama 623 stateAfter 624 stateFAIL 625 ) 626 627 var joinStates = [][numJoinTypes]joinState{ 628 stateStart: { 629 joiningL: stateBefore, 630 joiningD: stateBefore, 631 joinZWNJ: stateFAIL, 632 joinZWJ: stateFAIL, 633 joinVirama: stateVirama, 634 }, 635 stateVirama: { 636 joiningL: stateBefore, 637 joiningD: stateBefore, 638 }, 639 stateBefore: { 640 joiningL: stateBefore, 641 joiningD: stateBefore, 642 joiningT: stateBefore, 643 joinZWNJ: stateAfter, 644 joinZWJ: stateFAIL, 645 joinVirama: stateBeforeVirama, 646 }, 647 stateBeforeVirama: { 648 joiningL: stateBefore, 649 joiningD: stateBefore, 650 joiningT: stateBefore, 651 }, 652 stateAfter: { 653 joiningL: stateFAIL, 654 joiningD: stateBefore, 655 joiningT: stateAfter, 656 joiningR: stateStart, 657 joinZWNJ: stateFAIL, 658 joinZWJ: stateFAIL, 659 
joinVirama: stateAfter, // no-op as we can't accept joiners here 660 }, 661 stateFAIL: { 662 0: stateFAIL, 663 joiningL: stateFAIL, 664 joiningD: stateFAIL, 665 joiningT: stateFAIL, 666 joiningR: stateFAIL, 667 joinZWNJ: stateFAIL, 668 joinZWJ: stateFAIL, 669 joinVirama: stateFAIL, 670 }, 671 } 672 673 // validateLabel validates the criteria from Section 4.1. Item 1, 4, and 6 are 674 // already implicitly satisfied by the overall implementation. 675 func (p *Profile) validateLabel(s string) (err error) { 676 if s == "" { 677 if p.verifyDNSLength { 678 return &labelError{s, "A4"} 679 } 680 return nil 681 } 682 if !p.validateLabels { 683 return nil 684 } 685 trie := p.trie // p.validateLabels is only set if trie is set. 686 if len(s) > 4 && s[2] == '-' && s[3] == '-' { 687 return &labelError{s, "V2"} 688 } 689 if s[0] == '-' || s[len(s)-1] == '-' { 690 return &labelError{s, "V3"} 691 } 692 // TODO: merge the use of this in the trie. 693 v, sz := trie.lookupString(s) 694 x := info(v) 695 if x.isModifier() { 696 return &labelError{s, "V5"} 697 } 698 // Quickly return in the absence of zero-width (non) joiners. 699 if strings.Index(s, zwj) == -1 && strings.Index(s, zwnj) == -1 { 700 return nil 701 } 702 st := stateStart 703 for i := 0; ; { 704 jt := x.joinType() 705 if s[i:i+sz] == zwj { 706 jt = joinZWJ 707 } else if s[i:i+sz] == zwnj { 708 jt = joinZWNJ 709 } 710 st = joinStates[st][jt] 711 if x.isViramaModifier() { 712 st = joinStates[st][joinVirama] 713 } 714 if i += sz; i == len(s) { 715 break 716 } 717 v, sz = trie.lookupString(s[i:]) 718 x = info(v) 719 } 720 if st == stateFAIL || st == stateAfter { 721 return &labelError{s, "C"} 722 } 723 return nil 724 } 725 726 func ascii(s string) bool { 727 for i := 0; i < len(s); i++ { 728 if s[i] >= utf8.RuneSelf { 729 return false 730 } 731 } 732 return true 733 }