github.com/go-enjin/golang-org-x-text@v0.12.1-enjin.2/internal/export/idna/idna9.0.0.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:build !go1.10 6 // +build !go1.10 7 8 //go:generate go run gen.go gen_trieval.go gen_common.go 9 10 // Package idna implements IDNA2008 using the compatibility processing 11 // defined by UTS (Unicode Technical Standard) #46, which defines a standard to 12 // deal with the transition from IDNA2003. 13 // 14 // IDNA2008 (Internationalized Domain Names for Applications), is defined in RFC 15 // 5890, RFC 5891, RFC 5892, RFC 5893 and RFC 5894. 16 // UTS #46 is defined in https://www.unicode.org/reports/tr46. 17 // See https://unicode.org/cldr/utility/idna.jsp for a visualization of the 18 // differences between these two standards. 19 package idna // import "github.com/go-enjin/golang-org-x-text/internal/export/idna" 20 21 import ( 22 "fmt" 23 "strings" 24 "unicode/utf8" 25 26 "github.com/go-enjin/golang-org-x-text/secure/bidirule" 27 "github.com/go-enjin/golang-org-x-text/unicode/norm" 28 ) 29 30 // NOTE: Unlike common practice in Go APIs, the functions will return a 31 // sanitized domain name in case of errors. Browsers sometimes use a partially 32 // evaluated string as lookup. 33 // TODO: the current error handling is, in my opinion, the least opinionated. 34 // Other strategies are also viable, though: 35 // Option 1) Return an empty string in case of error, but allow the user to 36 // specify explicitly which errors to ignore. 37 // Option 2) Return the partially evaluated string if it is itself a valid 38 // string, otherwise return the empty string in case of error. 39 // Option 3) Option 1 and 2. 40 // Option 4) Always return an empty string for now and implement Option 1 as 41 // needed, and document that the return string may not be empty in case of 42 // error in the future. 43 // I think Option 1 is best, but it is quite opinionated. 44 45 // ToASCII is a wrapper for Punycode.ToASCII. 46 func ToASCII(s string) (string, error) { 47 return Punycode.process(s, true) 48 } 49 50 // ToUnicode is a wrapper for Punycode.ToUnicode. 51 func ToUnicode(s string) (string, error) { 52 return Punycode.process(s, false) 53 } 54 55 // An Option configures a Profile at creation time. 56 type Option func(*options) 57 58 // Transitional sets a Profile to use the Transitional mapping as defined in UTS 59 // #46. This will cause, for example, "ß" to be mapped to "ss". Using the 60 // transitional mapping provides a compromise between IDNA2003 and IDNA2008 61 // compatibility. It is used by some browsers when resolving domain names. This 62 // option is only meaningful if combined with MapForLookup. 63 func Transitional(transitional bool) Option { 64 return func(o *options) { o.transitional = transitional } 65 } 66 67 // VerifyDNSLength sets whether a Profile should fail if any of the IDN parts 68 // are longer than allowed by the RFC. 69 // 70 // This option corresponds to the VerifyDnsLength flag in UTS #46. 71 func VerifyDNSLength(verify bool) Option { 72 return func(o *options) { o.verifyDNSLength = verify } 73 } 74 75 // RemoveLeadingDots removes leading label separators. Leading runes that map to 76 // dots, such as U+3002 IDEOGRAPHIC FULL STOP, are removed as well. 77 func RemoveLeadingDots(remove bool) Option { 78 return func(o *options) { o.removeLeadingDots = remove } 79 } 80 81 // ValidateLabels sets whether to check the mandatory label validation criteria 82 // as defined in Section 5.4 of RFC 5891. This includes testing for correct use 83 // of hyphens ('-'), normalization, validity of runes, and the context rules. 84 // In particular, ValidateLabels also sets the CheckHyphens and CheckJoiners flags 85 // in UTS #46. 86 func ValidateLabels(enable bool) Option { 87 return func(o *options) { 88 // Don't override existing mappings, but set one that at least checks 89 // normalization if it is not set. 90 if o.mapping == nil && enable { 91 o.mapping = normalize 92 } 93 o.trie = trie 94 o.checkJoiners = enable 95 o.checkHyphens = enable 96 if enable { 97 o.fromPuny = validateFromPunycode 98 } else { 99 o.fromPuny = nil 100 } 101 } 102 } 103 104 // CheckHyphens sets whether to check for correct use of hyphens ('-') in 105 // labels. Most web browsers do not have this option set, since labels such as 106 // "r3---sn-apo3qvuoxuxbt-j5pe" are in common use. 107 // 108 // This option corresponds to the CheckHyphens flag in UTS #46. 109 func CheckHyphens(enable bool) Option { 110 return func(o *options) { o.checkHyphens = enable } 111 } 112 113 // CheckJoiners sets whether to check the ContextJ rules as defined in Appendix 114 // A of RFC 5892, concerning the use of joiner runes. 115 // 116 // This option corresponds to the CheckJoiners flag in UTS #46. 117 func CheckJoiners(enable bool) Option { 118 return func(o *options) { 119 o.trie = trie 120 o.checkJoiners = enable 121 } 122 } 123 124 // StrictDomainName limits the set of permissible ASCII characters to those 125 // allowed in domain names as defined in RFC 1034 (A-Z, a-z, 0-9 and the 126 // hyphen). This is set by default for MapForLookup and ValidateForRegistration, 127 // but is only useful if ValidateLabels is set. 128 // 129 // This option is useful, for instance, for browsers that allow characters 130 // outside this range, for example a '_' (U+005F LOW LINE). See 131 // http://www.rfc-editor.org/std/std3.txt for more details. 132 // 133 // This option corresponds to the UseSTD3ASCIIRules flag in UTS #46. 134 func StrictDomainName(use bool) Option { 135 return func(o *options) { o.useSTD3Rules = use } 136 } 137 138 // NOTE: the following options pull in tables. The tables should not be linked 139 // in as long as the options are not used. 140 141 // BidiRule enables the Bidi rule as defined in RFC 5893. Any application 142 // that relies on proper validation of labels should include this rule. 143 // 144 // This option corresponds to the CheckBidi flag in UTS #46. 145 func BidiRule() Option { 146 return func(o *options) { o.bidirule = bidirule.ValidString } 147 } 148 149 // ValidateForRegistration sets validation options to verify that a given IDN is 150 // properly formatted for registration as defined by Section 4 of RFC 5891. 151 func ValidateForRegistration() Option { 152 return func(o *options) { 153 o.mapping = validateRegistration 154 StrictDomainName(true)(o) 155 ValidateLabels(true)(o) 156 VerifyDNSLength(true)(o) 157 BidiRule()(o) 158 } 159 } 160 161 // MapForLookup sets validation and mapping options such that a given IDN is 162 // transformed for domain name lookup according to the requirements set out in 163 // Section 5 of RFC 5891. The mappings follow the recommendations of RFC 5894, 164 // RFC 5895 and UTS 46. It does not add the Bidi Rule. Use the BidiRule option 165 // to add this check. 166 // 167 // The mappings include normalization and mapping case, width and other 168 // compatibility mappings. 169 func MapForLookup() Option { 170 return func(o *options) { 171 o.mapping = validateAndMap 172 StrictDomainName(true)(o) 173 ValidateLabels(true)(o) 174 RemoveLeadingDots(true)(o) 175 } 176 } 177 178 type options struct { 179 transitional bool 180 useSTD3Rules bool 181 checkHyphens bool 182 checkJoiners bool 183 verifyDNSLength bool 184 removeLeadingDots bool 185 186 trie *idnaTrie 187 188 // fromPuny calls validation rules when converting A-labels to U-labels. 189 fromPuny func(p *Profile, s string) error 190 191 // mapping implements a validation and mapping step as defined in RFC 5895 192 // or UTS 46, tailored to, for example, domain registration or lookup. 193 mapping func(p *Profile, s string) (string, error) 194 195 // bidirule, if specified, checks whether s conforms to the Bidi Rule 196 // defined in RFC 5893. 197 bidirule func(s string) bool 198 } 199 200 // A Profile defines the configuration of an IDNA mapper. 201 type Profile struct { 202 options 203 } 204 205 func apply(o *options, opts []Option) { 206 for _, f := range opts { 207 f(o) 208 } 209 } 210 211 // New creates a new Profile. 212 // 213 // With no options, the returned Profile is the most permissive and equals the 214 // Punycode Profile. Options can be passed to further restrict the Profile. The 215 // MapForLookup and ValidateForRegistration options set a collection of options, 216 // for lookup and registration purposes respectively, which can be tailored by 217 // adding more fine-grained options, where later options override earlier 218 // options. 219 func New(o ...Option) *Profile { 220 p := &Profile{} 221 apply(&p.options, o) 222 return p 223 } 224 225 // ToASCII converts a domain or domain label to its ASCII form. For example, 226 // ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and 227 // ToASCII("golang") is "golang". If an error is encountered it will return 228 // an error and a (partially) processed result. 229 func (p *Profile) ToASCII(s string) (string, error) { 230 return p.process(s, true) 231 } 232 233 // ToUnicode converts a domain or domain label to its Unicode form. For example, 234 // ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and 235 // ToUnicode("golang") is "golang". If an error is encountered it will return 236 // an error and a (partially) processed result. 237 func (p *Profile) ToUnicode(s string) (string, error) { 238 pp := *p 239 pp.transitional = false 240 return pp.process(s, false) 241 } 242 243 // String reports a string with a description of the profile for debugging 244 // purposes. The string format may change with different versions. 245 func (p *Profile) String() string { 246 s := "" 247 if p.transitional { 248 s = "Transitional" 249 } else { 250 s = "NonTransitional" 251 } 252 if p.useSTD3Rules { 253 s += ":UseSTD3Rules" 254 } 255 if p.checkHyphens { 256 s += ":CheckHyphens" 257 } 258 if p.checkJoiners { 259 s += ":CheckJoiners" 260 } 261 if p.verifyDNSLength { 262 s += ":VerifyDNSLength" 263 } 264 return s 265 } 266 267 var ( 268 // Punycode is a Profile that does raw punycode processing with a minimum 269 // of validation. 270 Punycode *Profile = punycode 271 272 // Lookup is the recommended profile for looking up domain names, according 273 // to Section 5 of RFC 5891. The exact configuration of this profile may 274 // change over time. 275 Lookup *Profile = lookup 276 277 // Display is the recommended profile for displaying domain names. 278 // The configuration of this profile may change over time. 279 Display *Profile = display 280 281 // Registration is the recommended profile for checking whether a given 282 // IDN is valid for registration, according to Section 4 of RFC 5891. 283 Registration *Profile = registration 284 285 punycode = &Profile{} 286 lookup = &Profile{options{ 287 transitional: true, 288 removeLeadingDots: true, 289 useSTD3Rules: true, 290 checkHyphens: true, 291 checkJoiners: true, 292 trie: trie, 293 fromPuny: validateFromPunycode, 294 mapping: validateAndMap, 295 bidirule: bidirule.ValidString, 296 }} 297 display = &Profile{options{ 298 useSTD3Rules: true, 299 removeLeadingDots: true, 300 checkHyphens: true, 301 checkJoiners: true, 302 trie: trie, 303 fromPuny: validateFromPunycode, 304 mapping: validateAndMap, 305 bidirule: bidirule.ValidString, 306 }} 307 registration = &Profile{options{ 308 useSTD3Rules: true, 309 verifyDNSLength: true, 310 checkHyphens: true, 311 checkJoiners: true, 312 trie: trie, 313 fromPuny: validateFromPunycode, 314 mapping: validateRegistration, 315 bidirule: bidirule.ValidString, 316 }} 317 318 // TODO: profiles 319 // Register: recommended for approving domain names: don't do any mappings 320 // but rather reject on invalid input. Bundle or block deviation characters. 321 ) 322 323 type labelError struct{ label, code_ string } 324 325 func (e labelError) code() string { return e.code_ } 326 func (e labelError) Error() string { 327 return fmt.Sprintf("idna: invalid label %q", e.label) 328 } 329 330 type runeError rune 331 332 func (e runeError) code() string { return "P1" } 333 func (e runeError) Error() string { 334 return fmt.Sprintf("idna: disallowed rune %U", e) 335 } 336 337 // process implements the algorithm described in section 4 of UTS #46, 338 // see https://www.unicode.org/reports/tr46. 339 func (p *Profile) process(s string, toASCII bool) (string, error) { 340 var err error 341 if p.mapping != nil { 342 s, err = p.mapping(p, s) 343 } 344 // Remove leading empty labels. 345 if p.removeLeadingDots { 346 for ; len(s) > 0 && s[0] == '.'; s = s[1:] { 347 } 348 } 349 // It seems like we should only create this error on ToASCII, but the 350 // UTS 46 conformance tests suggests we should always check this. 351 if err == nil && p.verifyDNSLength && s == "" { 352 err = &labelError{s, "A4"} 353 } 354 labels := labelIter{orig: s} 355 for ; !labels.done(); labels.next() { 356 label := labels.label() 357 if label == "" { 358 // Empty labels are not okay. The label iterator skips the last 359 // label if it is empty. 360 if err == nil && p.verifyDNSLength { 361 err = &labelError{s, "A4"} 362 } 363 continue 364 } 365 if strings.HasPrefix(label, acePrefix) { 366 u, err2 := decode(label[len(acePrefix):]) 367 if err2 != nil { 368 if err == nil { 369 err = err2 370 } 371 // Spec says keep the old label. 372 continue 373 } 374 labels.set(u) 375 if err == nil && p.fromPuny != nil { 376 err = p.fromPuny(p, u) 377 } 378 if err == nil { 379 // This should be called on NonTransitional, according to the 380 // spec, but that currently does not have any effect. Use the 381 // original profile to preserve options. 382 err = p.validateLabel(u) 383 } 384 } else if err == nil { 385 err = p.validateLabel(label) 386 } 387 } 388 if toASCII { 389 for labels.reset(); !labels.done(); labels.next() { 390 label := labels.label() 391 if !ascii(label) { 392 a, err2 := encode(acePrefix, label) 393 if err == nil { 394 err = err2 395 } 396 label = a 397 labels.set(a) 398 } 399 n := len(label) 400 if p.verifyDNSLength && err == nil && (n == 0 || n > 63) { 401 err = &labelError{label, "A4"} 402 } 403 } 404 } 405 s = labels.result() 406 if toASCII && p.verifyDNSLength && err == nil { 407 // Compute the length of the domain name minus the root label and its dot. 408 n := len(s) 409 if n > 0 && s[n-1] == '.' { 410 n-- 411 } 412 if len(s) < 1 || n > 253 { 413 err = &labelError{s, "A4"} 414 } 415 } 416 return s, err 417 } 418 419 func normalize(p *Profile, s string) (string, error) { 420 return norm.NFC.String(s), nil 421 } 422 423 func validateRegistration(p *Profile, s string) (string, error) { 424 if !norm.NFC.IsNormalString(s) { 425 return s, &labelError{s, "V1"} 426 } 427 for i := 0; i < len(s); { 428 v, sz := trie.lookupString(s[i:]) 429 if sz == 0 { 430 return s, runeError(utf8.RuneError) 431 } 432 // Copy bytes not copied so far. 433 switch p.simplify(info(v).category()) { 434 // TODO: handle the NV8 defined in the Unicode idna data set to allow 435 // for strict conformance to IDNA2008. 436 case valid, deviation: 437 case disallowed, mapped, unknown, ignored: 438 r, _ := utf8.DecodeRuneInString(s[i:]) 439 return s, runeError(r) 440 } 441 i += sz 442 } 443 return s, nil 444 } 445 446 func validateAndMap(p *Profile, s string) (string, error) { 447 var ( 448 err error 449 b []byte 450 k int 451 ) 452 for i := 0; i < len(s); { 453 v, sz := trie.lookupString(s[i:]) 454 if sz == 0 { 455 b = append(b, s[k:i]...) 456 b = append(b, "\ufffd"...) 457 k = len(s) 458 if err == nil { 459 err = runeError(utf8.RuneError) 460 } 461 break 462 } 463 start := i 464 i += sz 465 // Copy bytes not copied so far. 466 switch p.simplify(info(v).category()) { 467 case valid: 468 continue 469 case disallowed: 470 if err == nil { 471 r, _ := utf8.DecodeRuneInString(s[start:]) 472 err = runeError(r) 473 } 474 continue 475 case mapped, deviation: 476 b = append(b, s[k:start]...) 477 b = info(v).appendMapping(b, s[start:i]) 478 case ignored: 479 b = append(b, s[k:start]...) 480 // drop the rune 481 case unknown: 482 b = append(b, s[k:start]...) 483 b = append(b, "\ufffd"...) 484 } 485 k = i 486 } 487 if k == 0 { 488 // No changes so far. 489 s = norm.NFC.String(s) 490 } else { 491 b = append(b, s[k:]...) 492 if norm.NFC.QuickSpan(b) != len(b) { 493 b = norm.NFC.Bytes(b) 494 } 495 // TODO: the punycode converters require strings as input. 496 s = string(b) 497 } 498 return s, err 499 } 500 501 // A labelIter allows iterating over domain name labels. 502 type labelIter struct { 503 orig string 504 slice []string 505 curStart int 506 curEnd int 507 i int 508 } 509 510 func (l *labelIter) reset() { 511 l.curStart = 0 512 l.curEnd = 0 513 l.i = 0 514 } 515 516 func (l *labelIter) done() bool { 517 return l.curStart >= len(l.orig) 518 } 519 520 func (l *labelIter) result() string { 521 if l.slice != nil { 522 return strings.Join(l.slice, ".") 523 } 524 return l.orig 525 } 526 527 func (l *labelIter) label() string { 528 if l.slice != nil { 529 return l.slice[l.i] 530 } 531 p := strings.IndexByte(l.orig[l.curStart:], '.') 532 l.curEnd = l.curStart + p 533 if p == -1 { 534 l.curEnd = len(l.orig) 535 } 536 return l.orig[l.curStart:l.curEnd] 537 } 538 539 // next sets the value to the next label. It skips the last label if it is empty. 540 func (l *labelIter) next() { 541 l.i++ 542 if l.slice != nil { 543 if l.i >= len(l.slice) || l.i == len(l.slice)-1 && l.slice[l.i] == "" { 544 l.curStart = len(l.orig) 545 } 546 } else { 547 l.curStart = l.curEnd + 1 548 if l.curStart == len(l.orig)-1 && l.orig[l.curStart] == '.' { 549 l.curStart = len(l.orig) 550 } 551 } 552 } 553 554 func (l *labelIter) set(s string) { 555 if l.slice == nil { 556 l.slice = strings.Split(l.orig, ".") 557 } 558 l.slice[l.i] = s 559 } 560 561 // acePrefix is the ASCII Compatible Encoding prefix. 562 const acePrefix = "xn--" 563 564 func (p *Profile) simplify(cat category) category { 565 switch cat { 566 case disallowedSTD3Mapped: 567 if p.useSTD3Rules { 568 cat = disallowed 569 } else { 570 cat = mapped 571 } 572 case disallowedSTD3Valid: 573 if p.useSTD3Rules { 574 cat = disallowed 575 } else { 576 cat = valid 577 } 578 case deviation: 579 if !p.transitional { 580 cat = valid 581 } 582 case validNV8, validXV8: 583 // TODO: handle V2008 584 cat = valid 585 } 586 return cat 587 } 588 589 func validateFromPunycode(p *Profile, s string) error { 590 if !norm.NFC.IsNormalString(s) { 591 return &labelError{s, "V1"} 592 } 593 for i := 0; i < len(s); { 594 v, sz := trie.lookupString(s[i:]) 595 if sz == 0 { 596 return runeError(utf8.RuneError) 597 } 598 if c := p.simplify(info(v).category()); c != valid && c != deviation { 599 return &labelError{s, "V6"} 600 } 601 i += sz 602 } 603 return nil 604 } 605 606 const ( 607 zwnj = "\u200c" 608 zwj = "\u200d" 609 ) 610 611 type joinState int8 612 613 const ( 614 stateStart joinState = iota 615 stateVirama 616 stateBefore 617 stateBeforeVirama 618 stateAfter 619 stateFAIL 620 ) 621 622 var joinStates = [][numJoinTypes]joinState{ 623 stateStart: { 624 joiningL: stateBefore, 625 joiningD: stateBefore, 626 joinZWNJ: stateFAIL, 627 joinZWJ: stateFAIL, 628 joinVirama: stateVirama, 629 }, 630 stateVirama: { 631 joiningL: stateBefore, 632 joiningD: stateBefore, 633 }, 634 stateBefore: { 635 joiningL: stateBefore, 636 joiningD: stateBefore, 637 joiningT: stateBefore, 638 joinZWNJ: stateAfter, 639 joinZWJ: stateFAIL, 640 joinVirama: stateBeforeVirama, 641 }, 642 stateBeforeVirama: { 643 joiningL: stateBefore, 644 joiningD: stateBefore, 645 joiningT: stateBefore, 646 }, 647 stateAfter: { 648 joiningL: stateFAIL, 649 joiningD: stateBefore, 650 joiningT: stateAfter, 651 joiningR: stateStart, 652 joinZWNJ: stateFAIL, 653 joinZWJ: stateFAIL, 654 joinVirama: stateAfter, // no-op as we can't accept joiners here 655 }, 656 stateFAIL: { 657 0: stateFAIL, 658 joiningL: stateFAIL, 659 joiningD: stateFAIL, 660 joiningT: stateFAIL, 661 joiningR: stateFAIL, 662 joinZWNJ: stateFAIL, 663 joinZWJ: stateFAIL, 664 joinVirama: stateFAIL, 665 }, 666 } 667 668 // validateLabel validates the criteria from Section 4.1. Item 1, 4, and 6 are 669 // already implicitly satisfied by the overall implementation. 670 func (p *Profile) validateLabel(s string) error { 671 if s == "" { 672 if p.verifyDNSLength { 673 return &labelError{s, "A4"} 674 } 675 return nil 676 } 677 if p.bidirule != nil && !p.bidirule(s) { 678 return &labelError{s, "B"} 679 } 680 if p.checkHyphens { 681 if len(s) > 4 && s[2] == '-' && s[3] == '-' { 682 return &labelError{s, "V2"} 683 } 684 if s[0] == '-' || s[len(s)-1] == '-' { 685 return &labelError{s, "V3"} 686 } 687 } 688 if !p.checkJoiners { 689 return nil 690 } 691 trie := p.trie // p.checkJoiners is only set if trie is set. 692 // TODO: merge the use of this in the trie. 693 v, sz := trie.lookupString(s) 694 x := info(v) 695 if x.isModifier() { 696 return &labelError{s, "V5"} 697 } 698 // Quickly return in the absence of zero-width (non) joiners. 699 if strings.Index(s, zwj) == -1 && strings.Index(s, zwnj) == -1 { 700 return nil 701 } 702 st := stateStart 703 for i := 0; ; { 704 jt := x.joinType() 705 if s[i:i+sz] == zwj { 706 jt = joinZWJ 707 } else if s[i:i+sz] == zwnj { 708 jt = joinZWNJ 709 } 710 st = joinStates[st][jt] 711 if x.isViramaModifier() { 712 st = joinStates[st][joinVirama] 713 } 714 if i += sz; i == len(s) { 715 break 716 } 717 v, sz = trie.lookupString(s[i:]) 718 x = info(v) 719 } 720 if st == stateFAIL || st == stateAfter { 721 return &labelError{s, "C"} 722 } 723 return nil 724 } 725 726 func ascii(s string) bool { 727 for i := 0; i < len(s); i++ { 728 if s[i] >= utf8.RuneSelf { 729 return false 730 } 731 } 732 return true 733 }