// golang.org/x/text@v0.14.0/internal/export/idna/idna9.0.0.go

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !go1.10

//go:generate go run gen.go gen_trieval.go gen_common.go

// Package idna implements IDNA2008 using the compatibility processing
// defined by UTS (Unicode Technical Standard) #46, which defines a standard to
// deal with the transition from IDNA2003.
//
// IDNA2008 (Internationalized Domain Names for Applications) is defined in RFC
// 5890, RFC 5891, RFC 5892, RFC 5893 and RFC 5894.
// UTS #46 is defined in https://www.unicode.org/reports/tr46.
// See https://unicode.org/cldr/utility/idna.jsp for a visualization of the
// differences between these two standards.
package idna // import "golang.org/x/text/internal/export/idna"

import (
	"fmt"
	"strings"
	"unicode/utf8"

	"golang.org/x/text/secure/bidirule"
	"golang.org/x/text/unicode/norm"
)

// NOTE: Unlike common practice in Go APIs, the functions will return a
// sanitized domain name in case of errors. Browsers sometimes use a partially
// evaluated string as lookup.
// TODO: the current error handling is, in my opinion, the least opinionated.
// Other strategies are also viable, though:
// Option 1) Return an empty string in case of error, but allow the user to
//    specify explicitly which errors to ignore.
// Option 2) Return the partially evaluated string if it is itself a valid
//    string, otherwise return the empty string in case of error.
// Option 3) Option 1 and 2.
// Option 4) Always return an empty string for now and implement Option 1 as
//    needed, and document that the return string may not be empty in case of
//    error in the future.
// I think Option 1 is best, but it is quite opinionated.

// ToASCII is a wrapper for Punycode.ToASCII.
45 func ToASCII(s string) (string, error) { 46 return Punycode.process(s, true) 47 } 48 49 // ToUnicode is a wrapper for Punycode.ToUnicode. 50 func ToUnicode(s string) (string, error) { 51 return Punycode.process(s, false) 52 } 53 54 // An Option configures a Profile at creation time. 55 type Option func(*options) 56 57 // Transitional sets a Profile to use the Transitional mapping as defined in UTS 58 // #46. This will cause, for example, "ß" to be mapped to "ss". Using the 59 // transitional mapping provides a compromise between IDNA2003 and IDNA2008 60 // compatibility. It is used by some browsers when resolving domain names. This 61 // option is only meaningful if combined with MapForLookup. 62 func Transitional(transitional bool) Option { 63 return func(o *options) { o.transitional = transitional } 64 } 65 66 // VerifyDNSLength sets whether a Profile should fail if any of the IDN parts 67 // are longer than allowed by the RFC. 68 // 69 // This option corresponds to the VerifyDnsLength flag in UTS #46. 70 func VerifyDNSLength(verify bool) Option { 71 return func(o *options) { o.verifyDNSLength = verify } 72 } 73 74 // RemoveLeadingDots removes leading label separators. Leading runes that map to 75 // dots, such as U+3002 IDEOGRAPHIC FULL STOP, are removed as well. 76 func RemoveLeadingDots(remove bool) Option { 77 return func(o *options) { o.removeLeadingDots = remove } 78 } 79 80 // ValidateLabels sets whether to check the mandatory label validation criteria 81 // as defined in Section 5.4 of RFC 5891. This includes testing for correct use 82 // of hyphens ('-'), normalization, validity of runes, and the context rules. 83 // In particular, ValidateLabels also sets the CheckHyphens and CheckJoiners flags 84 // in UTS #46. 85 func ValidateLabels(enable bool) Option { 86 return func(o *options) { 87 // Don't override existing mappings, but set one that at least checks 88 // normalization if it is not set. 
89 if o.mapping == nil && enable { 90 o.mapping = normalize 91 } 92 o.trie = trie 93 o.checkJoiners = enable 94 o.checkHyphens = enable 95 if enable { 96 o.fromPuny = validateFromPunycode 97 } else { 98 o.fromPuny = nil 99 } 100 } 101 } 102 103 // CheckHyphens sets whether to check for correct use of hyphens ('-') in 104 // labels. Most web browsers do not have this option set, since labels such as 105 // "r3---sn-apo3qvuoxuxbt-j5pe" are in common use. 106 // 107 // This option corresponds to the CheckHyphens flag in UTS #46. 108 func CheckHyphens(enable bool) Option { 109 return func(o *options) { o.checkHyphens = enable } 110 } 111 112 // CheckJoiners sets whether to check the ContextJ rules as defined in Appendix 113 // A of RFC 5892, concerning the use of joiner runes. 114 // 115 // This option corresponds to the CheckJoiners flag in UTS #46. 116 func CheckJoiners(enable bool) Option { 117 return func(o *options) { 118 o.trie = trie 119 o.checkJoiners = enable 120 } 121 } 122 123 // StrictDomainName limits the set of permissible ASCII characters to those 124 // allowed in domain names as defined in RFC 1034 (A-Z, a-z, 0-9 and the 125 // hyphen). This is set by default for MapForLookup and ValidateForRegistration, 126 // but is only useful if ValidateLabels is set. 127 // 128 // This option is useful, for instance, for browsers that allow characters 129 // outside this range, for example a '_' (U+005F LOW LINE). See 130 // http://www.rfc-editor.org/std/std3.txt for more details. 131 // 132 // This option corresponds to the UseSTD3ASCIIRules flag in UTS #46. 133 func StrictDomainName(use bool) Option { 134 return func(o *options) { o.useSTD3Rules = use } 135 } 136 137 // NOTE: the following options pull in tables. The tables should not be linked 138 // in as long as the options are not used. 139 140 // BidiRule enables the Bidi rule as defined in RFC 5893. Any application 141 // that relies on proper validation of labels should include this rule. 
142 // 143 // This option corresponds to the CheckBidi flag in UTS #46. 144 func BidiRule() Option { 145 return func(o *options) { o.bidirule = bidirule.ValidString } 146 } 147 148 // ValidateForRegistration sets validation options to verify that a given IDN is 149 // properly formatted for registration as defined by Section 4 of RFC 5891. 150 func ValidateForRegistration() Option { 151 return func(o *options) { 152 o.mapping = validateRegistration 153 StrictDomainName(true)(o) 154 ValidateLabels(true)(o) 155 VerifyDNSLength(true)(o) 156 BidiRule()(o) 157 } 158 } 159 160 // MapForLookup sets validation and mapping options such that a given IDN is 161 // transformed for domain name lookup according to the requirements set out in 162 // Section 5 of RFC 5891. The mappings follow the recommendations of RFC 5894, 163 // RFC 5895 and UTS 46. It does not add the Bidi Rule. Use the BidiRule option 164 // to add this check. 165 // 166 // The mappings include normalization and mapping case, width and other 167 // compatibility mappings. 168 func MapForLookup() Option { 169 return func(o *options) { 170 o.mapping = validateAndMap 171 StrictDomainName(true)(o) 172 ValidateLabels(true)(o) 173 RemoveLeadingDots(true)(o) 174 } 175 } 176 177 type options struct { 178 transitional bool 179 useSTD3Rules bool 180 checkHyphens bool 181 checkJoiners bool 182 verifyDNSLength bool 183 removeLeadingDots bool 184 185 trie *idnaTrie 186 187 // fromPuny calls validation rules when converting A-labels to U-labels. 188 fromPuny func(p *Profile, s string) error 189 190 // mapping implements a validation and mapping step as defined in RFC 5895 191 // or UTS 46, tailored to, for example, domain registration or lookup. 192 mapping func(p *Profile, s string) (string, error) 193 194 // bidirule, if specified, checks whether s conforms to the Bidi Rule 195 // defined in RFC 5893. 196 bidirule func(s string) bool 197 } 198 199 // A Profile defines the configuration of an IDNA mapper. 
200 type Profile struct { 201 options 202 } 203 204 func apply(o *options, opts []Option) { 205 for _, f := range opts { 206 f(o) 207 } 208 } 209 210 // New creates a new Profile. 211 // 212 // With no options, the returned Profile is the most permissive and equals the 213 // Punycode Profile. Options can be passed to further restrict the Profile. The 214 // MapForLookup and ValidateForRegistration options set a collection of options, 215 // for lookup and registration purposes respectively, which can be tailored by 216 // adding more fine-grained options, where later options override earlier 217 // options. 218 func New(o ...Option) *Profile { 219 p := &Profile{} 220 apply(&p.options, o) 221 return p 222 } 223 224 // ToASCII converts a domain or domain label to its ASCII form. For example, 225 // ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and 226 // ToASCII("golang") is "golang". If an error is encountered it will return 227 // an error and a (partially) processed result. 228 func (p *Profile) ToASCII(s string) (string, error) { 229 return p.process(s, true) 230 } 231 232 // ToUnicode converts a domain or domain label to its Unicode form. For example, 233 // ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and 234 // ToUnicode("golang") is "golang". If an error is encountered it will return 235 // an error and a (partially) processed result. 236 func (p *Profile) ToUnicode(s string) (string, error) { 237 pp := *p 238 pp.transitional = false 239 return pp.process(s, false) 240 } 241 242 // String reports a string with a description of the profile for debugging 243 // purposes. The string format may change with different versions. 
244 func (p *Profile) String() string { 245 s := "" 246 if p.transitional { 247 s = "Transitional" 248 } else { 249 s = "NonTransitional" 250 } 251 if p.useSTD3Rules { 252 s += ":UseSTD3Rules" 253 } 254 if p.checkHyphens { 255 s += ":CheckHyphens" 256 } 257 if p.checkJoiners { 258 s += ":CheckJoiners" 259 } 260 if p.verifyDNSLength { 261 s += ":VerifyDNSLength" 262 } 263 return s 264 } 265 266 var ( 267 // Punycode is a Profile that does raw punycode processing with a minimum 268 // of validation. 269 Punycode *Profile = punycode 270 271 // Lookup is the recommended profile for looking up domain names, according 272 // to Section 5 of RFC 5891. The exact configuration of this profile may 273 // change over time. 274 Lookup *Profile = lookup 275 276 // Display is the recommended profile for displaying domain names. 277 // The configuration of this profile may change over time. 278 Display *Profile = display 279 280 // Registration is the recommended profile for checking whether a given 281 // IDN is valid for registration, according to Section 4 of RFC 5891. 
282 Registration *Profile = registration 283 284 punycode = &Profile{} 285 lookup = &Profile{options{ 286 transitional: true, 287 removeLeadingDots: true, 288 useSTD3Rules: true, 289 checkHyphens: true, 290 checkJoiners: true, 291 trie: trie, 292 fromPuny: validateFromPunycode, 293 mapping: validateAndMap, 294 bidirule: bidirule.ValidString, 295 }} 296 display = &Profile{options{ 297 useSTD3Rules: true, 298 removeLeadingDots: true, 299 checkHyphens: true, 300 checkJoiners: true, 301 trie: trie, 302 fromPuny: validateFromPunycode, 303 mapping: validateAndMap, 304 bidirule: bidirule.ValidString, 305 }} 306 registration = &Profile{options{ 307 useSTD3Rules: true, 308 verifyDNSLength: true, 309 checkHyphens: true, 310 checkJoiners: true, 311 trie: trie, 312 fromPuny: validateFromPunycode, 313 mapping: validateRegistration, 314 bidirule: bidirule.ValidString, 315 }} 316 317 // TODO: profiles 318 // Register: recommended for approving domain names: don't do any mappings 319 // but rather reject on invalid input. Bundle or block deviation characters. 320 ) 321 322 type labelError struct{ label, code_ string } 323 324 func (e labelError) code() string { return e.code_ } 325 func (e labelError) Error() string { 326 return fmt.Sprintf("idna: invalid label %q", e.label) 327 } 328 329 type runeError rune 330 331 func (e runeError) code() string { return "P1" } 332 func (e runeError) Error() string { 333 return fmt.Sprintf("idna: disallowed rune %U", e) 334 } 335 336 // process implements the algorithm described in section 4 of UTS #46, 337 // see https://www.unicode.org/reports/tr46. 338 func (p *Profile) process(s string, toASCII bool) (string, error) { 339 var err error 340 if p.mapping != nil { 341 s, err = p.mapping(p, s) 342 } 343 // Remove leading empty labels. 
344 if p.removeLeadingDots { 345 for ; len(s) > 0 && s[0] == '.'; s = s[1:] { 346 } 347 } 348 // It seems like we should only create this error on ToASCII, but the 349 // UTS 46 conformance tests suggests we should always check this. 350 if err == nil && p.verifyDNSLength && s == "" { 351 err = &labelError{s, "A4"} 352 } 353 labels := labelIter{orig: s} 354 for ; !labels.done(); labels.next() { 355 label := labels.label() 356 if label == "" { 357 // Empty labels are not okay. The label iterator skips the last 358 // label if it is empty. 359 if err == nil && p.verifyDNSLength { 360 err = &labelError{s, "A4"} 361 } 362 continue 363 } 364 if strings.HasPrefix(label, acePrefix) { 365 u, err2 := decode(label[len(acePrefix):]) 366 if err2 != nil { 367 if err == nil { 368 err = err2 369 } 370 // Spec says keep the old label. 371 continue 372 } 373 labels.set(u) 374 if err == nil && p.fromPuny != nil { 375 err = p.fromPuny(p, u) 376 } 377 if err == nil { 378 // This should be called on NonTransitional, according to the 379 // spec, but that currently does not have any effect. Use the 380 // original profile to preserve options. 381 err = p.validateLabel(u) 382 } 383 } else if err == nil { 384 err = p.validateLabel(label) 385 } 386 } 387 if toASCII { 388 for labels.reset(); !labels.done(); labels.next() { 389 label := labels.label() 390 if !ascii(label) { 391 a, err2 := encode(acePrefix, label) 392 if err == nil { 393 err = err2 394 } 395 label = a 396 labels.set(a) 397 } 398 n := len(label) 399 if p.verifyDNSLength && err == nil && (n == 0 || n > 63) { 400 err = &labelError{label, "A4"} 401 } 402 } 403 } 404 s = labels.result() 405 if toASCII && p.verifyDNSLength && err == nil { 406 // Compute the length of the domain name minus the root label and its dot. 407 n := len(s) 408 if n > 0 && s[n-1] == '.' 
{ 409 n-- 410 } 411 if len(s) < 1 || n > 253 { 412 err = &labelError{s, "A4"} 413 } 414 } 415 return s, err 416 } 417 418 func normalize(p *Profile, s string) (string, error) { 419 return norm.NFC.String(s), nil 420 } 421 422 func validateRegistration(p *Profile, s string) (string, error) { 423 if !norm.NFC.IsNormalString(s) { 424 return s, &labelError{s, "V1"} 425 } 426 for i := 0; i < len(s); { 427 v, sz := trie.lookupString(s[i:]) 428 if sz == 0 { 429 return s, runeError(utf8.RuneError) 430 } 431 // Copy bytes not copied so far. 432 switch p.simplify(info(v).category()) { 433 // TODO: handle the NV8 defined in the Unicode idna data set to allow 434 // for strict conformance to IDNA2008. 435 case valid, deviation: 436 case disallowed, mapped, unknown, ignored: 437 r, _ := utf8.DecodeRuneInString(s[i:]) 438 return s, runeError(r) 439 } 440 i += sz 441 } 442 return s, nil 443 } 444 445 func validateAndMap(p *Profile, s string) (string, error) { 446 var ( 447 err error 448 b []byte 449 k int 450 ) 451 for i := 0; i < len(s); { 452 v, sz := trie.lookupString(s[i:]) 453 if sz == 0 { 454 b = append(b, s[k:i]...) 455 b = append(b, "\ufffd"...) 456 k = len(s) 457 if err == nil { 458 err = runeError(utf8.RuneError) 459 } 460 break 461 } 462 start := i 463 i += sz 464 // Copy bytes not copied so far. 465 switch p.simplify(info(v).category()) { 466 case valid: 467 continue 468 case disallowed: 469 if err == nil { 470 r, _ := utf8.DecodeRuneInString(s[start:]) 471 err = runeError(r) 472 } 473 continue 474 case mapped, deviation: 475 b = append(b, s[k:start]...) 476 b = info(v).appendMapping(b, s[start:i]) 477 case ignored: 478 b = append(b, s[k:start]...) 479 // drop the rune 480 case unknown: 481 b = append(b, s[k:start]...) 482 b = append(b, "\ufffd"...) 483 } 484 k = i 485 } 486 if k == 0 { 487 // No changes so far. 488 s = norm.NFC.String(s) 489 } else { 490 b = append(b, s[k:]...) 
491 if norm.NFC.QuickSpan(b) != len(b) { 492 b = norm.NFC.Bytes(b) 493 } 494 // TODO: the punycode converters require strings as input. 495 s = string(b) 496 } 497 return s, err 498 } 499 500 // A labelIter allows iterating over domain name labels. 501 type labelIter struct { 502 orig string 503 slice []string 504 curStart int 505 curEnd int 506 i int 507 } 508 509 func (l *labelIter) reset() { 510 l.curStart = 0 511 l.curEnd = 0 512 l.i = 0 513 } 514 515 func (l *labelIter) done() bool { 516 return l.curStart >= len(l.orig) 517 } 518 519 func (l *labelIter) result() string { 520 if l.slice != nil { 521 return strings.Join(l.slice, ".") 522 } 523 return l.orig 524 } 525 526 func (l *labelIter) label() string { 527 if l.slice != nil { 528 return l.slice[l.i] 529 } 530 p := strings.IndexByte(l.orig[l.curStart:], '.') 531 l.curEnd = l.curStart + p 532 if p == -1 { 533 l.curEnd = len(l.orig) 534 } 535 return l.orig[l.curStart:l.curEnd] 536 } 537 538 // next sets the value to the next label. It skips the last label if it is empty. 539 func (l *labelIter) next() { 540 l.i++ 541 if l.slice != nil { 542 if l.i >= len(l.slice) || l.i == len(l.slice)-1 && l.slice[l.i] == "" { 543 l.curStart = len(l.orig) 544 } 545 } else { 546 l.curStart = l.curEnd + 1 547 if l.curStart == len(l.orig)-1 && l.orig[l.curStart] == '.' { 548 l.curStart = len(l.orig) 549 } 550 } 551 } 552 553 func (l *labelIter) set(s string) { 554 if l.slice == nil { 555 l.slice = strings.Split(l.orig, ".") 556 } 557 l.slice[l.i] = s 558 } 559 560 // acePrefix is the ASCII Compatible Encoding prefix. 
561 const acePrefix = "xn--" 562 563 func (p *Profile) simplify(cat category) category { 564 switch cat { 565 case disallowedSTD3Mapped: 566 if p.useSTD3Rules { 567 cat = disallowed 568 } else { 569 cat = mapped 570 } 571 case disallowedSTD3Valid: 572 if p.useSTD3Rules { 573 cat = disallowed 574 } else { 575 cat = valid 576 } 577 case deviation: 578 if !p.transitional { 579 cat = valid 580 } 581 case validNV8, validXV8: 582 // TODO: handle V2008 583 cat = valid 584 } 585 return cat 586 } 587 588 func validateFromPunycode(p *Profile, s string) error { 589 if !norm.NFC.IsNormalString(s) { 590 return &labelError{s, "V1"} 591 } 592 for i := 0; i < len(s); { 593 v, sz := trie.lookupString(s[i:]) 594 if sz == 0 { 595 return runeError(utf8.RuneError) 596 } 597 if c := p.simplify(info(v).category()); c != valid && c != deviation { 598 return &labelError{s, "V6"} 599 } 600 i += sz 601 } 602 return nil 603 } 604 605 const ( 606 zwnj = "\u200c" 607 zwj = "\u200d" 608 ) 609 610 type joinState int8 611 612 const ( 613 stateStart joinState = iota 614 stateVirama 615 stateBefore 616 stateBeforeVirama 617 stateAfter 618 stateFAIL 619 ) 620 621 var joinStates = [][numJoinTypes]joinState{ 622 stateStart: { 623 joiningL: stateBefore, 624 joiningD: stateBefore, 625 joinZWNJ: stateFAIL, 626 joinZWJ: stateFAIL, 627 joinVirama: stateVirama, 628 }, 629 stateVirama: { 630 joiningL: stateBefore, 631 joiningD: stateBefore, 632 }, 633 stateBefore: { 634 joiningL: stateBefore, 635 joiningD: stateBefore, 636 joiningT: stateBefore, 637 joinZWNJ: stateAfter, 638 joinZWJ: stateFAIL, 639 joinVirama: stateBeforeVirama, 640 }, 641 stateBeforeVirama: { 642 joiningL: stateBefore, 643 joiningD: stateBefore, 644 joiningT: stateBefore, 645 }, 646 stateAfter: { 647 joiningL: stateFAIL, 648 joiningD: stateBefore, 649 joiningT: stateAfter, 650 joiningR: stateStart, 651 joinZWNJ: stateFAIL, 652 joinZWJ: stateFAIL, 653 joinVirama: stateAfter, // no-op as we can't accept joiners here 654 }, 655 stateFAIL: { 656 
0: stateFAIL, 657 joiningL: stateFAIL, 658 joiningD: stateFAIL, 659 joiningT: stateFAIL, 660 joiningR: stateFAIL, 661 joinZWNJ: stateFAIL, 662 joinZWJ: stateFAIL, 663 joinVirama: stateFAIL, 664 }, 665 } 666 667 // validateLabel validates the criteria from Section 4.1. Item 1, 4, and 6 are 668 // already implicitly satisfied by the overall implementation. 669 func (p *Profile) validateLabel(s string) error { 670 if s == "" { 671 if p.verifyDNSLength { 672 return &labelError{s, "A4"} 673 } 674 return nil 675 } 676 if p.bidirule != nil && !p.bidirule(s) { 677 return &labelError{s, "B"} 678 } 679 if p.checkHyphens { 680 if len(s) > 4 && s[2] == '-' && s[3] == '-' { 681 return &labelError{s, "V2"} 682 } 683 if s[0] == '-' || s[len(s)-1] == '-' { 684 return &labelError{s, "V3"} 685 } 686 } 687 if !p.checkJoiners { 688 return nil 689 } 690 trie := p.trie // p.checkJoiners is only set if trie is set. 691 // TODO: merge the use of this in the trie. 692 v, sz := trie.lookupString(s) 693 x := info(v) 694 if x.isModifier() { 695 return &labelError{s, "V5"} 696 } 697 // Quickly return in the absence of zero-width (non) joiners. 698 if strings.Index(s, zwj) == -1 && strings.Index(s, zwnj) == -1 { 699 return nil 700 } 701 st := stateStart 702 for i := 0; ; { 703 jt := x.joinType() 704 if s[i:i+sz] == zwj { 705 jt = joinZWJ 706 } else if s[i:i+sz] == zwnj { 707 jt = joinZWNJ 708 } 709 st = joinStates[st][jt] 710 if x.isViramaModifier() { 711 st = joinStates[st][joinVirama] 712 } 713 if i += sz; i == len(s) { 714 break 715 } 716 v, sz = trie.lookupString(s[i:]) 717 x = info(v) 718 } 719 if st == stateFAIL || st == stateAfter { 720 return &labelError{s, "C"} 721 } 722 return nil 723 } 724 725 func ascii(s string) bool { 726 for i := 0; i < len(s); i++ { 727 if s[i] >= utf8.RuneSelf { 728 return false 729 } 730 } 731 return true 732 }