github.com/go-xe2/third@v1.0.3/golang.org/x/text/internal/export/idna/idna.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:generate go run gen.go gen_trieval.go gen_common.go 6 7 // Package idna implements IDNA2008 using the compatibility processing 8 // defined by UTS (Unicode Technical Standard) #46, which defines a standard to 9 // deal with the transition from IDNA2003. 10 // 11 // IDNA2008 (Internationalized Domain Names for Applications), is defined in RFC 12 // 5890, RFC 5891, RFC 5892, RFC 5893 and RFC 5894. 13 // UTS #46 is defined in http://www.unicode.org/reports/tr46. 14 // See http://unicode.org/cldr/utility/idna.jsp for a visualization of the 15 // differences between these two standards. 16 package idna // import "github.com/go-xe2/third/golang.org/x/text/internal/export/idna" 17 18 import ( 19 "fmt" 20 "strings" 21 "unicode/utf8" 22 23 "github.com/go-xe2/third/golang.org/x/text/secure/bidirule" 24 "github.com/go-xe2/third/golang.org/x/text/unicode/bidi" 25 "github.com/go-xe2/third/golang.org/x/text/unicode/norm" 26 ) 27 28 // NOTE: Unlike common practice in Go APIs, the functions will return a 29 // sanitized domain name in case of errors. Browsers sometimes use a partially 30 // evaluated string as lookup. 31 // TODO: the current error handling is, in my opinion, the least opinionated. 32 // Other strategies are also viable, though: 33 // Option 1) Return an empty string in case of error, but allow the user to 34 // specify explicitly which errors to ignore. 35 // Option 2) Return the partially evaluated string if it is itself a valid 36 // string, otherwise return the empty string in case of error. 37 // Option 3) Option 1 and 2. 38 // Option 4) Always return an empty string for now and implement Option 1 as 39 // needed, and document that the return string may not be empty in case of 40 // error in the future. 
41 // I think Option 1 is best, but it is quite opinionated. 42 43 // ToASCII is a wrapper for Punycode.ToASCII. 44 func ToASCII(s string) (string, error) { 45 return Punycode.process(s, true) 46 } 47 48 // ToUnicode is a wrapper for Punycode.ToUnicode. 49 func ToUnicode(s string) (string, error) { 50 return Punycode.process(s, false) 51 } 52 53 // An Option configures a Profile at creation time. 54 type Option func(*options) 55 56 // Transitional sets a Profile to use the Transitional mapping as defined in UTS 57 // #46. This will cause, for example, "ß" to be mapped to "ss". Using the 58 // transitional mapping provides a compromise between IDNA2003 and IDNA2008 59 // compatibility. It is used by most browsers when resolving domain names. This 60 // option is only meaningful if combined with MapForLookup. 61 func Transitional(transitional bool) Option { 62 return func(o *options) { o.transitional = true } 63 } 64 65 // VerifyDNSLength sets whether a Profile should fail if any of the IDN parts 66 // are longer than allowed by the RFC. 67 func VerifyDNSLength(verify bool) Option { 68 return func(o *options) { o.verifyDNSLength = verify } 69 } 70 71 // RemoveLeadingDots removes leading label separators. Leading runes that map to 72 // dots, such as U+3002 IDEOGRAPHIC FULL STOP, are removed as well. 73 // 74 // This is the behavior suggested by the UTS #46 and is adopted by some 75 // browsers. 76 func RemoveLeadingDots(remove bool) Option { 77 return func(o *options) { o.removeLeadingDots = remove } 78 } 79 80 // ValidateLabels sets whether to check the mandatory label validation criteria 81 // as defined in Section 5.4 of RFC 5891. This includes testing for correct use 82 // of hyphens ('-'), normalization, validity of runes, and the context rules. 83 func ValidateLabels(enable bool) Option { 84 return func(o *options) { 85 // Don't override existing mappings, but set one that at least checks 86 // normalization if it is not set. 
87 if o.mapping == nil && enable { 88 o.mapping = normalize 89 } 90 o.trie = trie 91 o.validateLabels = enable 92 o.fromPuny = validateFromPunycode 93 } 94 } 95 96 // StrictDomainName limits the set of permissible ASCII characters to those 97 // allowed in domain names as defined in RFC 1034 (A-Z, a-z, 0-9 and the 98 // hyphen). This is set by default for MapForLookup and ValidateForRegistration. 99 // 100 // This option is useful, for instance, for browsers that allow characters 101 // outside this range, for example a '_' (U+005F LOW LINE). See 102 // http://www.rfc-editor.org/std/std3.txt for more details This option 103 // corresponds to the UseSTD3ASCIIRules option in UTS #46. 104 func StrictDomainName(use bool) Option { 105 return func(o *options) { 106 o.trie = trie 107 o.useSTD3Rules = use 108 o.fromPuny = validateFromPunycode 109 } 110 } 111 112 // NOTE: the following options pull in tables. The tables should not be linked 113 // in as long as the options are not used. 114 115 // BidiRule enables the Bidi rule as defined in RFC 5893. Any application 116 // that relies on proper validation of labels should include this rule. 117 func BidiRule() Option { 118 return func(o *options) { o.bidirule = bidirule.ValidString } 119 } 120 121 // ValidateForRegistration sets validation options to verify that a given IDN is 122 // properly formatted for registration as defined by Section 4 of RFC 5891. 123 func ValidateForRegistration() Option { 124 return func(o *options) { 125 o.mapping = validateRegistration 126 StrictDomainName(true)(o) 127 ValidateLabels(true)(o) 128 VerifyDNSLength(true)(o) 129 BidiRule()(o) 130 } 131 } 132 133 // MapForLookup sets validation and mapping options such that a given IDN is 134 // transformed for domain name lookup according to the requirements set out in 135 // Section 5 of RFC 5891. The mappings follow the recommendations of RFC 5894, 136 // RFC 5895 and UTS 46. It does not add the Bidi Rule. 
Use the BidiRule option 137 // to add this check. 138 // 139 // The mappings include normalization and mapping case, width and other 140 // compatibility mappings. 141 func MapForLookup() Option { 142 return func(o *options) { 143 o.mapping = validateAndMap 144 StrictDomainName(true)(o) 145 ValidateLabels(true)(o) 146 } 147 } 148 149 type options struct { 150 transitional bool 151 useSTD3Rules bool 152 validateLabels bool 153 verifyDNSLength bool 154 removeLeadingDots bool 155 156 trie *idnaTrie 157 158 // fromPuny calls validation rules when converting A-labels to U-labels. 159 fromPuny func(p *Profile, s string) error 160 161 // mapping implements a validation and mapping step as defined in RFC 5895 162 // or UTS 46, tailored to, for example, domain registration or lookup. 163 mapping func(p *Profile, s string) (mapped string, isBidi bool, err error) 164 165 // bidirule, if specified, checks whether s conforms to the Bidi Rule 166 // defined in RFC 5893. 167 bidirule func(s string) bool 168 } 169 170 // A Profile defines the configuration of an IDNA mapper. 171 type Profile struct { 172 options 173 } 174 175 func apply(o *options, opts []Option) { 176 for _, f := range opts { 177 f(o) 178 } 179 } 180 181 // New creates a new Profile. 182 // 183 // With no options, the returned Profile is the most permissive and equals the 184 // Punycode Profile. Options can be passed to further restrict the Profile. The 185 // MapForLookup and ValidateForRegistration options set a collection of options, 186 // for lookup and registration purposes respectively, which can be tailored by 187 // adding more fine-grained options, where later options override earlier 188 // options. 189 func New(o ...Option) *Profile { 190 p := &Profile{} 191 apply(&p.options, o) 192 return p 193 } 194 195 // ToASCII converts a domain or domain label to its ASCII form. For example, 196 // ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and 197 // ToASCII("golang") is "golang". 
If an error is encountered it will return 198 // an error and a (partially) processed result. 199 func (p *Profile) ToASCII(s string) (string, error) { 200 return p.process(s, true) 201 } 202 203 // ToUnicode converts a domain or domain label to its Unicode form. For example, 204 // ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and 205 // ToUnicode("golang") is "golang". If an error is encountered it will return 206 // an error and a (partially) processed result. 207 func (p *Profile) ToUnicode(s string) (string, error) { 208 pp := *p 209 pp.transitional = false 210 return pp.process(s, false) 211 } 212 213 // String reports a string with a description of the profile for debugging 214 // purposes. The string format may change with different versions. 215 func (p *Profile) String() string { 216 s := "" 217 if p.transitional { 218 s = "Transitional" 219 } else { 220 s = "NonTransitional" 221 } 222 if p.useSTD3Rules { 223 s += ":UseSTD3Rules" 224 } 225 if p.validateLabels { 226 s += ":ValidateLabels" 227 } 228 if p.verifyDNSLength { 229 s += ":VerifyDNSLength" 230 } 231 return s 232 } 233 234 var ( 235 // Punycode is a Profile that does raw punycode processing with a minimum 236 // of validation. 237 Punycode *Profile = punycode 238 239 // Lookup is the recommended profile for looking up domain names, according 240 // to Section 5 of RFC 5891. The exact configuration of this profile may 241 // change over time. 242 Lookup *Profile = lookup 243 244 // Display is the recommended profile for displaying domain names. 245 // The configuration of this profile may change over time. 246 Display *Profile = display 247 248 // Registration is the recommended profile for checking whether a given 249 // IDN is valid for registration, according to Section 4 of RFC 5891. 
250 Registration *Profile = registration 251 252 punycode = &Profile{} 253 lookup = &Profile{options{ 254 transitional: true, 255 useSTD3Rules: true, 256 validateLabels: true, 257 trie: trie, 258 fromPuny: validateFromPunycode, 259 mapping: validateAndMap, 260 bidirule: bidirule.ValidString, 261 }} 262 display = &Profile{options{ 263 useSTD3Rules: true, 264 validateLabels: true, 265 trie: trie, 266 fromPuny: validateFromPunycode, 267 mapping: validateAndMap, 268 bidirule: bidirule.ValidString, 269 }} 270 registration = &Profile{options{ 271 useSTD3Rules: true, 272 validateLabels: true, 273 verifyDNSLength: true, 274 trie: trie, 275 fromPuny: validateFromPunycode, 276 mapping: validateRegistration, 277 bidirule: bidirule.ValidString, 278 }} 279 280 // TODO: profiles 281 // Register: recommended for approving domain names: don't do any mappings 282 // but rather reject on invalid input. Bundle or block deviation characters. 283 ) 284 285 type labelError struct{ label, code_ string } 286 287 func (e labelError) code() string { return e.code_ } 288 func (e labelError) Error() string { 289 return fmt.Sprintf("idna: invalid label %q", e.label) 290 } 291 292 type runeError rune 293 294 func (e runeError) code() string { return "P1" } 295 func (e runeError) Error() string { 296 return fmt.Sprintf("idna: disallowed rune %U", e) 297 } 298 299 // process implements the algorithm described in section 4 of UTS #46, 300 // see http://www.unicode.org/reports/tr46. 301 func (p *Profile) process(s string, toASCII bool) (string, error) { 302 var err error 303 var isBidi bool 304 if p.mapping != nil { 305 s, isBidi, err = p.mapping(p, s) 306 } 307 // Remove leading empty labels. 308 if p.removeLeadingDots { 309 for ; len(s) > 0 && s[0] == '.'; s = s[1:] { 310 } 311 } 312 // TODO: allow for a quick check of the tables data. 313 // It seems like we should only create this error on ToASCII, but the 314 // UTS 46 conformance tests suggests we should always check this. 
315 if err == nil && p.verifyDNSLength && s == "" { 316 err = &labelError{s, "A4"} 317 } 318 labels := labelIter{orig: s} 319 for ; !labels.done(); labels.next() { 320 label := labels.label() 321 if label == "" { 322 // Empty labels are not okay. The label iterator skips the last 323 // label if it is empty. 324 if err == nil && p.verifyDNSLength { 325 err = &labelError{s, "A4"} 326 } 327 continue 328 } 329 if strings.HasPrefix(label, acePrefix) { 330 u, err2 := decode(label[len(acePrefix):]) 331 if err2 != nil { 332 if err == nil { 333 err = err2 334 } 335 // Spec says keep the old label. 336 continue 337 } 338 isBidi = isBidi || bidirule.DirectionString(u) != bidi.LeftToRight 339 labels.set(u) 340 if err == nil && p.validateLabels { 341 err = p.fromPuny(p, u) 342 } 343 if err == nil { 344 // This should be called on NonTransitional, according to the 345 // spec, but that currently does not have any effect. Use the 346 // original profile to preserve options. 347 err = p.validateLabel(u) 348 } 349 } else if err == nil { 350 err = p.validateLabel(label) 351 } 352 } 353 if isBidi && p.bidirule != nil && err == nil { 354 for labels.reset(); !labels.done(); labels.next() { 355 if !p.bidirule(labels.label()) { 356 err = &labelError{s, "B"} 357 break 358 } 359 } 360 } 361 if toASCII { 362 for labels.reset(); !labels.done(); labels.next() { 363 label := labels.label() 364 if !ascii(label) { 365 a, err2 := encode(acePrefix, label) 366 if err == nil { 367 err = err2 368 } 369 label = a 370 labels.set(a) 371 } 372 n := len(label) 373 if p.verifyDNSLength && err == nil && (n == 0 || n > 63) { 374 err = &labelError{label, "A4"} 375 } 376 } 377 } 378 s = labels.result() 379 if toASCII && p.verifyDNSLength && err == nil { 380 // Compute the length of the domain name minus the root label and its dot. 381 n := len(s) 382 if n > 0 && s[n-1] == '.' 
{ 383 n-- 384 } 385 if len(s) < 1 || n > 253 { 386 err = &labelError{s, "A4"} 387 } 388 } 389 return s, err 390 } 391 392 func normalize(p *Profile, s string) (mapped string, isBidi bool, err error) { 393 // TODO: consider first doing a quick check to see if any of these checks 394 // need to be done. This will make it slower in the general case, but 395 // faster in the common case. 396 mapped = norm.NFC.String(s) 397 isBidi = bidirule.DirectionString(mapped) == bidi.RightToLeft 398 return mapped, isBidi, nil 399 } 400 401 func validateRegistration(p *Profile, s string) (idem string, bidi bool, err error) { 402 // TODO: filter need for normalization in loop below. 403 if !norm.NFC.IsNormalString(s) { 404 return s, false, &labelError{s, "V1"} 405 } 406 for i := 0; i < len(s); { 407 v, sz := trie.lookupString(s[i:]) 408 if sz == 0 { 409 return s, bidi, runeError(utf8.RuneError) 410 } 411 bidi = bidi || info(v).isBidi(s[i:]) 412 // Copy bytes not copied so far. 413 switch p.simplify(info(v).category()) { 414 // TODO: handle the NV8 defined in the Unicode idna data set to allow 415 // for strict conformance to IDNA2008. 416 case valid, deviation: 417 case disallowed, mapped, unknown, ignored: 418 r, _ := utf8.DecodeRuneInString(s[i:]) 419 return s, bidi, runeError(r) 420 } 421 i += sz 422 } 423 return s, bidi, nil 424 } 425 426 func (c info) isBidi(s string) bool { 427 if !c.isMapped() { 428 return c&attributesMask == rtl 429 } 430 // TODO: also store bidi info for mapped data. This is possible, but a bit 431 // cumbersome and not for the common case. 432 p, _ := bidi.LookupString(s) 433 switch p.Class() { 434 case bidi.R, bidi.AL, bidi.AN: 435 return true 436 } 437 return false 438 } 439 440 func validateAndMap(p *Profile, s string) (vm string, bidi bool, err error) { 441 var ( 442 b []byte 443 k int 444 ) 445 // combinedInfoBits contains the or-ed bits of all runes. We use this 446 // to derive the mayNeedNorm bit later. 
This may trigger normalization 447 // overeagerly, but it will not do so in the common case. The end result 448 // is another 10% saving on BenchmarkProfile for the common case. 449 var combinedInfoBits info 450 for i := 0; i < len(s); { 451 v, sz := trie.lookupString(s[i:]) 452 if sz == 0 { 453 b = append(b, s[k:i]...) 454 b = append(b, "\ufffd"...) 455 k = len(s) 456 if err == nil { 457 err = runeError(utf8.RuneError) 458 } 459 break 460 } 461 combinedInfoBits |= info(v) 462 bidi = bidi || info(v).isBidi(s[i:]) 463 start := i 464 i += sz 465 // Copy bytes not copied so far. 466 switch p.simplify(info(v).category()) { 467 case valid: 468 continue 469 case disallowed: 470 if err == nil { 471 r, _ := utf8.DecodeRuneInString(s[start:]) 472 err = runeError(r) 473 } 474 continue 475 case mapped, deviation: 476 b = append(b, s[k:start]...) 477 b = info(v).appendMapping(b, s[start:i]) 478 case ignored: 479 b = append(b, s[k:start]...) 480 // drop the rune 481 case unknown: 482 b = append(b, s[k:start]...) 483 b = append(b, "\ufffd"...) 484 } 485 k = i 486 } 487 if k == 0 { 488 // No changes so far. 489 if combinedInfoBits&mayNeedNorm != 0 { 490 s = norm.NFC.String(s) 491 } 492 } else { 493 b = append(b, s[k:]...) 494 if norm.NFC.QuickSpan(b) != len(b) { 495 b = norm.NFC.Bytes(b) 496 } 497 // TODO: the punycode converters require strings as input. 498 s = string(b) 499 } 500 return s, bidi, err 501 } 502 503 // A labelIter allows iterating over domain name labels. 
type labelIter struct {
	orig     string   // the string being iterated over
	slice    []string // labels of orig split on '.', allocated on the first set
	curStart int      // start offset of the current label within orig
	curEnd   int      // end offset of the current label, updated by label
	i        int      // index of the current label
}

// reset moves the iterator back to the first label.
func (l *labelIter) reset() {
	l.curStart, l.curEnd, l.i = 0, 0, 0
}

// done reports whether the iterator has moved past the last label.
func (l *labelIter) done() bool {
	return !(l.curStart < len(l.orig))
}

// result returns the labels joined by dots if any label was replaced with set,
// and the original string otherwise.
func (l *labelIter) result() string {
	if l.slice == nil {
		return l.orig
	}
	return strings.Join(l.slice, ".")
}

// label returns the current label. While no label has been replaced it also
// records where the label ends, which next relies on to advance.
func (l *labelIter) label() string {
	if l.slice != nil {
		return l.slice[l.i]
	}
	if dot := strings.IndexByte(l.orig[l.curStart:], '.'); dot >= 0 {
		l.curEnd = l.curStart + dot
	} else {
		l.curEnd = len(l.orig)
	}
	return l.orig[l.curStart:l.curEnd]
}

// next sets the value to the next label. It skips the last label if it is empty.
func (l *labelIter) next() {
	l.i++
	if l.slice == nil {
		l.curStart = l.curEnd + 1
		if last := len(l.orig) - 1; l.curStart == last && l.orig[last] == '.' {
			l.curStart = len(l.orig)
		}
		return
	}
	count := len(l.slice)
	if l.i >= count || (l.i == count-1 && l.slice[l.i] == "") {
		// Moving curStart to the end is what makes done report true.
		l.curStart = len(l.orig)
	}
}

// set replaces the current label with s, splitting orig into labels on the
// first call.
func (l *labelIter) set(s string) {
	if l.slice == nil {
		l.slice = strings.Split(l.orig, ".")
	}
	l.slice[l.i] = s
}

// acePrefix is the ASCII Compatible Encoding prefix.
564 const acePrefix = "xn--" 565 566 func (p *Profile) simplify(cat category) category { 567 switch cat { 568 case disallowedSTD3Mapped: 569 if p.useSTD3Rules { 570 cat = disallowed 571 } else { 572 cat = mapped 573 } 574 case disallowedSTD3Valid: 575 if p.useSTD3Rules { 576 cat = disallowed 577 } else { 578 cat = valid 579 } 580 case deviation: 581 if !p.transitional { 582 cat = valid 583 } 584 case validNV8, validXV8: 585 // TODO: handle V2008 586 cat = valid 587 } 588 return cat 589 } 590 591 func validateFromPunycode(p *Profile, s string) error { 592 if !norm.NFC.IsNormalString(s) { 593 return &labelError{s, "V1"} 594 } 595 // TODO: detect whether string may have to be normalized in the following 596 // loop. 597 for i := 0; i < len(s); { 598 v, sz := trie.lookupString(s[i:]) 599 if sz == 0 { 600 return runeError(utf8.RuneError) 601 } 602 if c := p.simplify(info(v).category()); c != valid && c != deviation { 603 return &labelError{s, "V6"} 604 } 605 i += sz 606 } 607 return nil 608 } 609 610 const ( 611 zwnj = "\u200c" 612 zwj = "\u200d" 613 ) 614 615 type joinState int8 616 617 const ( 618 stateStart joinState = iota 619 stateVirama 620 stateBefore 621 stateBeforeVirama 622 stateAfter 623 stateFAIL 624 ) 625 626 var joinStates = [][numJoinTypes]joinState{ 627 stateStart: { 628 joiningL: stateBefore, 629 joiningD: stateBefore, 630 joinZWNJ: stateFAIL, 631 joinZWJ: stateFAIL, 632 joinVirama: stateVirama, 633 }, 634 stateVirama: { 635 joiningL: stateBefore, 636 joiningD: stateBefore, 637 }, 638 stateBefore: { 639 joiningL: stateBefore, 640 joiningD: stateBefore, 641 joiningT: stateBefore, 642 joinZWNJ: stateAfter, 643 joinZWJ: stateFAIL, 644 joinVirama: stateBeforeVirama, 645 }, 646 stateBeforeVirama: { 647 joiningL: stateBefore, 648 joiningD: stateBefore, 649 joiningT: stateBefore, 650 }, 651 stateAfter: { 652 joiningL: stateFAIL, 653 joiningD: stateBefore, 654 joiningT: stateAfter, 655 joiningR: stateStart, 656 joinZWNJ: stateFAIL, 657 joinZWJ: stateFAIL, 658 
joinVirama: stateAfter, // no-op as we can't accept joiners here 659 }, 660 stateFAIL: { 661 0: stateFAIL, 662 joiningL: stateFAIL, 663 joiningD: stateFAIL, 664 joiningT: stateFAIL, 665 joiningR: stateFAIL, 666 joinZWNJ: stateFAIL, 667 joinZWJ: stateFAIL, 668 joinVirama: stateFAIL, 669 }, 670 } 671 672 // validateLabel validates the criteria from Section 4.1. Item 1, 4, and 6 are 673 // already implicitly satisfied by the overall implementation. 674 func (p *Profile) validateLabel(s string) (err error) { 675 if s == "" { 676 if p.verifyDNSLength { 677 return &labelError{s, "A4"} 678 } 679 return nil 680 } 681 if !p.validateLabels { 682 return nil 683 } 684 trie := p.trie // p.validateLabels is only set if trie is set. 685 if len(s) > 4 && s[2] == '-' && s[3] == '-' { 686 return &labelError{s, "V2"} 687 } 688 if s[0] == '-' || s[len(s)-1] == '-' { 689 return &labelError{s, "V3"} 690 } 691 // TODO: merge the use of this in the trie. 692 v, sz := trie.lookupString(s) 693 x := info(v) 694 if x.isModifier() { 695 return &labelError{s, "V5"} 696 } 697 // Quickly return in the absence of zero-width (non) joiners. 698 if strings.Index(s, zwj) == -1 && strings.Index(s, zwnj) == -1 { 699 return nil 700 } 701 st := stateStart 702 for i := 0; ; { 703 jt := x.joinType() 704 if s[i:i+sz] == zwj { 705 jt = joinZWJ 706 } else if s[i:i+sz] == zwnj { 707 jt = joinZWNJ 708 } 709 st = joinStates[st][jt] 710 if x.isViramaModifier() { 711 st = joinStates[st][joinVirama] 712 } 713 if i += sz; i == len(s) { 714 break 715 } 716 v, sz = trie.lookupString(s[i:]) 717 x = info(v) 718 } 719 if st == stateFAIL || st == stateAfter { 720 return &labelError{s, "C"} 721 } 722 return nil 723 } 724 725 func ascii(s string) bool { 726 for i := 0; i < len(s); i++ { 727 if s[i] >= utf8.RuneSelf { 728 return false 729 } 730 } 731 return true 732 }