github.com/liquid-dev/text@v0.3.3-liquid/internal/export/idna/idna9.0.0.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build !go1.10 6 //go:generate go run gen.go gen_trieval.go gen_common.go 7 8 // Package idna implements IDNA2008 using the compatibility processing 9 // defined by UTS (Unicode Technical Standard) #46, which defines a standard to 10 // deal with the transition from IDNA2003. 11 // 12 // IDNA2008 (Internationalized Domain Names for Applications), is defined in RFC 13 // 5890, RFC 5891, RFC 5892, RFC 5893 and RFC 5894. 14 // UTS #46 is defined in https://www.unicode.org/reports/tr46. 15 // See https://unicode.org/cldr/utility/idna.jsp for a visualization of the 16 // differences between these two standards. 17 package idna // import "github.com/liquid-dev/text/internal/export/idna" 18 19 import ( 20 "fmt" 21 "strings" 22 "unicode/utf8" 23 24 "github.com/liquid-dev/text/secure/bidirule" 25 "github.com/liquid-dev/text/unicode/norm" 26 ) 27 28 // NOTE: Unlike common practice in Go APIs, the functions will return a 29 // sanitized domain name in case of errors. Browsers sometimes use a partially 30 // evaluated string as lookup. 31 // TODO: the current error handling is, in my opinion, the least opinionated. 32 // Other strategies are also viable, though: 33 // Option 1) Return an empty string in case of error, but allow the user to 34 // specify explicitly which errors to ignore. 35 // Option 2) Return the partially evaluated string if it is itself a valid 36 // string, otherwise return the empty string in case of error. 37 // Option 3) Option 1 and 2. 38 // Option 4) Always return an empty string for now and implement Option 1 as 39 // needed, and document that the return string may not be empty in case of 40 // error in the future. 41 // I think Option 1 is best, but it is quite opinionated. 
42 43 // ToASCII is a wrapper for Punycode.ToASCII. 44 func ToASCII(s string) (string, error) { 45 return Punycode.process(s, true) 46 } 47 48 // ToUnicode is a wrapper for Punycode.ToUnicode. 49 func ToUnicode(s string) (string, error) { 50 return Punycode.process(s, false) 51 } 52 53 // An Option configures a Profile at creation time. 54 type Option func(*options) 55 56 // Transitional sets a Profile to use the Transitional mapping as defined in UTS 57 // #46. This will cause, for example, "ß" to be mapped to "ss". Using the 58 // transitional mapping provides a compromise between IDNA2003 and IDNA2008 59 // compatibility. It is used by most browsers when resolving domain names. This 60 // option is only meaningful if combined with MapForLookup. 61 func Transitional(transitional bool) Option { 62 return func(o *options) { o.transitional = true } 63 } 64 65 // VerifyDNSLength sets whether a Profile should fail if any of the IDN parts 66 // are longer than allowed by the RFC. 67 func VerifyDNSLength(verify bool) Option { 68 return func(o *options) { o.verifyDNSLength = verify } 69 } 70 71 // RemoveLeadingDots removes leading label separators. Leading runes that map to 72 // dots, such as U+3002 IDEOGRAPHIC FULL STOP, are removed as well. 73 // 74 // This is the behavior suggested by the UTS #46 and is adopted by some 75 // browsers. 76 func RemoveLeadingDots(remove bool) Option { 77 return func(o *options) { o.removeLeadingDots = remove } 78 } 79 80 // ValidateLabels sets whether to check the mandatory label validation criteria 81 // as defined in Section 5.4 of RFC 5891. This includes testing for correct use 82 // of hyphens ('-'), normalization, validity of runes, and the context rules. 83 func ValidateLabels(enable bool) Option { 84 return func(o *options) { 85 // Don't override existing mappings, but set one that at least checks 86 // normalization if it is not set. 
87 if o.mapping == nil && enable { 88 o.mapping = normalize 89 } 90 o.trie = trie 91 o.validateLabels = enable 92 o.fromPuny = validateFromPunycode 93 } 94 } 95 96 // StrictDomainName limits the set of permissable ASCII characters to those 97 // allowed in domain names as defined in RFC 1034 (A-Z, a-z, 0-9 and the 98 // hyphen). This is set by default for MapForLookup and ValidateForRegistration. 99 // 100 // This option is useful, for instance, for browsers that allow characters 101 // outside this range, for example a '_' (U+005F LOW LINE). See 102 // http://www.rfc-editor.org/std/std3.txt for more details This option 103 // corresponds to the UseSTD3ASCIIRules option in UTS #46. 104 func StrictDomainName(use bool) Option { 105 return func(o *options) { 106 o.trie = trie 107 o.useSTD3Rules = use 108 o.fromPuny = validateFromPunycode 109 } 110 } 111 112 // NOTE: the following options pull in tables. The tables should not be linked 113 // in as long as the options are not used. 114 115 // BidiRule enables the Bidi rule as defined in RFC 5893. Any application 116 // that relies on proper validation of labels should include this rule. 117 func BidiRule() Option { 118 return func(o *options) { o.bidirule = bidirule.ValidString } 119 } 120 121 // ValidateForRegistration sets validation options to verify that a given IDN is 122 // properly formatted for registration as defined by Section 4 of RFC 5891. 123 func ValidateForRegistration() Option { 124 return func(o *options) { 125 o.mapping = validateRegistration 126 StrictDomainName(true)(o) 127 ValidateLabels(true)(o) 128 VerifyDNSLength(true)(o) 129 BidiRule()(o) 130 } 131 } 132 133 // MapForLookup sets validation and mapping options such that a given IDN is 134 // transformed for domain name lookup according to the requirements set out in 135 // Section 5 of RFC 5891. The mappings follow the recommendations of RFC 5894, 136 // RFC 5895 and UTS 46. It does not add the Bidi Rule. 
Use the BidiRule option 137 // to add this check. 138 // 139 // The mappings include normalization and mapping case, width and other 140 // compatibility mappings. 141 func MapForLookup() Option { 142 return func(o *options) { 143 o.mapping = validateAndMap 144 StrictDomainName(true)(o) 145 ValidateLabels(true)(o) 146 RemoveLeadingDots(true)(o) 147 } 148 } 149 150 type options struct { 151 transitional bool 152 useSTD3Rules bool 153 validateLabels bool 154 verifyDNSLength bool 155 removeLeadingDots bool 156 157 trie *idnaTrie 158 159 // fromPuny calls validation rules when converting A-labels to U-labels. 160 fromPuny func(p *Profile, s string) error 161 162 // mapping implements a validation and mapping step as defined in RFC 5895 163 // or UTS 46, tailored to, for example, domain registration or lookup. 164 mapping func(p *Profile, s string) (string, error) 165 166 // bidirule, if specified, checks whether s conforms to the Bidi Rule 167 // defined in RFC 5893. 168 bidirule func(s string) bool 169 } 170 171 // A Profile defines the configuration of a IDNA mapper. 172 type Profile struct { 173 options 174 } 175 176 func apply(o *options, opts []Option) { 177 for _, f := range opts { 178 f(o) 179 } 180 } 181 182 // New creates a new Profile. 183 // 184 // With no options, the returned Profile is the most permissive and equals the 185 // Punycode Profile. Options can be passed to further restrict the Profile. The 186 // MapForLookup and ValidateForRegistration options set a collection of options, 187 // for lookup and registration purposes respectively, which can be tailored by 188 // adding more fine-grained options, where later options override earlier 189 // options. 190 func New(o ...Option) *Profile { 191 p := &Profile{} 192 apply(&p.options, o) 193 return p 194 } 195 196 // ToASCII converts a domain or domain label to its ASCII form. For example, 197 // ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and 198 // ToASCII("golang") is "golang". 
If an error is encountered it will return 199 // an error and a (partially) processed result. 200 func (p *Profile) ToASCII(s string) (string, error) { 201 return p.process(s, true) 202 } 203 204 // ToUnicode converts a domain or domain label to its Unicode form. For example, 205 // ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and 206 // ToUnicode("golang") is "golang". If an error is encountered it will return 207 // an error and a (partially) processed result. 208 func (p *Profile) ToUnicode(s string) (string, error) { 209 pp := *p 210 pp.transitional = false 211 return pp.process(s, false) 212 } 213 214 // String reports a string with a description of the profile for debugging 215 // purposes. The string format may change with different versions. 216 func (p *Profile) String() string { 217 s := "" 218 if p.transitional { 219 s = "Transitional" 220 } else { 221 s = "NonTransitional" 222 } 223 if p.useSTD3Rules { 224 s += ":UseSTD3Rules" 225 } 226 if p.validateLabels { 227 s += ":ValidateLabels" 228 } 229 if p.verifyDNSLength { 230 s += ":VerifyDNSLength" 231 } 232 return s 233 } 234 235 var ( 236 // Punycode is a Profile that does raw punycode processing with a minimum 237 // of validation. 238 Punycode *Profile = punycode 239 240 // Lookup is the recommended profile for looking up domain names, according 241 // to Section 5 of RFC 5891. The exact configuration of this profile may 242 // change over time. 243 Lookup *Profile = lookup 244 245 // Display is the recommended profile for displaying domain names. 246 // The configuration of this profile may change over time. 247 Display *Profile = display 248 249 // Registration is the recommended profile for checking whether a given 250 // IDN is valid for registration, according to Section 4 of RFC 5891. 
251 Registration *Profile = registration 252 253 punycode = &Profile{} 254 lookup = &Profile{options{ 255 transitional: true, 256 useSTD3Rules: true, 257 validateLabels: true, 258 removeLeadingDots: true, 259 trie: trie, 260 fromPuny: validateFromPunycode, 261 mapping: validateAndMap, 262 bidirule: bidirule.ValidString, 263 }} 264 display = &Profile{options{ 265 useSTD3Rules: true, 266 validateLabels: true, 267 removeLeadingDots: true, 268 trie: trie, 269 fromPuny: validateFromPunycode, 270 mapping: validateAndMap, 271 bidirule: bidirule.ValidString, 272 }} 273 registration = &Profile{options{ 274 useSTD3Rules: true, 275 validateLabels: true, 276 verifyDNSLength: true, 277 trie: trie, 278 fromPuny: validateFromPunycode, 279 mapping: validateRegistration, 280 bidirule: bidirule.ValidString, 281 }} 282 283 // TODO: profiles 284 // Register: recommended for approving domain names: don't do any mappings 285 // but rather reject on invalid input. Bundle or block deviation characters. 286 ) 287 288 type labelError struct{ label, code_ string } 289 290 func (e labelError) code() string { return e.code_ } 291 func (e labelError) Error() string { 292 return fmt.Sprintf("idna: invalid label %q", e.label) 293 } 294 295 type runeError rune 296 297 func (e runeError) code() string { return "P1" } 298 func (e runeError) Error() string { 299 return fmt.Sprintf("idna: disallowed rune %U", e) 300 } 301 302 // process implements the algorithm described in section 4 of UTS #46, 303 // see https://www.unicode.org/reports/tr46. 304 func (p *Profile) process(s string, toASCII bool) (string, error) { 305 var err error 306 if p.mapping != nil { 307 s, err = p.mapping(p, s) 308 } 309 // Remove leading empty labels. 310 if p.removeLeadingDots { 311 for ; len(s) > 0 && s[0] == '.'; s = s[1:] { 312 } 313 } 314 // It seems like we should only create this error on ToASCII, but the 315 // UTS 46 conformance tests suggests we should always check this. 
316 if err == nil && p.verifyDNSLength && s == "" { 317 err = &labelError{s, "A4"} 318 } 319 labels := labelIter{orig: s} 320 for ; !labels.done(); labels.next() { 321 label := labels.label() 322 if label == "" { 323 // Empty labels are not okay. The label iterator skips the last 324 // label if it is empty. 325 if err == nil && p.verifyDNSLength { 326 err = &labelError{s, "A4"} 327 } 328 continue 329 } 330 if strings.HasPrefix(label, acePrefix) { 331 u, err2 := decode(label[len(acePrefix):]) 332 if err2 != nil { 333 if err == nil { 334 err = err2 335 } 336 // Spec says keep the old label. 337 continue 338 } 339 labels.set(u) 340 if err == nil && p.validateLabels { 341 err = p.fromPuny(p, u) 342 } 343 if err == nil { 344 // This should be called on NonTransitional, according to the 345 // spec, but that currently does not have any effect. Use the 346 // original profile to preserve options. 347 err = p.validateLabel(u) 348 } 349 } else if err == nil { 350 err = p.validateLabel(label) 351 } 352 } 353 if toASCII { 354 for labels.reset(); !labels.done(); labels.next() { 355 label := labels.label() 356 if !ascii(label) { 357 a, err2 := encode(acePrefix, label) 358 if err == nil { 359 err = err2 360 } 361 label = a 362 labels.set(a) 363 } 364 n := len(label) 365 if p.verifyDNSLength && err == nil && (n == 0 || n > 63) { 366 err = &labelError{label, "A4"} 367 } 368 } 369 } 370 s = labels.result() 371 if toASCII && p.verifyDNSLength && err == nil { 372 // Compute the length of the domain name minus the root label and its dot. 373 n := len(s) 374 if n > 0 && s[n-1] == '.' 
{ 375 n-- 376 } 377 if len(s) < 1 || n > 253 { 378 err = &labelError{s, "A4"} 379 } 380 } 381 return s, err 382 } 383 384 func normalize(p *Profile, s string) (string, error) { 385 return norm.NFC.String(s), nil 386 } 387 388 func validateRegistration(p *Profile, s string) (string, error) { 389 if !norm.NFC.IsNormalString(s) { 390 return s, &labelError{s, "V1"} 391 } 392 for i := 0; i < len(s); { 393 v, sz := trie.lookupString(s[i:]) 394 // Copy bytes not copied so far. 395 switch p.simplify(info(v).category()) { 396 // TODO: handle the NV8 defined in the Unicode idna data set to allow 397 // for strict conformance to IDNA2008. 398 case valid, deviation: 399 case disallowed, mapped, unknown, ignored: 400 r, _ := utf8.DecodeRuneInString(s[i:]) 401 return s, runeError(r) 402 } 403 i += sz 404 } 405 return s, nil 406 } 407 408 func validateAndMap(p *Profile, s string) (string, error) { 409 var ( 410 err error 411 b []byte 412 k int 413 ) 414 for i := 0; i < len(s); { 415 v, sz := trie.lookupString(s[i:]) 416 start := i 417 i += sz 418 // Copy bytes not copied so far. 419 switch p.simplify(info(v).category()) { 420 case valid: 421 continue 422 case disallowed: 423 if err == nil { 424 r, _ := utf8.DecodeRuneInString(s[start:]) 425 err = runeError(r) 426 } 427 continue 428 case mapped, deviation: 429 b = append(b, s[k:start]...) 430 b = info(v).appendMapping(b, s[start:i]) 431 case ignored: 432 b = append(b, s[k:start]...) 433 // drop the rune 434 case unknown: 435 b = append(b, s[k:start]...) 436 b = append(b, "\ufffd"...) 437 } 438 k = i 439 } 440 if k == 0 { 441 // No changes so far. 442 s = norm.NFC.String(s) 443 } else { 444 b = append(b, s[k:]...) 445 if norm.NFC.QuickSpan(b) != len(b) { 446 b = norm.NFC.Bytes(b) 447 } 448 // TODO: the punycode converters require strings as input. 449 s = string(b) 450 } 451 return s, err 452 } 453 454 // A labelIter allows iterating over domain name labels. 
type labelIter struct {
	// orig is the domain name being iterated over.
	orig string
	// slice holds the individual labels once any label has been replaced via
	// set; while it is nil, labels are read directly out of orig.
	slice []string
	// curStart and curEnd are the byte offsets in orig of the current label
	// while slice is nil. curStart is also set to len(orig) to mark the end of
	// the iteration in both modes.
	curStart int
	curEnd   int
	// i is the index of the current label.
	i int
}

// reset rewinds the iterator to the first label. Labels replaced through set
// are kept.
func (l *labelIter) reset() {
	l.curStart = 0
	l.curEnd = 0
	l.i = 0
}

// done reports whether the iteration has moved past the last label.
func (l *labelIter) done() bool {
	return l.curStart >= len(l.orig)
}

// result returns the domain name with all replaced labels applied. If no label
// was replaced, the original string is returned as is.
func (l *labelIter) result() string {
	if l.slice == nil {
		return l.orig
	}
	return strings.Join(l.slice, ".")
}

// label returns the current label. While no label has been replaced it also
// records the end offset of the label, which next uses to advance.
func (l *labelIter) label() string {
	if l.slice != nil {
		return l.slice[l.i]
	}
	rest := l.orig[l.curStart:]
	if dot := strings.IndexByte(rest, '.'); dot >= 0 {
		l.curEnd = l.curStart + dot
	} else {
		// No further separator: the label runs to the end of the input.
		l.curEnd = len(l.orig)
	}
	return l.orig[l.curStart:l.curEnd]
}

// next sets the value to the next label. It skips the last label if it is empty.
func (l *labelIter) next() {
	l.i++
	if l.slice != nil {
		last := len(l.slice) - 1
		if l.i > last || (l.i == last && l.slice[l.i] == "") {
			l.curStart = len(l.orig)
		}
		return
	}
	// Continue just after the separator that ended the current label.
	l.curStart = l.curEnd + 1
	if l.curStart == len(l.orig)-1 && l.orig[l.curStart] == '.' {
		// Only a final '.' is left; end the iteration.
		l.curStart = len(l.orig)
	}
}

// set replaces the current label with s. The first call splits orig into its
// labels; from then on the iterator works on that slice.
func (l *labelIter) set(s string) {
	if l.slice == nil {
		l.slice = strings.Split(l.orig, ".")
	}
	l.slice[l.i] = s
}

// acePrefix is the ASCII Compatible Encoding prefix.
515 const acePrefix = "xn--" 516 517 func (p *Profile) simplify(cat category) category { 518 switch cat { 519 case disallowedSTD3Mapped: 520 if p.useSTD3Rules { 521 cat = disallowed 522 } else { 523 cat = mapped 524 } 525 case disallowedSTD3Valid: 526 if p.useSTD3Rules { 527 cat = disallowed 528 } else { 529 cat = valid 530 } 531 case deviation: 532 if !p.transitional { 533 cat = valid 534 } 535 case validNV8, validXV8: 536 // TODO: handle V2008 537 cat = valid 538 } 539 return cat 540 } 541 542 func validateFromPunycode(p *Profile, s string) error { 543 if !norm.NFC.IsNormalString(s) { 544 return &labelError{s, "V1"} 545 } 546 for i := 0; i < len(s); { 547 v, sz := trie.lookupString(s[i:]) 548 if c := p.simplify(info(v).category()); c != valid && c != deviation { 549 return &labelError{s, "V6"} 550 } 551 i += sz 552 } 553 return nil 554 } 555 556 const ( 557 zwnj = "\u200c" 558 zwj = "\u200d" 559 ) 560 561 type joinState int8 562 563 const ( 564 stateStart joinState = iota 565 stateVirama 566 stateBefore 567 stateBeforeVirama 568 stateAfter 569 stateFAIL 570 ) 571 572 var joinStates = [][numJoinTypes]joinState{ 573 stateStart: { 574 joiningL: stateBefore, 575 joiningD: stateBefore, 576 joinZWNJ: stateFAIL, 577 joinZWJ: stateFAIL, 578 joinVirama: stateVirama, 579 }, 580 stateVirama: { 581 joiningL: stateBefore, 582 joiningD: stateBefore, 583 }, 584 stateBefore: { 585 joiningL: stateBefore, 586 joiningD: stateBefore, 587 joiningT: stateBefore, 588 joinZWNJ: stateAfter, 589 joinZWJ: stateFAIL, 590 joinVirama: stateBeforeVirama, 591 }, 592 stateBeforeVirama: { 593 joiningL: stateBefore, 594 joiningD: stateBefore, 595 joiningT: stateBefore, 596 }, 597 stateAfter: { 598 joiningL: stateFAIL, 599 joiningD: stateBefore, 600 joiningT: stateAfter, 601 joiningR: stateStart, 602 joinZWNJ: stateFAIL, 603 joinZWJ: stateFAIL, 604 joinVirama: stateAfter, // no-op as we can't accept joiners here 605 }, 606 stateFAIL: { 607 0: stateFAIL, 608 joiningL: stateFAIL, 609 joiningD: 
stateFAIL, 610 joiningT: stateFAIL, 611 joiningR: stateFAIL, 612 joinZWNJ: stateFAIL, 613 joinZWJ: stateFAIL, 614 joinVirama: stateFAIL, 615 }, 616 } 617 618 // validateLabel validates the criteria from Section 4.1. Item 1, 4, and 6 are 619 // already implicitly satisfied by the overall implementation. 620 func (p *Profile) validateLabel(s string) error { 621 if s == "" { 622 if p.verifyDNSLength { 623 return &labelError{s, "A4"} 624 } 625 return nil 626 } 627 if p.bidirule != nil && !p.bidirule(s) { 628 return &labelError{s, "B"} 629 } 630 if !p.validateLabels { 631 return nil 632 } 633 trie := p.trie // p.validateLabels is only set if trie is set. 634 if len(s) > 4 && s[2] == '-' && s[3] == '-' { 635 return &labelError{s, "V2"} 636 } 637 if s[0] == '-' || s[len(s)-1] == '-' { 638 return &labelError{s, "V3"} 639 } 640 // TODO: merge the use of this in the trie. 641 v, sz := trie.lookupString(s) 642 x := info(v) 643 if x.isModifier() { 644 return &labelError{s, "V5"} 645 } 646 // Quickly return in the absence of zero-width (non) joiners. 647 if strings.Index(s, zwj) == -1 && strings.Index(s, zwnj) == -1 { 648 return nil 649 } 650 st := stateStart 651 for i := 0; ; { 652 jt := x.joinType() 653 if s[i:i+sz] == zwj { 654 jt = joinZWJ 655 } else if s[i:i+sz] == zwnj { 656 jt = joinZWNJ 657 } 658 st = joinStates[st][jt] 659 if x.isViramaModifier() { 660 st = joinStates[st][joinVirama] 661 } 662 if i += sz; i == len(s) { 663 break 664 } 665 v, sz = trie.lookupString(s[i:]) 666 x = info(v) 667 } 668 if st == stateFAIL || st == stateAfter { 669 return &labelError{s, "C"} 670 } 671 return nil 672 } 673 674 func ascii(s string) bool { 675 for i := 0; i < len(s); i++ { 676 if s[i] >= utf8.RuneSelf { 677 return false 678 } 679 } 680 return true 681 }