github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/net/publicsuffix/gen.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 package main 8 9 // This program generates table.go and table_test.go. 10 // Invoke as: 11 // 12 // go run gen.go -version "xxx" >table.go 13 // go run gen.go -version "xxx" -test >table_test.go 14 // 15 // The version is derived from information found at 16 // https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat 17 // 18 // To fetch a particular git revision, such as 5c70ccd250, pass 19 // -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat" 20 21 import ( 22 "bufio" 23 "bytes" 24 "flag" 25 "fmt" 26 "go/format" 27 "io" 28 "net/http" 29 "os" 30 "regexp" 31 "sort" 32 "strings" 33 34 "golang.org/x/net/idna" 35 ) 36 37 const ( 38 nodesBitsChildren = 9 39 nodesBitsICANN = 1 40 nodesBitsTextOffset = 15 41 nodesBitsTextLength = 6 42 43 childrenBitsWildcard = 1 44 childrenBitsNodeType = 2 45 childrenBitsHi = 14 46 childrenBitsLo = 14 47 ) 48 49 var ( 50 maxChildren int 51 maxTextOffset int 52 maxTextLength int 53 maxHi uint32 54 maxLo uint32 55 ) 56 57 func max(a, b int) int { 58 if a < b { 59 return b 60 } 61 return a 62 } 63 64 func u32max(a, b uint32) uint32 { 65 if a < b { 66 return b 67 } 68 return a 69 } 70 71 const ( 72 nodeTypeNormal = 0 73 nodeTypeException = 1 74 nodeTypeParentOnly = 2 75 numNodeType = 3 76 ) 77 78 func nodeTypeStr(n int) string { 79 switch n { 80 case nodeTypeNormal: 81 return "+" 82 case nodeTypeException: 83 return "!" 84 case nodeTypeParentOnly: 85 return "o" 86 } 87 panic("unreachable") 88 } 89 90 var ( 91 labelEncoding = map[string]uint32{} 92 labelsList = []string{} 93 labelsMap = map[string]bool{} 94 rules = []string{} 95 96 // validSuffix is used to check that the entries in the public suffix list 97 // are in canonical form (after Punycode encoding). Specifically, capital 98 // letters are not allowed. 99 validSuffix = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`) 100 101 crush = flag.Bool("crush", true, "make the generated node text as small as possible") 102 subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging") 103 url = flag.String("url", 104 "https://publicsuffix.org/list/effective_tld_names.dat", 105 "URL of the publicsuffix.org list. If empty, stdin is read instead") 106 v = flag.Bool("v", false, "verbose output (to stderr)") 107 version = flag.String("version", "", "the effective_tld_names.dat version") 108 test = flag.Bool("test", false, "generate table_test.go") 109 ) 110 111 func main() { 112 if err := main1(); err != nil { 113 fmt.Fprintln(os.Stderr, err) 114 os.Exit(1) 115 } 116 } 117 118 func main1() error { 119 flag.Parse() 120 if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 { 121 return fmt.Errorf("not enough bits to encode the nodes table") 122 } 123 if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 { 124 return fmt.Errorf("not enough bits to encode the children table") 125 } 126 if *version == "" { 127 return fmt.Errorf("-version was not specified") 128 } 129 var r io.Reader = os.Stdin 130 if *url != "" { 131 res, err := http.Get(*url) 132 if err != nil { 133 return err 134 } 135 if res.StatusCode != http.StatusOK { 136 return fmt.Errorf("bad GET status for %s: %d", *url, res.Status) 137 } 138 r = res.Body 139 defer res.Body.Close() 140 } 141 142 var root node 143 icann := false 144 buf := new(bytes.Buffer) 145 br := bufio.NewReader(r) 146 for { 147 s, err := br.ReadString('\n') 148 if err != nil { 149 if err == io.EOF { 150 break 151 } 152 return err 153 } 154 s = strings.TrimSpace(s) 155 if strings.Contains(s, "BEGIN ICANN DOMAINS") { 156 icann = true 157 continue 158 } 159 if strings.Contains(s, "END ICANN DOMAINS") { 160 icann = false 161 continue 162 } 163 if s == "" || strings.HasPrefix(s, "//") { 164 continue 165 } 166 s, err = idna.ToASCII(s) 167 if err != nil { 168 return err 169 } 170 if !validSuffix.MatchString(s) { 171 return fmt.Errorf("bad publicsuffix.org list data: %q", s) 172 } 173 174 if *subset { 175 switch { 176 case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"): 177 case s == "ak.us" || strings.HasSuffix(s, ".ak.us"): 178 case s == "ao" || strings.HasSuffix(s, ".ao"): 179 case s == "ar" || strings.HasSuffix(s, ".ar"): 180 case s == "arpa" || strings.HasSuffix(s, ".arpa"): 181 case s == "cy" || strings.HasSuffix(s, ".cy"): 182 case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"): 183 case s == "jp": 184 case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"): 185 case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"): 186 case s == "om" || strings.HasSuffix(s, ".om"): 187 case s == "uk" || strings.HasSuffix(s, ".uk"): 188 case s == "uk.com" || strings.HasSuffix(s, ".uk.com"): 189 case s == "tw" || strings.HasSuffix(s, ".tw"): 190 case s == "zw" || strings.HasSuffix(s, ".zw"): 191 case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"): 192 // xn--p1ai is Russian-Cyrillic "рф". 193 default: 194 continue 195 } 196 } 197 198 rules = append(rules, s) 199 200 nt, wildcard := nodeTypeNormal, false 201 switch { 202 case strings.HasPrefix(s, "*."): 203 s, nt = s[2:], nodeTypeParentOnly 204 wildcard = true 205 case strings.HasPrefix(s, "!"): 206 s, nt = s[1:], nodeTypeException 207 } 208 labels := strings.Split(s, ".") 209 for n, i := &root, len(labels)-1; i >= 0; i-- { 210 label := labels[i] 211 n = n.child(label) 212 if i == 0 { 213 if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly { 214 n.nodeType = nt 215 } 216 n.icann = n.icann && icann 217 n.wildcard = n.wildcard || wildcard 218 } 219 labelsMap[label] = true 220 } 221 } 222 labelsList = make([]string, 0, len(labelsMap)) 223 for label := range labelsMap { 224 labelsList = append(labelsList, label) 225 } 226 sort.Strings(labelsList) 227 228 p := printReal 229 if *test { 230 p = printTest 231 } 232 if err := p(buf, &root); err != nil { 233 return err 234 } 235 236 b, err := format.Source(buf.Bytes()) 237 if err != nil { 238 return err 239 } 240 _, err = os.Stdout.Write(b) 241 return err 242 } 243 244 func printTest(w io.Writer, n *node) error { 245 fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n") 246 fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n") 247 for _, rule := range rules { 248 fmt.Fprintf(w, "%q,\n", rule) 249 } 250 fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n") 251 if err := n.walk(w, printNodeLabel); err != nil { 252 return err 253 } 254 fmt.Fprintf(w, "}\n") 255 return nil 256 } 257 258 func printReal(w io.Writer, n *node) error { 259 const header = `// generated by go run gen.go; DO NOT EDIT 260 261 package publicsuffix 262 263 const version = %q 264 265 const ( 266 nodesBitsChildren = %d 267 nodesBitsICANN = %d 268 nodesBitsTextOffset = %d 269 nodesBitsTextLength = %d 270 271 childrenBitsWildcard = %d 272 childrenBitsNodeType = %d 273 childrenBitsHi = %d 274 childrenBitsLo = %d 275 ) 276 277 const ( 278 nodeTypeNormal = %d 279 nodeTypeException = %d 280 nodeTypeParentOnly = %d 281 ) 282 283 // numTLD is the number of top level domains. 284 const numTLD = %d 285 286 ` 287 fmt.Fprintf(w, header, *version, 288 nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength, 289 childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo, 290 nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children)) 291 292 text := makeText() 293 if text == "" { 294 return fmt.Errorf("internal error: makeText returned no text") 295 } 296 for _, label := range labelsList { 297 offset, length := strings.Index(text, label), len(label) 298 if offset < 0 { 299 return fmt.Errorf("internal error: could not find %q in text %q", label, text) 300 } 301 maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length) 302 if offset >= 1<<nodesBitsTextOffset || length >= 1<<nodesBitsTextLength { 303 return fmt.Errorf("text offset/length is too large: %d/%d", offset, length) 304 } 305 labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length) 306 } 307 fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ") 308 for len(text) > 0 { 309 n, plus := len(text), "" 310 if n > 64 { 311 n, plus = 64, " +" 312 } 313 fmt.Fprintf(w, "%q%s\n", text[:n], plus) 314 text = text[n:] 315 } 316 317 n.walk(w, assignIndexes) 318 319 fmt.Fprintf(w, ` 320 321 // nodes is the list of nodes. Each node is represented as a uint32, which 322 // encodes the node's children, wildcard bit and node type (as an index into 323 // the children array), ICANN bit and text. 324 // 325 // In the //-comment after each node's data, the nodes indexes of the children 326 // are formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The 327 // nodeType is printed as + for normal, ! for exception, and o for parent-only 328 // nodes that have children but don't match a domain label in their own right. 329 // An I denotes an ICANN domain. 330 // 331 // The layout within the uint32, from MSB to LSB, is: 332 // [%2d bits] unused 333 // [%2d bits] children index 334 // [%2d bits] ICANN bit 335 // [%2d bits] text index 336 // [%2d bits] text length 337 var nodes = [...]uint32{ 338 `, 339 32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength, 340 nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength) 341 if err := n.walk(w, printNode); err != nil { 342 return err 343 } 344 fmt.Fprintf(w, `} 345 346 // children is the list of nodes' children, the parent's wildcard bit and the 347 // parent's node type. If a node has no children then their children index 348 // will be in the range [0, 6), depending on the wildcard bit and node type. 349 // 350 // The layout within the uint32, from MSB to LSB, is: 351 // [%2d bits] unused 352 // [%2d bits] wildcard bit 353 // [%2d bits] node type 354 // [%2d bits] high nodes index (exclusive) of children 355 // [%2d bits] low nodes index (inclusive) of children 356 var children=[...]uint32{ 357 `, 358 32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo, 359 childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo) 360 for i, c := range childrenEncoding { 361 s := "---------------" 362 lo := c & (1<<childrenBitsLo - 1) 363 hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1) 364 if lo != hi { 365 s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi) 366 } 367 nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1) 368 wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0 369 fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n", 370 c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType)) 371 } 372 fmt.Fprintf(w, "}\n\n") 373 fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1) 374 fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1) 375 fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1) 376 fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1) 377 fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1) 378 return nil 379 } 380 381 type node struct { 382 label string 383 nodeType int 384 icann bool 385 wildcard bool 386 // nodesIndex and childrenIndex are the index of this node in the nodes 387 // and the index of its children offset/length in the children arrays. 388 nodesIndex, childrenIndex int 389 // firstChild is the index of this node's first child, or zero if this 390 // node has no children. 391 firstChild int 392 // children are the node's children, in strictly increasing node label order. 393 children []*node 394 } 395 396 func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error { 397 if err := f(w, n); err != nil { 398 return err 399 } 400 for _, c := range n.children { 401 if err := c.walk(w, f); err != nil { 402 return err 403 } 404 } 405 return nil 406 } 407 408 // child returns the child of n with the given label. The child is created if 409 // it did not exist beforehand. 410 func (n *node) child(label string) *node { 411 for _, c := range n.children { 412 if c.label == label { 413 return c 414 } 415 } 416 c := &node{ 417 label: label, 418 nodeType: nodeTypeParentOnly, 419 icann: true, 420 } 421 n.children = append(n.children, c) 422 sort.Sort(byLabel(n.children)) 423 return c 424 } 425 426 type byLabel []*node 427 428 func (b byLabel) Len() int { return len(b) } 429 func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 430 func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label } 431 432 var nextNodesIndex int 433 434 // childrenEncoding are the encoded entries in the generated children array. 435 // All these pre-defined entries have no children. 436 var childrenEncoding = []uint32{ 437 0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal. 438 1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException. 439 2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly. 440 4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal. 441 5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException. 442 6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly. 443 } 444 445 var firstCallToAssignIndexes = true 446 447 func assignIndexes(w io.Writer, n *node) error { 448 if len(n.children) != 0 { 449 // Assign nodesIndex. 450 n.firstChild = nextNodesIndex 451 for _, c := range n.children { 452 c.nodesIndex = nextNodesIndex 453 nextNodesIndex++ 454 } 455 456 // The root node's children is implicit. 457 if firstCallToAssignIndexes { 458 firstCallToAssignIndexes = false 459 return nil 460 } 461 462 // Assign childrenIndex. 463 maxChildren = max(maxChildren, len(childrenEncoding)) 464 if len(childrenEncoding) >= 1<<nodesBitsChildren { 465 return fmt.Errorf("children table is too large") 466 } 467 n.childrenIndex = len(childrenEncoding) 468 lo := uint32(n.firstChild) 469 hi := lo + uint32(len(n.children)) 470 maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi) 471 if lo >= 1<<childrenBitsLo || hi >= 1<<childrenBitsHi { 472 return fmt.Errorf("children lo/hi is too large: %d/%d", lo, hi) 473 } 474 enc := hi<<childrenBitsLo | lo 475 enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi) 476 if n.wildcard { 477 enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType) 478 } 479 childrenEncoding = append(childrenEncoding, enc) 480 } else { 481 n.childrenIndex = n.nodeType 482 if n.wildcard { 483 n.childrenIndex += numNodeType 484 } 485 } 486 return nil 487 } 488 489 func printNode(w io.Writer, n *node) error { 490 for _, c := range n.children { 491 s := "---------------" 492 if len(c.children) != 0 { 493 s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children)) 494 } 495 encoding := labelEncoding[c.label] 496 if c.icann { 497 encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset) 498 } 499 encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN) 500 fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n", 501 encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard), 502 nodeTypeStr(c.nodeType), icannStr(c.icann), c.label, 503 ) 504 } 505 return nil 506 } 507 508 func printNodeLabel(w io.Writer, n *node) error { 509 for _, c := range n.children { 510 fmt.Fprintf(w, "%q,\n", c.label) 511 } 512 return nil 513 } 514 515 func icannStr(icann bool) string { 516 if icann { 517 return "I" 518 } 519 return " " 520 } 521 522 func wildcardStr(wildcard bool) string { 523 if wildcard { 524 return "*" 525 } 526 return " " 527 } 528 529 // makeText combines all the strings in labelsList to form one giant string. 530 // If the crush flag is true, then overlapping strings will be merged: "arpa" 531 // and "parliament" could yield "arparliament". 532 func makeText() string { 533 if !*crush { 534 return strings.Join(labelsList, "") 535 } 536 537 beforeLength := 0 538 for _, s := range labelsList { 539 beforeLength += len(s) 540 } 541 542 // Make a copy of labelsList. 543 ss := append(make([]string, 0, len(labelsList)), labelsList...) 544 545 // Remove strings that are substrings of other strings. 546 for changed := true; changed; { 547 changed = false 548 for i, s := range ss { 549 if s == "" { 550 continue 551 } 552 for j, t := range ss { 553 if i != j && t != "" && strings.Contains(s, t) { 554 changed = true 555 ss[j] = "" 556 } 557 } 558 } 559 } 560 561 // Remove the empty strings. 562 sort.Strings(ss) 563 for len(ss) > 0 && ss[0] == "" { 564 ss = ss[1:] 565 } 566 567 // Join strings where one suffix matches another prefix. 568 for { 569 // Find best i, j, k such that ss[i][len-k:] == ss[j][:k], 570 // maximizing overlap length k. 571 besti := -1 572 bestj := -1 573 bestk := 0 574 for i, s := range ss { 575 if s == "" { 576 continue 577 } 578 for j, t := range ss { 579 if i == j { 580 continue 581 } 582 for k := bestk + 1; k <= len(s) && k <= len(t); k++ { 583 if s[len(s)-k:] == t[:k] { 584 besti = i 585 bestj = j 586 bestk = k 587 } 588 } 589 } 590 } 591 if bestk > 0 { 592 if *v { 593 fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d) out of (%4d,%4d): %q and %q\n", 594 bestk, besti, bestj, len(ss), len(ss), ss[besti], ss[bestj]) 595 } 596 ss[besti] += ss[bestj][bestk:] 597 ss[bestj] = "" 598 continue 599 } 600 break 601 } 602 603 text := strings.Join(ss, "") 604 if *v { 605 fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text)) 606 } 607 return text 608 }