github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/net/publicsuffix/gen.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build ignore
     6  
     7  package main
     8  
     9  // This program generates table.go and table_test.go.
    10  // Invoke as:
    11  //
    12  //	go run gen.go -version "xxx"       >table.go
    13  //	go run gen.go -version "xxx" -test >table_test.go
    14  //
    15  // The version is derived from information found at
    16  // https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat
    17  //
    18  // To fetch a particular git revision, such as 5c70ccd250, pass
    19  // -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat"
    20  
    21  import (
    22  	"bufio"
    23  	"bytes"
    24  	"flag"
    25  	"fmt"
    26  	"go/format"
    27  	"io"
    28  	"net/http"
    29  	"os"
    30  	"regexp"
    31  	"sort"
    32  	"strings"
    33  
    34  	"golang.org/x/net/idna"
    35  )
    36  
// Bit widths of the fields packed into each uint32 entry of the generated
// nodes and children tables. main1 refuses to run if either group of widths
// adds up to more than 32 bits, and printReal copies these values into the
// generated file.
const (
	// Fields of a nodes entry.
	nodesBitsChildren   = 9
	nodesBitsICANN      = 1
	nodesBitsTextOffset = 15
	nodesBitsTextLength = 6

	// Fields of a children entry.
	childrenBitsWildcard = 1
	childrenBitsNodeType = 2
	childrenBitsHi       = 14
	childrenBitsLo       = 14
)
    48  
// High-water marks recorded while the tables are built. printReal writes them
// as trailing comments in the generated file, next to the capacity implied by
// the corresponding bit width.
var (
	maxChildren   int
	maxTextOffset int
	maxTextLength int
	maxHi         uint32
	maxLo         uint32
)
    56  
    57  func max(a, b int) int {
    58  	if a < b {
    59  		return b
    60  	}
    61  	return a
    62  }
    63  
    64  func u32max(a, b uint32) uint32 {
    65  	if a < b {
    66  		return b
    67  	}
    68  	return a
    69  }
    70  
    71  const (
    72  	nodeTypeNormal     = 0
    73  	nodeTypeException  = 1
    74  	nodeTypeParentOnly = 2
    75  	numNodeType        = 3
    76  )
    77  
    78  func nodeTypeStr(n int) string {
    79  	switch n {
    80  	case nodeTypeNormal:
    81  		return "+"
    82  	case nodeTypeException:
    83  		return "!"
    84  	case nodeTypeParentOnly:
    85  		return "o"
    86  	}
    87  	panic("unreachable")
    88  }
    89  
// Package-level state shared by the generator, followed by its command-line
// flags.
var (
	// labelEncoding maps a label to its packed text offset and length; it is
	// filled in by printReal and read by printNode.
	labelEncoding = map[string]uint32{}
	// labelsList is the sorted list of distinct labels, rebuilt by main1 from
	// labelsMap once all rules have been read.
	labelsList    = []string{}
	// labelsMap is the set of labels seen in the rules.
	labelsMap     = map[string]bool{}
	// rules holds every accepted rule in input order, after conversion to
	// ASCII; printTest writes it out.
	rules         = []string{}

	// validSuffix is used to check that the entries in the public suffix list
	// are in canonical form (after Punycode encoding). Specifically, capital
	// letters are not allowed.
	validSuffix = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)

	crush  = flag.Bool("crush", true, "make the generated node text as small as possible")
	subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
	url    = flag.String("url",
		"https://publicsuffix.org/list/effective_tld_names.dat",
		"URL of the publicsuffix.org list. If empty, stdin is read instead")
	v       = flag.Bool("v", false, "verbose output (to stderr)")
	version = flag.String("version", "", "the effective_tld_names.dat version")
	test    = flag.Bool("test", false, "generate table_test.go")
)
   110  
   111  func main() {
   112  	if err := main1(); err != nil {
   113  		fmt.Fprintln(os.Stderr, err)
   114  		os.Exit(1)
   115  	}
   116  }
   117  
   118  func main1() error {
   119  	flag.Parse()
   120  	if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 {
   121  		return fmt.Errorf("not enough bits to encode the nodes table")
   122  	}
   123  	if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
   124  		return fmt.Errorf("not enough bits to encode the children table")
   125  	}
   126  	if *version == "" {
   127  		return fmt.Errorf("-version was not specified")
   128  	}
   129  	var r io.Reader = os.Stdin
   130  	if *url != "" {
   131  		res, err := http.Get(*url)
   132  		if err != nil {
   133  			return err
   134  		}
   135  		if res.StatusCode != http.StatusOK {
   136  			return fmt.Errorf("bad GET status for %s: %d", *url, res.Status)
   137  		}
   138  		r = res.Body
   139  		defer res.Body.Close()
   140  	}
   141  
   142  	var root node
   143  	icann := false
   144  	buf := new(bytes.Buffer)
   145  	br := bufio.NewReader(r)
   146  	for {
   147  		s, err := br.ReadString('\n')
   148  		if err != nil {
   149  			if err == io.EOF {
   150  				break
   151  			}
   152  			return err
   153  		}
   154  		s = strings.TrimSpace(s)
   155  		if strings.Contains(s, "BEGIN ICANN DOMAINS") {
   156  			icann = true
   157  			continue
   158  		}
   159  		if strings.Contains(s, "END ICANN DOMAINS") {
   160  			icann = false
   161  			continue
   162  		}
   163  		if s == "" || strings.HasPrefix(s, "//") {
   164  			continue
   165  		}
   166  		s, err = idna.ToASCII(s)
   167  		if err != nil {
   168  			return err
   169  		}
   170  		if !validSuffix.MatchString(s) {
   171  			return fmt.Errorf("bad publicsuffix.org list data: %q", s)
   172  		}
   173  
   174  		if *subset {
   175  			switch {
   176  			case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
   177  			case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
   178  			case s == "ao" || strings.HasSuffix(s, ".ao"):
   179  			case s == "ar" || strings.HasSuffix(s, ".ar"):
   180  			case s == "arpa" || strings.HasSuffix(s, ".arpa"):
   181  			case s == "cy" || strings.HasSuffix(s, ".cy"):
   182  			case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
   183  			case s == "jp":
   184  			case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
   185  			case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
   186  			case s == "om" || strings.HasSuffix(s, ".om"):
   187  			case s == "uk" || strings.HasSuffix(s, ".uk"):
   188  			case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
   189  			case s == "tw" || strings.HasSuffix(s, ".tw"):
   190  			case s == "zw" || strings.HasSuffix(s, ".zw"):
   191  			case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
   192  				// xn--p1ai is Russian-Cyrillic "рф".
   193  			default:
   194  				continue
   195  			}
   196  		}
   197  
   198  		rules = append(rules, s)
   199  
   200  		nt, wildcard := nodeTypeNormal, false
   201  		switch {
   202  		case strings.HasPrefix(s, "*."):
   203  			s, nt = s[2:], nodeTypeParentOnly
   204  			wildcard = true
   205  		case strings.HasPrefix(s, "!"):
   206  			s, nt = s[1:], nodeTypeException
   207  		}
   208  		labels := strings.Split(s, ".")
   209  		for n, i := &root, len(labels)-1; i >= 0; i-- {
   210  			label := labels[i]
   211  			n = n.child(label)
   212  			if i == 0 {
   213  				if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
   214  					n.nodeType = nt
   215  				}
   216  				n.icann = n.icann && icann
   217  				n.wildcard = n.wildcard || wildcard
   218  			}
   219  			labelsMap[label] = true
   220  		}
   221  	}
   222  	labelsList = make([]string, 0, len(labelsMap))
   223  	for label := range labelsMap {
   224  		labelsList = append(labelsList, label)
   225  	}
   226  	sort.Strings(labelsList)
   227  
   228  	p := printReal
   229  	if *test {
   230  		p = printTest
   231  	}
   232  	if err := p(buf, &root); err != nil {
   233  		return err
   234  	}
   235  
   236  	b, err := format.Source(buf.Bytes())
   237  	if err != nil {
   238  		return err
   239  	}
   240  	_, err = os.Stdout.Write(b)
   241  	return err
   242  }
   243  
   244  func printTest(w io.Writer, n *node) error {
   245  	fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
   246  	fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n")
   247  	for _, rule := range rules {
   248  		fmt.Fprintf(w, "%q,\n", rule)
   249  	}
   250  	fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
   251  	if err := n.walk(w, printNodeLabel); err != nil {
   252  		return err
   253  	}
   254  	fmt.Fprintf(w, "}\n")
   255  	return nil
   256  }
   257  
   258  func printReal(w io.Writer, n *node) error {
   259  	const header = `// generated by go run gen.go; DO NOT EDIT
   260  
   261  package publicsuffix
   262  
   263  const version = %q
   264  
   265  const (
   266  	nodesBitsChildren   = %d
   267  	nodesBitsICANN      = %d
   268  	nodesBitsTextOffset = %d
   269  	nodesBitsTextLength = %d
   270  
   271  	childrenBitsWildcard = %d
   272  	childrenBitsNodeType = %d
   273  	childrenBitsHi       = %d
   274  	childrenBitsLo       = %d
   275  )
   276  
   277  const (
   278  	nodeTypeNormal     = %d
   279  	nodeTypeException  = %d
   280  	nodeTypeParentOnly = %d
   281  )
   282  
   283  // numTLD is the number of top level domains.
   284  const numTLD = %d
   285  
   286  `
   287  	fmt.Fprintf(w, header, *version,
   288  		nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
   289  		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
   290  		nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
   291  
   292  	text := makeText()
   293  	if text == "" {
   294  		return fmt.Errorf("internal error: makeText returned no text")
   295  	}
   296  	for _, label := range labelsList {
   297  		offset, length := strings.Index(text, label), len(label)
   298  		if offset < 0 {
   299  			return fmt.Errorf("internal error: could not find %q in text %q", label, text)
   300  		}
   301  		maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
   302  		if offset >= 1<<nodesBitsTextOffset || length >= 1<<nodesBitsTextLength {
   303  			return fmt.Errorf("text offset/length is too large: %d/%d", offset, length)
   304  		}
   305  		labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length)
   306  	}
   307  	fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
   308  	for len(text) > 0 {
   309  		n, plus := len(text), ""
   310  		if n > 64 {
   311  			n, plus = 64, " +"
   312  		}
   313  		fmt.Fprintf(w, "%q%s\n", text[:n], plus)
   314  		text = text[n:]
   315  	}
   316  
   317  	n.walk(w, assignIndexes)
   318  
   319  	fmt.Fprintf(w, `
   320  
   321  // nodes is the list of nodes. Each node is represented as a uint32, which
   322  // encodes the node's children, wildcard bit and node type (as an index into
   323  // the children array), ICANN bit and text.
   324  //
   325  // In the //-comment after each node's data, the nodes indexes of the children
   326  // are formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
   327  // nodeType is printed as + for normal, ! for exception, and o for parent-only
   328  // nodes that have children but don't match a domain label in their own right.
   329  // An I denotes an ICANN domain.
   330  //
   331  // The layout within the uint32, from MSB to LSB, is:
   332  //	[%2d bits] unused
   333  //	[%2d bits] children index
   334  //	[%2d bits] ICANN bit
   335  //	[%2d bits] text index
   336  //	[%2d bits] text length
   337  var nodes = [...]uint32{
   338  `,
   339  		32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
   340  		nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
   341  	if err := n.walk(w, printNode); err != nil {
   342  		return err
   343  	}
   344  	fmt.Fprintf(w, `}
   345  
   346  // children is the list of nodes' children, the parent's wildcard bit and the
   347  // parent's node type. If a node has no children then their children index
   348  // will be in the range [0, 6), depending on the wildcard bit and node type.
   349  //
   350  // The layout within the uint32, from MSB to LSB, is:
   351  //	[%2d bits] unused
   352  //	[%2d bits] wildcard bit
   353  //	[%2d bits] node type
   354  //	[%2d bits] high nodes index (exclusive) of children
   355  //	[%2d bits] low nodes index (inclusive) of children
   356  var children=[...]uint32{
   357  `,
   358  		32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
   359  		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
   360  	for i, c := range childrenEncoding {
   361  		s := "---------------"
   362  		lo := c & (1<<childrenBitsLo - 1)
   363  		hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1)
   364  		if lo != hi {
   365  			s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi)
   366  		}
   367  		nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1)
   368  		wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0
   369  		fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n",
   370  			c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType))
   371  	}
   372  	fmt.Fprintf(w, "}\n\n")
   373  	fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
   374  	fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
   375  	fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
   376  	fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
   377  	fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
   378  	return nil
   379  }
   380  
// node is one label in the tree of public suffix rules built by main1. The
// root node has an empty label and its children are the top level domains.
type node struct {
	// label is the domain label this node stands for.
	label    string
	// nodeType is one of the nodeType* constants; new nodes start out as
	// nodeTypeParentOnly.
	nodeType int
	// icann starts out true and is cleared when a rule ending at this node is
	// read outside the ICANN section of the list.
	icann    bool
	// wildcard is set when a "*." rule names this node as its parent.
	wildcard bool
	// nodesIndex and childrenIndex are the index of this node in the nodes
	// and the index of its children offset/length in the children arrays.
	nodesIndex, childrenIndex int
	// firstChild is the index of this node's first child, or zero if this
	// node has no children.
	firstChild int
	// children are the node's children, in strictly increasing node label order.
	children []*node
}
   395  
   396  func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
   397  	if err := f(w, n); err != nil {
   398  		return err
   399  	}
   400  	for _, c := range n.children {
   401  		if err := c.walk(w, f); err != nil {
   402  			return err
   403  		}
   404  	}
   405  	return nil
   406  }
   407  
   408  // child returns the child of n with the given label. The child is created if
   409  // it did not exist beforehand.
   410  func (n *node) child(label string) *node {
   411  	for _, c := range n.children {
   412  		if c.label == label {
   413  			return c
   414  		}
   415  	}
   416  	c := &node{
   417  		label:    label,
   418  		nodeType: nodeTypeParentOnly,
   419  		icann:    true,
   420  	}
   421  	n.children = append(n.children, c)
   422  	sort.Sort(byLabel(n.children))
   423  	return c
   424  }
   425  
   426  type byLabel []*node
   427  
   428  func (b byLabel) Len() int           { return len(b) }
   429  func (b byLabel) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
   430  func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
   431  
// nextNodesIndex is the next unused index in the generated nodes table;
// assignIndexes advances it once for every child it numbers.
var nextNodesIndex int
   433  
// childrenEncoding are the encoded entries in the generated children array.
// All these pre-defined entries have no children. Their position in the slice
// is the children index that assignIndexes gives to a childless node: the node
// type, plus numNodeType when the wildcard bit is set. assignIndexes appends
// one further entry for each non-root node that has children.
var childrenEncoding = []uint32{
	0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal.
	1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException.
	2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly.
	4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal.
	5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException.
	6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly.
}
   444  
   445  var firstCallToAssignIndexes = true
   446  
   447  func assignIndexes(w io.Writer, n *node) error {
   448  	if len(n.children) != 0 {
   449  		// Assign nodesIndex.
   450  		n.firstChild = nextNodesIndex
   451  		for _, c := range n.children {
   452  			c.nodesIndex = nextNodesIndex
   453  			nextNodesIndex++
   454  		}
   455  
   456  		// The root node's children is implicit.
   457  		if firstCallToAssignIndexes {
   458  			firstCallToAssignIndexes = false
   459  			return nil
   460  		}
   461  
   462  		// Assign childrenIndex.
   463  		maxChildren = max(maxChildren, len(childrenEncoding))
   464  		if len(childrenEncoding) >= 1<<nodesBitsChildren {
   465  			return fmt.Errorf("children table is too large")
   466  		}
   467  		n.childrenIndex = len(childrenEncoding)
   468  		lo := uint32(n.firstChild)
   469  		hi := lo + uint32(len(n.children))
   470  		maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
   471  		if lo >= 1<<childrenBitsLo || hi >= 1<<childrenBitsHi {
   472  			return fmt.Errorf("children lo/hi is too large: %d/%d", lo, hi)
   473  		}
   474  		enc := hi<<childrenBitsLo | lo
   475  		enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
   476  		if n.wildcard {
   477  			enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
   478  		}
   479  		childrenEncoding = append(childrenEncoding, enc)
   480  	} else {
   481  		n.childrenIndex = n.nodeType
   482  		if n.wildcard {
   483  			n.childrenIndex += numNodeType
   484  		}
   485  	}
   486  	return nil
   487  }
   488  
   489  func printNode(w io.Writer, n *node) error {
   490  	for _, c := range n.children {
   491  		s := "---------------"
   492  		if len(c.children) != 0 {
   493  			s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children))
   494  		}
   495  		encoding := labelEncoding[c.label]
   496  		if c.icann {
   497  			encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
   498  		}
   499  		encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
   500  		fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n",
   501  			encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard),
   502  			nodeTypeStr(c.nodeType), icannStr(c.icann), c.label,
   503  		)
   504  	}
   505  	return nil
   506  }
   507  
   508  func printNodeLabel(w io.Writer, n *node) error {
   509  	for _, c := range n.children {
   510  		fmt.Fprintf(w, "%q,\n", c.label)
   511  	}
   512  	return nil
   513  }
   514  
   515  func icannStr(icann bool) string {
   516  	if icann {
   517  		return "I"
   518  	}
   519  	return " "
   520  }
   521  
   522  func wildcardStr(wildcard bool) string {
   523  	if wildcard {
   524  		return "*"
   525  	}
   526  	return " "
   527  }
   528  
   529  // makeText combines all the strings in labelsList to form one giant string.
   530  // If the crush flag is true, then overlapping strings will be merged: "arpa"
   531  // and "parliament" could yield "arparliament".
   532  func makeText() string {
   533  	if !*crush {
   534  		return strings.Join(labelsList, "")
   535  	}
   536  
   537  	beforeLength := 0
   538  	for _, s := range labelsList {
   539  		beforeLength += len(s)
   540  	}
   541  
   542  	// Make a copy of labelsList.
   543  	ss := append(make([]string, 0, len(labelsList)), labelsList...)
   544  
   545  	// Remove strings that are substrings of other strings.
   546  	for changed := true; changed; {
   547  		changed = false
   548  		for i, s := range ss {
   549  			if s == "" {
   550  				continue
   551  			}
   552  			for j, t := range ss {
   553  				if i != j && t != "" && strings.Contains(s, t) {
   554  					changed = true
   555  					ss[j] = ""
   556  				}
   557  			}
   558  		}
   559  	}
   560  
   561  	// Remove the empty strings.
   562  	sort.Strings(ss)
   563  	for len(ss) > 0 && ss[0] == "" {
   564  		ss = ss[1:]
   565  	}
   566  
   567  	// Join strings where one suffix matches another prefix.
   568  	for {
   569  		// Find best i, j, k such that ss[i][len-k:] == ss[j][:k],
   570  		// maximizing overlap length k.
   571  		besti := -1
   572  		bestj := -1
   573  		bestk := 0
   574  		for i, s := range ss {
   575  			if s == "" {
   576  				continue
   577  			}
   578  			for j, t := range ss {
   579  				if i == j {
   580  					continue
   581  				}
   582  				for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
   583  					if s[len(s)-k:] == t[:k] {
   584  						besti = i
   585  						bestj = j
   586  						bestk = k
   587  					}
   588  				}
   589  			}
   590  		}
   591  		if bestk > 0 {
   592  			if *v {
   593  				fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d) out of (%4d,%4d): %q and %q\n",
   594  					bestk, besti, bestj, len(ss), len(ss), ss[besti], ss[bestj])
   595  			}
   596  			ss[besti] += ss[bestj][bestk:]
   597  			ss[bestj] = ""
   598  			continue
   599  		}
   600  		break
   601  	}
   602  
   603  	text := strings.Join(ss, "")
   604  	if *v {
   605  		fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
   606  	}
   607  	return text
   608  }