github.com/lianghucheng/zrddz@v0.0.0-20200923083010-c71f680932e2/src/golang.org/x/net/publicsuffix/gen.go

github.com/lianghucheng/zrddz@v0.0.0-20200923083010-c71f680932e2/src/golang.org/x/net/publicsuffix/gen.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build ignore
     6  
     7  package main
     8  
     9  // This program generates table.go and table_test.go based on the authoritative
    10  // public suffix list at https://publicsuffix.org/list/effective_tld_names.dat
    11  //
    12  // The version is derived from
    13  // https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat
    14  // and a human-readable form is at
    15  // https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat
    16  //
    17  // To fetch a particular git revision, such as 5c70ccd250, pass
    18  // -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat"
    19  // and -version "an explicit version string".
    20  
    21  import (
    22  	"bufio"
    23  	"bytes"
    24  	"flag"
    25  	"fmt"
    26  	"go/format"
    27  	"io"
    28  	"io/ioutil"
    29  	"net/http"
    30  	"os"
    31  	"regexp"
    32  	"sort"
    33  	"strings"
    34  
    35  	"golang.org/x/net/idna"
    36  )
    37  
const (
	// The sum of these four values must be no greater than 32, because each
	// entry of the generated nodes table is packed into a single uint32.
	// main1 checks this before doing any work.
	nodesBitsChildren   = 10
	nodesBitsICANN      = 1
	nodesBitsTextOffset = 15
	nodesBitsTextLength = 6

	// The sum of these four values must be no greater than 32, because each
	// entry of the generated children table is packed into a single uint32.
	// main1 checks this before doing any work.
	childrenBitsWildcard = 1
	childrenBitsNodeType = 2
	childrenBitsHi       = 14
	childrenBitsLo       = 14
)
    51  
// These record the largest values seen while encoding the tables. printReal
// writes them, next to the capacity of the matching bit field, as trailing
// comments in the generated table.go.
var (
	maxChildren   int    // largest children-table index handed out by assignIndexes
	maxTextOffset int    // largest label offset into the combined text
	maxTextLength int    // longest label, in bytes
	maxHi         uint32 // largest exclusive upper nodes index of any children range
	maxLo         uint32 // largest inclusive lower nodes index of any children range
)
    59  
    60  func max(a, b int) int {
    61  	if a < b {
    62  		return b
    63  	}
    64  	return a
    65  }
    66  
    67  func u32max(a, b uint32) uint32 {
    68  	if a < b {
    69  		return b
    70  	}
    71  	return a
    72  }
    73  
    74  const (
    75  	nodeTypeNormal     = 0
    76  	nodeTypeException  = 1
    77  	nodeTypeParentOnly = 2
    78  	numNodeType        = 3
    79  )
    80  
    81  func nodeTypeStr(n int) string {
    82  	switch n {
    83  	case nodeTypeNormal:
    84  		return "+"
    85  	case nodeTypeException:
    86  		return "!"
    87  	case nodeTypeParentOnly:
    88  		return "o"
    89  	}
    90  	panic("unreachable")
    91  }
    92  
const (
	// defaultURL is the default value of the -url flag: the location the
	// public suffix list is downloaded from.
	defaultURL = "https://publicsuffix.org/list/effective_tld_names.dat"
	// gitCommitURL is the GitHub API endpoint that gitCommit queries to find
	// the SHA and date used in the default version string.
	gitCommitURL = "https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat"
)
    97  
var (
	// labelEncoding maps each label to its packed text offset and length,
	// as computed by printReal.
	labelEncoding = map[string]uint32{}
	// labelsList is the sorted list of every distinct label; main1 builds it
	// from labelsMap once all rules have been read.
	labelsList = []string{}
	// labelsMap is the set of every label seen in any rule.
	labelsMap = map[string]bool{}
	// rules holds every accepted rule, in input order and after Punycode
	// encoding.
	rules = []string{}
	// numICANNRules is len(rules) at the point the "END ICANN DOMAINS"
	// marker was read.
	numICANNRules = 0

	// validSuffixRE is used to check that the entries in the public suffix
	// list are in canonical form (after Punycode encoding). Specifically,
	// capital letters are not allowed.
	validSuffixRE = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)

	// shaRE and dateRE pull the first commit SHA and committer date out of
	// the response body fetched by gitCommit.
	shaRE  = regexp.MustCompile(`"sha":"([^"]+)"`)
	dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`)

	// Command-line flags.
	comments = flag.Bool("comments", false, "generate table.go comments, for debugging")
	subset   = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
	url      = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead")
	v        = flag.Bool("v", false, "verbose output (to stderr)")
	version  = flag.String("version", "", "the effective_tld_names.dat version")
)
   119  
   120  func main() {
   121  	if err := main1(); err != nil {
   122  		fmt.Fprintln(os.Stderr, err)
   123  		os.Exit(1)
   124  	}
   125  }
   126  
   127  func main1() error {
   128  	flag.Parse()
   129  	if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 {
   130  		return fmt.Errorf("not enough bits to encode the nodes table")
   131  	}
   132  	if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
   133  		return fmt.Errorf("not enough bits to encode the children table")
   134  	}
   135  	if *version == "" {
   136  		if *url != defaultURL {
   137  			return fmt.Errorf("-version was not specified, and the -url is not the default one")
   138  		}
   139  		sha, date, err := gitCommit()
   140  		if err != nil {
   141  			return err
   142  		}
   143  		*version = fmt.Sprintf("publicsuffix.org's public_suffix_list.dat, git revision %s (%s)", sha, date)
   144  	}
   145  	var r io.Reader = os.Stdin
   146  	if *url != "" {
   147  		res, err := http.Get(*url)
   148  		if err != nil {
   149  			return err
   150  		}
   151  		if res.StatusCode != http.StatusOK {
   152  			return fmt.Errorf("bad GET status for %s: %d", *url, res.Status)
   153  		}
   154  		r = res.Body
   155  		defer res.Body.Close()
   156  	}
   157  
   158  	var root node
   159  	icann := false
   160  	br := bufio.NewReader(r)
   161  	for {
   162  		s, err := br.ReadString('\n')
   163  		if err != nil {
   164  			if err == io.EOF {
   165  				break
   166  			}
   167  			return err
   168  		}
   169  		s = strings.TrimSpace(s)
   170  		if strings.Contains(s, "BEGIN ICANN DOMAINS") {
   171  			if len(rules) != 0 {
   172  				return fmt.Errorf(`expected no rules before "BEGIN ICANN DOMAINS"`)
   173  			}
   174  			icann = true
   175  			continue
   176  		}
   177  		if strings.Contains(s, "END ICANN DOMAINS") {
   178  			icann, numICANNRules = false, len(rules)
   179  			continue
   180  		}
   181  		if s == "" || strings.HasPrefix(s, "//") {
   182  			continue
   183  		}
   184  		s, err = idna.ToASCII(s)
   185  		if err != nil {
   186  			return err
   187  		}
   188  		if !validSuffixRE.MatchString(s) {
   189  			return fmt.Errorf("bad publicsuffix.org list data: %q", s)
   190  		}
   191  
   192  		if *subset {
   193  			switch {
   194  			case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
   195  			case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
   196  			case s == "ao" || strings.HasSuffix(s, ".ao"):
   197  			case s == "ar" || strings.HasSuffix(s, ".ar"):
   198  			case s == "arpa" || strings.HasSuffix(s, ".arpa"):
   199  			case s == "cy" || strings.HasSuffix(s, ".cy"):
   200  			case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
   201  			case s == "jp":
   202  			case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
   203  			case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
   204  			case s == "om" || strings.HasSuffix(s, ".om"):
   205  			case s == "uk" || strings.HasSuffix(s, ".uk"):
   206  			case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
   207  			case s == "tw" || strings.HasSuffix(s, ".tw"):
   208  			case s == "zw" || strings.HasSuffix(s, ".zw"):
   209  			case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
   210  				// xn--p1ai is Russian-Cyrillic "рф".
   211  			default:
   212  				continue
   213  			}
   214  		}
   215  
   216  		rules = append(rules, s)
   217  
   218  		nt, wildcard := nodeTypeNormal, false
   219  		switch {
   220  		case strings.HasPrefix(s, "*."):
   221  			s, nt = s[2:], nodeTypeParentOnly
   222  			wildcard = true
   223  		case strings.HasPrefix(s, "!"):
   224  			s, nt = s[1:], nodeTypeException
   225  		}
   226  		labels := strings.Split(s, ".")
   227  		for n, i := &root, len(labels)-1; i >= 0; i-- {
   228  			label := labels[i]
   229  			n = n.child(label)
   230  			if i == 0 {
   231  				if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
   232  					n.nodeType = nt
   233  				}
   234  				n.icann = n.icann && icann
   235  				n.wildcard = n.wildcard || wildcard
   236  			}
   237  			labelsMap[label] = true
   238  		}
   239  	}
   240  	labelsList = make([]string, 0, len(labelsMap))
   241  	for label := range labelsMap {
   242  		labelsList = append(labelsList, label)
   243  	}
   244  	sort.Strings(labelsList)
   245  
   246  	if err := generate(printReal, &root, "table.go"); err != nil {
   247  		return err
   248  	}
   249  	if err := generate(printTest, &root, "table_test.go"); err != nil {
   250  		return err
   251  	}
   252  	return nil
   253  }
   254  
   255  func generate(p func(io.Writer, *node) error, root *node, filename string) error {
   256  	buf := new(bytes.Buffer)
   257  	if err := p(buf, root); err != nil {
   258  		return err
   259  	}
   260  	b, err := format.Source(buf.Bytes())
   261  	if err != nil {
   262  		return err
   263  	}
   264  	return ioutil.WriteFile(filename, b, 0644)
   265  }
   266  
   267  func gitCommit() (sha, date string, retErr error) {
   268  	res, err := http.Get(gitCommitURL)
   269  	if err != nil {
   270  		return "", "", err
   271  	}
   272  	if res.StatusCode != http.StatusOK {
   273  		return "", "", fmt.Errorf("bad GET status for %s: %d", gitCommitURL, res.Status)
   274  	}
   275  	defer res.Body.Close()
   276  	b, err := ioutil.ReadAll(res.Body)
   277  	if err != nil {
   278  		return "", "", err
   279  	}
   280  	if m := shaRE.FindSubmatch(b); m != nil {
   281  		sha = string(m[1])
   282  	}
   283  	if m := dateRE.FindSubmatch(b); m != nil {
   284  		date = string(m[1])
   285  	}
   286  	if sha == "" || date == "" {
   287  		retErr = fmt.Errorf("could not find commit SHA and date in %s", gitCommitURL)
   288  	}
   289  	return sha, date, retErr
   290  }
   291  
   292  func printTest(w io.Writer, n *node) error {
   293  	fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
   294  	fmt.Fprintf(w, "package publicsuffix\n\nconst numICANNRules = %d\n\nvar rules = [...]string{\n", numICANNRules)
   295  	for _, rule := range rules {
   296  		fmt.Fprintf(w, "%q,\n", rule)
   297  	}
   298  	fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
   299  	if err := n.walk(w, printNodeLabel); err != nil {
   300  		return err
   301  	}
   302  	fmt.Fprintf(w, "}\n")
   303  	return nil
   304  }
   305  
// printReal writes the source of table.go to w: the version string, the
// bit-width and node-type constants, the number of top level domains, the
// combined label text, and the nodes and children tables for the tree rooted
// at n.
//
// As a side effect it fills labelEncoding and the max* statistics and, by
// walking the tree with assignIndexes, assigns every node its indexes and
// extends childrenEncoding.
//
// It returns an error if the combined text is empty, if a label cannot be
// found in it, if an offset or length does not fit in its bit field, or if
// one of the tree walks fails.
func printReal(w io.Writer, n *node) error {
	const header = `// generated by go run gen.go; DO NOT EDIT

package publicsuffix

const version = %q

const (
	nodesBitsChildren   = %d
	nodesBitsICANN      = %d
	nodesBitsTextOffset = %d
	nodesBitsTextLength = %d

	childrenBitsWildcard = %d
	childrenBitsNodeType = %d
	childrenBitsHi       = %d
	childrenBitsLo       = %d
)

const (
	nodeTypeNormal     = %d
	nodeTypeException  = %d
	nodeTypeParentOnly = %d
)

// numTLD is the number of top level domains.
const numTLD = %d

`
	fmt.Fprintf(w, header, *version,
		nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
		nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))

	// Pack every label into one shared string, then record for each label
	// where it sits in that string.
	text := combineText(labelsList)
	if text == "" {
		return fmt.Errorf("internal error: makeText returned no text")
	}
	for _, label := range labelsList {
		offset, length := strings.Index(text, label), len(label)
		if offset < 0 {
			return fmt.Errorf("internal error: could not find %q in text %q", label, text)
		}
		maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
		if offset >= 1<<nodesBitsTextOffset {
			return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
		}
		if length >= 1<<nodesBitsTextLength {
			return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
		}
		// The length occupies the low nodesBitsTextLength bits and the
		// offset sits directly above it.
		labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length)
	}
	// Emit text as a concatenation of string literals of at most 64 bytes
	// each, one per line.
	fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
	for len(text) > 0 {
		n, plus := len(text), ""
		if n > 64 {
			n, plus = 64, " +"
		}
		fmt.Fprintf(w, "%q%s\n", text[:n], plus)
		text = text[n:]
	}

	// Number the nodes and fill childrenEncoding before either table is
	// printed; printNode relies on the indexes assigned here.
	if err := n.walk(w, assignIndexes); err != nil {
		return err
	}

	fmt.Fprintf(w, `

// nodes is the list of nodes. Each node is represented as a uint32, which
// encodes the node's children, wildcard bit and node type (as an index into
// the children array), ICANN bit and text.
//
// If the table was generated with the -comments flag, there is a //-comment
// after each node's data. In it is the nodes-array indexes of the children,
// formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
// nodeType is printed as + for normal, ! for exception, and o for parent-only
// nodes that have children but don't match a domain label in their own right.
// An I denotes an ICANN domain.
//
// The layout within the uint32, from MSB to LSB, is:
//	[%2d bits] unused
//	[%2d bits] children index
//	[%2d bits] ICANN bit
//	[%2d bits] text index
//	[%2d bits] text length
var nodes = [...]uint32{
`,
		32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
		nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
	if err := n.walk(w, printNode); err != nil {
		return err
	}
	fmt.Fprintf(w, `}

// children is the list of nodes' children, the parent's wildcard bit and the
// parent's node type. If a node has no children then their children index
// will be in the range [0, 6), depending on the wildcard bit and node type.
//
// The layout within the uint32, from MSB to LSB, is:
//	[%2d bits] unused
//	[%2d bits] wildcard bit
//	[%2d bits] node type
//	[%2d bits] high nodes index (exclusive) of children
//	[%2d bits] low nodes index (inclusive) of children
var children=[...]uint32{
`,
		32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
	for i, c := range childrenEncoding {
		// Unpack each entry again so that the -comments output can describe
		// its range, node type and wildcard bit.
		s := "---------------"
		lo := c & (1<<childrenBitsLo - 1)
		hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1)
		if lo != hi {
			s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi)
		}
		nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1)
		wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0
		if *comments {
			fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n",
				c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType))
		} else {
			fmt.Fprintf(w, "0x%x,\n", c)
		}
	}
	fmt.Fprintf(w, "}\n\n")
	// Trailing statistics: how much of each bit field's capacity was used.
	fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
	fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
	fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
	fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
	fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
	return nil
}
   438  
// node is one label in the tree of public suffix rules. The root node has no
// label of its own; its children are the top level domains.
type node struct {
	// label is this node's single domain label, such as "uk" or "co".
	label string
	// nodeType is one of the nodeType constants. Nodes start out as
	// nodeTypeParentOnly (see child) until a rule ends at them.
	nodeType int
	// icann starts out true and is cleared by main1 as soon as a rule ending
	// at this node is read outside the ICANN section.
	icann bool
	// wildcard is set when a "*." rule names this node as its parent.
	wildcard bool
	// nodesIndex and childrenIndex are the index of this node in the nodes
	// and the index of its children offset/length in the children arrays.
	nodesIndex, childrenIndex int
	// firstChild is the index of this node's first child, or zero if this
	// node has no children.
	firstChild int
	// children are the node's children, in strictly increasing node label order.
	children []*node
}
   453  
   454  func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
   455  	if err := f(w, n); err != nil {
   456  		return err
   457  	}
   458  	for _, c := range n.children {
   459  		if err := c.walk(w, f); err != nil {
   460  			return err
   461  		}
   462  	}
   463  	return nil
   464  }
   465  
   466  // child returns the child of n with the given label. The child is created if
   467  // it did not exist beforehand.
   468  func (n *node) child(label string) *node {
   469  	for _, c := range n.children {
   470  		if c.label == label {
   471  			return c
   472  		}
   473  	}
   474  	c := &node{
   475  		label:    label,
   476  		nodeType: nodeTypeParentOnly,
   477  		icann:    true,
   478  	}
   479  	n.children = append(n.children, c)
   480  	sort.Sort(byLabel(n.children))
   481  	return c
   482  }
   483  
   484  type byLabel []*node
   485  
   486  func (b byLabel) Len() int           { return len(b) }
   487  func (b byLabel) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
   488  func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
   489  
// nextNodesIndex is the next unused index in the generated nodes array.
// assignIndexes advances it once for every child it numbers.
var nextNodesIndex int

// childrenEncoding are the encoded entries in the generated children array.
// All these pre-defined entries have no children.
//
// The index of each pre-defined entry is the node type, plus numNodeType if
// the wildcard bit is set; assignIndexes relies on that for childless nodes.
// The shifted values 4, 5 and 6 are the node types 0, 1 and 2 with the
// wildcard bit, which sits directly above the node type field, also set.
var childrenEncoding = []uint32{
	0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal.
	1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException.
	2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly.
	4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal.
	5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException.
	6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly.
}

// firstCallToAssignIndexes is true until assignIndexes has handled its first
// node with children. In printReal's walk that is the root, which gets no
// entry in the children table.
var firstCallToAssignIndexes = true
   504  
   505  func assignIndexes(w io.Writer, n *node) error {
   506  	if len(n.children) != 0 {
   507  		// Assign nodesIndex.
   508  		n.firstChild = nextNodesIndex
   509  		for _, c := range n.children {
   510  			c.nodesIndex = nextNodesIndex
   511  			nextNodesIndex++
   512  		}
   513  
   514  		// The root node's children is implicit.
   515  		if firstCallToAssignIndexes {
   516  			firstCallToAssignIndexes = false
   517  			return nil
   518  		}
   519  
   520  		// Assign childrenIndex.
   521  		maxChildren = max(maxChildren, len(childrenEncoding))
   522  		if len(childrenEncoding) >= 1<<nodesBitsChildren {
   523  			return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding))
   524  		}
   525  		n.childrenIndex = len(childrenEncoding)
   526  		lo := uint32(n.firstChild)
   527  		hi := lo + uint32(len(n.children))
   528  		maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
   529  		if lo >= 1<<childrenBitsLo {
   530  			return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo)
   531  		}
   532  		if hi >= 1<<childrenBitsHi {
   533  			return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi)
   534  		}
   535  		enc := hi<<childrenBitsLo | lo
   536  		enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
   537  		if n.wildcard {
   538  			enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
   539  		}
   540  		childrenEncoding = append(childrenEncoding, enc)
   541  	} else {
   542  		n.childrenIndex = n.nodeType
   543  		if n.wildcard {
   544  			n.childrenIndex += numNodeType
   545  		}
   546  	}
   547  	return nil
   548  }
   549  
   550  func printNode(w io.Writer, n *node) error {
   551  	for _, c := range n.children {
   552  		s := "---------------"
   553  		if len(c.children) != 0 {
   554  			s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children))
   555  		}
   556  		encoding := labelEncoding[c.label]
   557  		if c.icann {
   558  			encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
   559  		}
   560  		encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
   561  		if *comments {
   562  			fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n",
   563  				encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard),
   564  				nodeTypeStr(c.nodeType), icannStr(c.icann), c.label,
   565  			)
   566  		} else {
   567  			fmt.Fprintf(w, "0x%x,\n", encoding)
   568  		}
   569  	}
   570  	return nil
   571  }
   572  
   573  func printNodeLabel(w io.Writer, n *node) error {
   574  	for _, c := range n.children {
   575  		fmt.Fprintf(w, "%q,\n", c.label)
   576  	}
   577  	return nil
   578  }
   579  
   580  func icannStr(icann bool) string {
   581  	if icann {
   582  		return "I"
   583  	}
   584  	return " "
   585  }
   586  
   587  func wildcardStr(wildcard bool) string {
   588  	if wildcard {
   589  		return "*"
   590  	}
   591  	return " "
   592  }
   593  
   594  // combineText combines all the strings in labelsList to form one giant string.
   595  // Overlapping strings will be merged: "arpa" and "parliament" could yield
   596  // "arparliament".
   597  func combineText(labelsList []string) string {
   598  	beforeLength := 0
   599  	for _, s := range labelsList {
   600  		beforeLength += len(s)
   601  	}
   602  
   603  	text := crush(removeSubstrings(labelsList))
   604  	if *v {
   605  		fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
   606  	}
   607  	return text
   608  }
   609  
   610  type byLength []string
   611  
   612  func (s byLength) Len() int           { return len(s) }
   613  func (s byLength) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
   614  func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
   615  
   616  // removeSubstrings returns a copy of its input with any strings removed
   617  // that are substrings of other provided strings.
   618  func removeSubstrings(input []string) []string {
   619  	// Make a copy of input.
   620  	ss := append(make([]string, 0, len(input)), input...)
   621  	sort.Sort(byLength(ss))
   622  
   623  	for i, shortString := range ss {
   624  		// For each string, only consider strings higher than it in sort order, i.e.
   625  		// of equal length or greater.
   626  		for _, longString := range ss[i+1:] {
   627  			if strings.Contains(longString, shortString) {
   628  				ss[i] = ""
   629  				break
   630  			}
   631  		}
   632  	}
   633  
   634  	// Remove the empty strings.
   635  	sort.Strings(ss)
   636  	for len(ss) > 0 && ss[0] == "" {
   637  		ss = ss[1:]
   638  	}
   639  	return ss
   640  }
   641  
   642  // crush combines a list of strings, taking advantage of overlaps. It returns a
   643  // single string that contains each input string as a substring.
   644  func crush(ss []string) string {
   645  	maxLabelLen := 0
   646  	for _, s := range ss {
   647  		if maxLabelLen < len(s) {
   648  			maxLabelLen = len(s)
   649  		}
   650  	}
   651  
   652  	for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
   653  		prefixes := makePrefixMap(ss, prefixLen)
   654  		for i, s := range ss {
   655  			if len(s) <= prefixLen {
   656  				continue
   657  			}
   658  			mergeLabel(ss, i, prefixLen, prefixes)
   659  		}
   660  	}
   661  
   662  	return strings.Join(ss, "")
   663  }
   664  
   665  // mergeLabel merges the label at ss[i] with the first available matching label
   666  // in prefixMap, where the last "prefixLen" characters in ss[i] match the first
   667  // "prefixLen" characters in the matching label.
   668  // It will merge ss[i] repeatedly until no more matches are available.
   669  // All matching labels merged into ss[i] are replaced by "".
   670  func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
   671  	s := ss[i]
   672  	suffix := s[len(s)-prefixLen:]
   673  	for _, j := range prefixes[suffix] {
   674  		// Empty strings mean "already used." Also avoid merging with self.
   675  		if ss[j] == "" || i == j {
   676  			continue
   677  		}
   678  		if *v {
   679  			fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
   680  				prefixLen, i, j, ss[i], ss[j], suffix)
   681  		}
   682  		ss[i] += ss[j][prefixLen:]
   683  		ss[j] = ""
   684  		// ss[i] has a new suffix, so merge again if possible.
   685  		// Note: we only have to merge again at the same prefix length. Shorter
   686  		// prefix lengths will be handled in the next iteration of crush's for loop.
   687  		// Can there be matches for longer prefix lengths, introduced by the merge?
   688  		// I believe that any such matches would by necessity have been eliminated
   689  		// during substring removal or merged at a higher prefix length. For
   690  		// instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde"
   691  		// would yield "abcde", which could be merged with "bcdef." However, in
   692  		// practice "cde" would already have been elimintated by removeSubstrings.
   693  		mergeLabel(ss, i, prefixLen, prefixes)
   694  		return
   695  	}
   696  }
   697  
   698  // prefixMap maps from a prefix to a list of strings containing that prefix. The
   699  // list of strings is represented as indexes into a slice of strings stored
   700  // elsewhere.
   701  type prefixMap map[string][]int
   702  
   703  // makePrefixMap constructs a prefixMap from a slice of strings.
   704  func makePrefixMap(ss []string, prefixLen int) prefixMap {
   705  	prefixes := make(prefixMap)
   706  	for i, s := range ss {
   707  		// We use < rather than <= because if a label matches on a prefix equal to
   708  		// its full length, that's actually a substring match handled by
   709  		// removeSubstrings.
   710  		if prefixLen < len(s) {
   711  			prefix := s[:prefixLen]
   712  			prefixes[prefix] = append(prefixes[prefix], i)
   713  		}
   714  	}
   715  
   716  	return prefixes
   717  }