github.com/hxx258456/ccgo@v0.0.5-0.20230213014102-48b35f46f66f/net/publicsuffix/gen.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  // +build ignore
     7  
     8  package main
     9  
    10  // This program generates table.go and table_test.go based on the authoritative
    11  // public suffix list at https://publicsuffix.org/list/effective_tld_names.dat
    12  //
    13  // The version is derived from
    14  // https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat
    15  // and a human-readable form is at
    16  // https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat
    17  //
    18  // To fetch a particular git revision, such as 5c70ccd250, pass
    19  // -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat"
    20  // and -version "an explicit version string".
    21  
    22  import (
    23  	"bufio"
    24  	"bytes"
    25  	"flag"
    26  	"fmt"
    27  	http "github.com/hxx258456/ccgo/gmhttp"
    28  	"go/format"
    29  	"io"
    30  	"io/ioutil"
    31  	"os"
    32  	"regexp"
    33  	"sort"
    34  	"strings"
    35  
    36  	"github.com/hxx258456/ccgo/net/idna"
    37  )
    38  
    39  const (
    40  	// These sum of these four values must be no greater than 32.
    41  	nodesBitsChildren   = 10
    42  	nodesBitsICANN      = 1
    43  	nodesBitsTextOffset = 15
    44  	nodesBitsTextLength = 6
    45  
    46  	// These sum of these four values must be no greater than 32.
    47  	childrenBitsWildcard = 1
    48  	childrenBitsNodeType = 2
    49  	childrenBitsHi       = 14
    50  	childrenBitsLo       = 14
    51  )
    52  
    53  var (
    54  	maxChildren   int
    55  	maxTextOffset int
    56  	maxTextLength int
    57  	maxHi         uint32
    58  	maxLo         uint32
    59  )
    60  
    61  func max(a, b int) int {
    62  	if a < b {
    63  		return b
    64  	}
    65  	return a
    66  }
    67  
    68  func u32max(a, b uint32) uint32 {
    69  	if a < b {
    70  		return b
    71  	}
    72  	return a
    73  }
    74  
    75  const (
    76  	nodeTypeNormal     = 0
    77  	nodeTypeException  = 1
    78  	nodeTypeParentOnly = 2
    79  	numNodeType        = 3
    80  )
    81  
    82  func nodeTypeStr(n int) string {
    83  	switch n {
    84  	case nodeTypeNormal:
    85  		return "+"
    86  	case nodeTypeException:
    87  		return "!"
    88  	case nodeTypeParentOnly:
    89  		return "o"
    90  	}
    91  	panic("unreachable")
    92  }
    93  
    94  const (
    95  	defaultURL   = "https://publicsuffix.org/list/effective_tld_names.dat"
    96  	gitCommitURL = "https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat"
    97  )
    98  
    99  var (
   100  	labelEncoding = map[string]uint32{}
   101  	labelsList    = []string{}
   102  	labelsMap     = map[string]bool{}
   103  	rules         = []string{}
   104  	numICANNRules = 0
   105  
   106  	// validSuffixRE is used to check that the entries in the public suffix
   107  	// list are in canonical form (after Punycode encoding). Specifically,
   108  	// capital letters are not allowed.
   109  	validSuffixRE = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
   110  
   111  	shaRE  = regexp.MustCompile(`"sha":"([^"]+)"`)
   112  	dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`)
   113  
   114  	comments = flag.Bool("comments", false, "generate table.go comments, for debugging")
   115  	subset   = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
   116  	url      = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead")
   117  	v        = flag.Bool("v", false, "verbose output (to stderr)")
   118  	version  = flag.String("version", "", "the effective_tld_names.dat version")
   119  )
   120  
   121  func main() {
   122  	if err := main1(); err != nil {
   123  		fmt.Fprintln(os.Stderr, err)
   124  		os.Exit(1)
   125  	}
   126  }
   127  
   128  func main1() error {
   129  	flag.Parse()
   130  	if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 {
   131  		return fmt.Errorf("not enough bits to encode the nodes table")
   132  	}
   133  	if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
   134  		return fmt.Errorf("not enough bits to encode the children table")
   135  	}
   136  	if *version == "" {
   137  		if *url != defaultURL {
   138  			return fmt.Errorf("-version was not specified, and the -url is not the default one")
   139  		}
   140  		sha, date, err := gitCommit()
   141  		if err != nil {
   142  			return err
   143  		}
   144  		*version = fmt.Sprintf("publicsuffix.org's public_suffix_list.dat, git revision %s (%s)", sha, date)
   145  	}
   146  	var r io.Reader = os.Stdin
   147  	if *url != "" {
   148  		res, err := http.Get(*url)
   149  		if err != nil {
   150  			return err
   151  		}
   152  		if res.StatusCode != http.StatusOK {
   153  			return fmt.Errorf("bad GET status for %s: %s", *url, res.Status)
   154  		}
   155  		r = res.Body
   156  		defer res.Body.Close()
   157  	}
   158  
   159  	var root node
   160  	icann := false
   161  	br := bufio.NewReader(r)
   162  	for {
   163  		s, err := br.ReadString('\n')
   164  		if err != nil {
   165  			if err == io.EOF {
   166  				break
   167  			}
   168  			return err
   169  		}
   170  		s = strings.TrimSpace(s)
   171  		if strings.Contains(s, "BEGIN ICANN DOMAINS") {
   172  			if len(rules) != 0 {
   173  				return fmt.Errorf(`expected no rules before "BEGIN ICANN DOMAINS"`)
   174  			}
   175  			icann = true
   176  			continue
   177  		}
   178  		if strings.Contains(s, "END ICANN DOMAINS") {
   179  			icann, numICANNRules = false, len(rules)
   180  			continue
   181  		}
   182  		if s == "" || strings.HasPrefix(s, "//") {
   183  			continue
   184  		}
   185  		s, err = idna.ToASCII(s)
   186  		if err != nil {
   187  			return err
   188  		}
   189  		if !validSuffixRE.MatchString(s) {
   190  			return fmt.Errorf("bad publicsuffix.org list data: %q", s)
   191  		}
   192  
   193  		if *subset {
   194  			switch {
   195  			case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
   196  			case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
   197  			case s == "ao" || strings.HasSuffix(s, ".ao"):
   198  			case s == "ar" || strings.HasSuffix(s, ".ar"):
   199  			case s == "arpa" || strings.HasSuffix(s, ".arpa"):
   200  			case s == "cy" || strings.HasSuffix(s, ".cy"):
   201  			case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
   202  			case s == "jp":
   203  			case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
   204  			case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
   205  			case s == "om" || strings.HasSuffix(s, ".om"):
   206  			case s == "uk" || strings.HasSuffix(s, ".uk"):
   207  			case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
   208  			case s == "tw" || strings.HasSuffix(s, ".tw"):
   209  			case s == "zw" || strings.HasSuffix(s, ".zw"):
   210  			case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
   211  				// xn--p1ai is Russian-Cyrillic "рф".
   212  			default:
   213  				continue
   214  			}
   215  		}
   216  
   217  		rules = append(rules, s)
   218  
   219  		nt, wildcard := nodeTypeNormal, false
   220  		switch {
   221  		case strings.HasPrefix(s, "*."):
   222  			s, nt = s[2:], nodeTypeParentOnly
   223  			wildcard = true
   224  		case strings.HasPrefix(s, "!"):
   225  			s, nt = s[1:], nodeTypeException
   226  		}
   227  		labels := strings.Split(s, ".")
   228  		for n, i := &root, len(labels)-1; i >= 0; i-- {
   229  			label := labels[i]
   230  			n = n.child(label)
   231  			if i == 0 {
   232  				if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
   233  					n.nodeType = nt
   234  				}
   235  				n.icann = n.icann && icann
   236  				n.wildcard = n.wildcard || wildcard
   237  			}
   238  			labelsMap[label] = true
   239  		}
   240  	}
   241  	labelsList = make([]string, 0, len(labelsMap))
   242  	for label := range labelsMap {
   243  		labelsList = append(labelsList, label)
   244  	}
   245  	sort.Strings(labelsList)
   246  
   247  	if err := generate(printReal, &root, "table.go"); err != nil {
   248  		return err
   249  	}
   250  	if err := generate(printTest, &root, "table_test.go"); err != nil {
   251  		return err
   252  	}
   253  	return nil
   254  }
   255  
   256  func generate(p func(io.Writer, *node) error, root *node, filename string) error {
   257  	buf := new(bytes.Buffer)
   258  	if err := p(buf, root); err != nil {
   259  		return err
   260  	}
   261  	b, err := format.Source(buf.Bytes())
   262  	if err != nil {
   263  		return err
   264  	}
   265  	return ioutil.WriteFile(filename, b, 0644)
   266  }
   267  
   268  func gitCommit() (sha, date string, retErr error) {
   269  	res, err := http.Get(gitCommitURL)
   270  	if err != nil {
   271  		return "", "", err
   272  	}
   273  	if res.StatusCode != http.StatusOK {
   274  		return "", "", fmt.Errorf("bad GET status for %s: %s", gitCommitURL, res.Status)
   275  	}
   276  	defer res.Body.Close()
   277  	b, err := ioutil.ReadAll(res.Body)
   278  	if err != nil {
   279  		return "", "", err
   280  	}
   281  	if m := shaRE.FindSubmatch(b); m != nil {
   282  		sha = string(m[1])
   283  	}
   284  	if m := dateRE.FindSubmatch(b); m != nil {
   285  		date = string(m[1])
   286  	}
   287  	if sha == "" || date == "" {
   288  		retErr = fmt.Errorf("could not find commit SHA and date in %s", gitCommitURL)
   289  	}
   290  	return sha, date, retErr
   291  }
   292  
   293  func printTest(w io.Writer, n *node) error {
   294  	fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
   295  	fmt.Fprintf(w, "package publicsuffix\n\nconst numICANNRules = %d\n\nvar rules = [...]string{\n", numICANNRules)
   296  	for _, rule := range rules {
   297  		fmt.Fprintf(w, "%q,\n", rule)
   298  	}
   299  	fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
   300  	if err := n.walk(w, printNodeLabel); err != nil {
   301  		return err
   302  	}
   303  	fmt.Fprintf(w, "}\n")
   304  	return nil
   305  }
   306  
   307  func printReal(w io.Writer, n *node) error {
   308  	const header = `// generated by go run gen.go; DO NOT EDIT
   309  
   310  package publicsuffix
   311  
   312  const version = %q
   313  
   314  const (
   315  	nodesBitsChildren   = %d
   316  	nodesBitsICANN      = %d
   317  	nodesBitsTextOffset = %d
   318  	nodesBitsTextLength = %d
   319  
   320  	childrenBitsWildcard = %d
   321  	childrenBitsNodeType = %d
   322  	childrenBitsHi       = %d
   323  	childrenBitsLo       = %d
   324  )
   325  
   326  const (
   327  	nodeTypeNormal     = %d
   328  	nodeTypeException  = %d
   329  	nodeTypeParentOnly = %d
   330  )
   331  
   332  // numTLD is the number of top level domains.
   333  const numTLD = %d
   334  
   335  `
   336  	fmt.Fprintf(w, header, *version,
   337  		nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
   338  		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
   339  		nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
   340  
   341  	text := combineText(labelsList)
   342  	if text == "" {
   343  		return fmt.Errorf("internal error: makeText returned no text")
   344  	}
   345  	for _, label := range labelsList {
   346  		offset, length := strings.Index(text, label), len(label)
   347  		if offset < 0 {
   348  			return fmt.Errorf("internal error: could not find %q in text %q", label, text)
   349  		}
   350  		maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
   351  		if offset >= 1<<nodesBitsTextOffset {
   352  			return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
   353  		}
   354  		if length >= 1<<nodesBitsTextLength {
   355  			return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
   356  		}
   357  		labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length)
   358  	}
   359  	fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
   360  	for len(text) > 0 {
   361  		n, plus := len(text), ""
   362  		if n > 64 {
   363  			n, plus = 64, " +"
   364  		}
   365  		fmt.Fprintf(w, "%q%s\n", text[:n], plus)
   366  		text = text[n:]
   367  	}
   368  
   369  	if err := n.walk(w, assignIndexes); err != nil {
   370  		return err
   371  	}
   372  
   373  	fmt.Fprintf(w, `
   374  
   375  // nodes is the list of nodes. Each node is represented as a uint32, which
   376  // encodes the node's children, wildcard bit and node type (as an index into
   377  // the children array), ICANN bit and text.
   378  //
   379  // If the table was generated with the -comments flag, there is a //-comment
   380  // after each node's data. In it is the nodes-array indexes of the children,
   381  // formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
   382  // nodeType is printed as + for normal, ! for exception, and o for parent-only
   383  // nodes that have children but don't match a domain label in their own right.
   384  // An I denotes an ICANN domain.
   385  //
   386  // The layout within the uint32, from MSB to LSB, is:
   387  //	[%2d bits] unused
   388  //	[%2d bits] children index
   389  //	[%2d bits] ICANN bit
   390  //	[%2d bits] text index
   391  //	[%2d bits] text length
   392  var nodes = [...]uint32{
   393  `,
   394  		32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
   395  		nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
   396  	if err := n.walk(w, printNode); err != nil {
   397  		return err
   398  	}
   399  	fmt.Fprintf(w, `}
   400  
   401  // children is the list of nodes' children, the parent's wildcard bit and the
   402  // parent's node type. If a node has no children then their children index
   403  // will be in the range [0, 6), depending on the wildcard bit and node type.
   404  //
   405  // The layout within the uint32, from MSB to LSB, is:
   406  //	[%2d bits] unused
   407  //	[%2d bits] wildcard bit
   408  //	[%2d bits] node type
   409  //	[%2d bits] high nodes index (exclusive) of children
   410  //	[%2d bits] low nodes index (inclusive) of children
   411  var children=[...]uint32{
   412  `,
   413  		32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
   414  		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
   415  	for i, c := range childrenEncoding {
   416  		s := "---------------"
   417  		lo := c & (1<<childrenBitsLo - 1)
   418  		hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1)
   419  		if lo != hi {
   420  			s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi)
   421  		}
   422  		nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1)
   423  		wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0
   424  		if *comments {
   425  			fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n",
   426  				c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType))
   427  		} else {
   428  			fmt.Fprintf(w, "0x%x,\n", c)
   429  		}
   430  	}
   431  	fmt.Fprintf(w, "}\n\n")
   432  	fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
   433  	fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
   434  	fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
   435  	fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
   436  	fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
   437  	return nil
   438  }
   439  
   440  type node struct {
   441  	label    string
   442  	nodeType int
   443  	icann    bool
   444  	wildcard bool
   445  	// nodesIndex and childrenIndex are the index of this node in the nodes
   446  	// and the index of its children offset/length in the children arrays.
   447  	nodesIndex, childrenIndex int
   448  	// firstChild is the index of this node's first child, or zero if this
   449  	// node has no children.
   450  	firstChild int
   451  	// children are the node's children, in strictly increasing node label order.
   452  	children []*node
   453  }
   454  
   455  func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
   456  	if err := f(w, n); err != nil {
   457  		return err
   458  	}
   459  	for _, c := range n.children {
   460  		if err := c.walk(w, f); err != nil {
   461  			return err
   462  		}
   463  	}
   464  	return nil
   465  }
   466  
   467  // child returns the child of n with the given label. The child is created if
   468  // it did not exist beforehand.
   469  func (n *node) child(label string) *node {
   470  	for _, c := range n.children {
   471  		if c.label == label {
   472  			return c
   473  		}
   474  	}
   475  	c := &node{
   476  		label:    label,
   477  		nodeType: nodeTypeParentOnly,
   478  		icann:    true,
   479  	}
   480  	n.children = append(n.children, c)
   481  	sort.Sort(byLabel(n.children))
   482  	return c
   483  }
   484  
   485  type byLabel []*node
   486  
   487  func (b byLabel) Len() int           { return len(b) }
   488  func (b byLabel) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
   489  func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
   490  
   491  var nextNodesIndex int
   492  
   493  // childrenEncoding are the encoded entries in the generated children array.
   494  // All these pre-defined entries have no children.
   495  var childrenEncoding = []uint32{
   496  	0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal.
   497  	1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException.
   498  	2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly.
   499  	4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal.
   500  	5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException.
   501  	6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly.
   502  }
   503  
   504  var firstCallToAssignIndexes = true
   505  
   506  func assignIndexes(w io.Writer, n *node) error {
   507  	if len(n.children) != 0 {
   508  		// Assign nodesIndex.
   509  		n.firstChild = nextNodesIndex
   510  		for _, c := range n.children {
   511  			c.nodesIndex = nextNodesIndex
   512  			nextNodesIndex++
   513  		}
   514  
   515  		// The root node's children is implicit.
   516  		if firstCallToAssignIndexes {
   517  			firstCallToAssignIndexes = false
   518  			return nil
   519  		}
   520  
   521  		// Assign childrenIndex.
   522  		maxChildren = max(maxChildren, len(childrenEncoding))
   523  		if len(childrenEncoding) >= 1<<nodesBitsChildren {
   524  			return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding))
   525  		}
   526  		n.childrenIndex = len(childrenEncoding)
   527  		lo := uint32(n.firstChild)
   528  		hi := lo + uint32(len(n.children))
   529  		maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
   530  		if lo >= 1<<childrenBitsLo {
   531  			return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo)
   532  		}
   533  		if hi >= 1<<childrenBitsHi {
   534  			return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi)
   535  		}
   536  		enc := hi<<childrenBitsLo | lo
   537  		enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
   538  		if n.wildcard {
   539  			enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
   540  		}
   541  		childrenEncoding = append(childrenEncoding, enc)
   542  	} else {
   543  		n.childrenIndex = n.nodeType
   544  		if n.wildcard {
   545  			n.childrenIndex += numNodeType
   546  		}
   547  	}
   548  	return nil
   549  }
   550  
   551  func printNode(w io.Writer, n *node) error {
   552  	for _, c := range n.children {
   553  		s := "---------------"
   554  		if len(c.children) != 0 {
   555  			s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children))
   556  		}
   557  		encoding := labelEncoding[c.label]
   558  		if c.icann {
   559  			encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
   560  		}
   561  		encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
   562  		if *comments {
   563  			fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n",
   564  				encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard),
   565  				nodeTypeStr(c.nodeType), icannStr(c.icann), c.label,
   566  			)
   567  		} else {
   568  			fmt.Fprintf(w, "0x%x,\n", encoding)
   569  		}
   570  	}
   571  	return nil
   572  }
   573  
   574  func printNodeLabel(w io.Writer, n *node) error {
   575  	for _, c := range n.children {
   576  		fmt.Fprintf(w, "%q,\n", c.label)
   577  	}
   578  	return nil
   579  }
   580  
   581  func icannStr(icann bool) string {
   582  	if icann {
   583  		return "I"
   584  	}
   585  	return " "
   586  }
   587  
   588  func wildcardStr(wildcard bool) string {
   589  	if wildcard {
   590  		return "*"
   591  	}
   592  	return " "
   593  }
   594  
   595  // combineText combines all the strings in labelsList to form one giant string.
   596  // Overlapping strings will be merged: "arpa" and "parliament" could yield
   597  // "arparliament".
   598  func combineText(labelsList []string) string {
   599  	beforeLength := 0
   600  	for _, s := range labelsList {
   601  		beforeLength += len(s)
   602  	}
   603  
   604  	text := crush(removeSubstrings(labelsList))
   605  	if *v {
   606  		fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
   607  	}
   608  	return text
   609  }
   610  
   611  type byLength []string
   612  
   613  func (s byLength) Len() int           { return len(s) }
   614  func (s byLength) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
   615  func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
   616  
   617  // removeSubstrings returns a copy of its input with any strings removed
   618  // that are substrings of other provided strings.
   619  func removeSubstrings(input []string) []string {
   620  	// Make a copy of input.
   621  	ss := append(make([]string, 0, len(input)), input...)
   622  	sort.Sort(byLength(ss))
   623  
   624  	for i, shortString := range ss {
   625  		// For each string, only consider strings higher than it in sort order, i.e.
   626  		// of equal length or greater.
   627  		for _, longString := range ss[i+1:] {
   628  			if strings.Contains(longString, shortString) {
   629  				ss[i] = ""
   630  				break
   631  			}
   632  		}
   633  	}
   634  
   635  	// Remove the empty strings.
   636  	sort.Strings(ss)
   637  	for len(ss) > 0 && ss[0] == "" {
   638  		ss = ss[1:]
   639  	}
   640  	return ss
   641  }
   642  
   643  // crush combines a list of strings, taking advantage of overlaps. It returns a
   644  // single string that contains each input string as a substring.
   645  func crush(ss []string) string {
   646  	maxLabelLen := 0
   647  	for _, s := range ss {
   648  		if maxLabelLen < len(s) {
   649  			maxLabelLen = len(s)
   650  		}
   651  	}
   652  
   653  	for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
   654  		prefixes := makePrefixMap(ss, prefixLen)
   655  		for i, s := range ss {
   656  			if len(s) <= prefixLen {
   657  				continue
   658  			}
   659  			mergeLabel(ss, i, prefixLen, prefixes)
   660  		}
   661  	}
   662  
   663  	return strings.Join(ss, "")
   664  }
   665  
   666  // mergeLabel merges the label at ss[i] with the first available matching label
   667  // in prefixMap, where the last "prefixLen" characters in ss[i] match the first
   668  // "prefixLen" characters in the matching label.
   669  // It will merge ss[i] repeatedly until no more matches are available.
   670  // All matching labels merged into ss[i] are replaced by "".
   671  func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
   672  	s := ss[i]
   673  	suffix := s[len(s)-prefixLen:]
   674  	for _, j := range prefixes[suffix] {
   675  		// Empty strings mean "already used." Also avoid merging with self.
   676  		if ss[j] == "" || i == j {
   677  			continue
   678  		}
   679  		if *v {
   680  			fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
   681  				prefixLen, i, j, ss[i], ss[j], suffix)
   682  		}
   683  		ss[i] += ss[j][prefixLen:]
   684  		ss[j] = ""
   685  		// ss[i] has a new suffix, so merge again if possible.
   686  		// Note: we only have to merge again at the same prefix length. Shorter
   687  		// prefix lengths will be handled in the next iteration of crush's for loop.
   688  		// Can there be matches for longer prefix lengths, introduced by the merge?
   689  		// I believe that any such matches would by necessity have been eliminated
   690  		// during substring removal or merged at a higher prefix length. For
   691  		// instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde"
   692  		// would yield "abcde", which could be merged with "bcdef." However, in
   693  		// practice "cde" would already have been elimintated by removeSubstrings.
   694  		mergeLabel(ss, i, prefixLen, prefixes)
   695  		return
   696  	}
   697  }
   698  
   699  // prefixMap maps from a prefix to a list of strings containing that prefix. The
   700  // list of strings is represented as indexes into a slice of strings stored
   701  // elsewhere.
   702  type prefixMap map[string][]int
   703  
   704  // makePrefixMap constructs a prefixMap from a slice of strings.
   705  func makePrefixMap(ss []string, prefixLen int) prefixMap {
   706  	prefixes := make(prefixMap)
   707  	for i, s := range ss {
   708  		// We use < rather than <= because if a label matches on a prefix equal to
   709  		// its full length, that's actually a substring match handled by
   710  		// removeSubstrings.
   711  		if prefixLen < len(s) {
   712  			prefix := s[:prefixLen]
   713  			prefixes[prefix] = append(prefixes[prefix], i)
   714  		}
   715  	}
   716  
   717  	return prefixes
   718  }