github.com/Andyfoo/golang/x/net@v0.0.0-20190901054642-57c1bf301704/publicsuffix/gen.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build ignore 6 7 package main 8 9 // This program generates table.go and table_test.go based on the authoritative 10 // public suffix list at https://publicsuffix.org/list/effective_tld_names.dat 11 // 12 // The version is derived from 13 // https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat 14 // and a human-readable form is at 15 // https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat 16 // 17 // To fetch a particular git revision, such as 5c70ccd250, pass 18 // -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat" 19 // and -version "an explicit version string". 20 21 import ( 22 "bufio" 23 "bytes" 24 "flag" 25 "fmt" 26 "go/format" 27 "io" 28 "io/ioutil" 29 "net/http" 30 "os" 31 "regexp" 32 "sort" 33 "strings" 34 35 "github.com/Andyfoo/golang/x/net/idna" 36 ) 37 38 const ( 39 // These sum of these four values must be no greater than 32. 40 nodesBitsChildren = 10 41 nodesBitsICANN = 1 42 nodesBitsTextOffset = 15 43 nodesBitsTextLength = 6 44 45 // These sum of these four values must be no greater than 32. 46 childrenBitsWildcard = 1 47 childrenBitsNodeType = 2 48 childrenBitsHi = 14 49 childrenBitsLo = 14 50 ) 51 52 var ( 53 maxChildren int 54 maxTextOffset int 55 maxTextLength int 56 maxHi uint32 57 maxLo uint32 58 ) 59 60 func max(a, b int) int { 61 if a < b { 62 return b 63 } 64 return a 65 } 66 67 func u32max(a, b uint32) uint32 { 68 if a < b { 69 return b 70 } 71 return a 72 } 73 74 const ( 75 nodeTypeNormal = 0 76 nodeTypeException = 1 77 nodeTypeParentOnly = 2 78 numNodeType = 3 79 ) 80 81 func nodeTypeStr(n int) string { 82 switch n { 83 case nodeTypeNormal: 84 return "+" 85 case nodeTypeException: 86 return "!" 87 case nodeTypeParentOnly: 88 return "o" 89 } 90 panic("unreachable") 91 } 92 93 const ( 94 defaultURL = "https://publicsuffix.org/list/effective_tld_names.dat" 95 gitCommitURL = "https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat" 96 ) 97 98 var ( 99 labelEncoding = map[string]uint32{} 100 labelsList = []string{} 101 labelsMap = map[string]bool{} 102 rules = []string{} 103 numICANNRules = 0 104 105 // validSuffixRE is used to check that the entries in the public suffix 106 // list are in canonical form (after Punycode encoding). Specifically, 107 // capital letters are not allowed. 108 validSuffixRE = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`) 109 110 shaRE = regexp.MustCompile(`"sha":"([^"]+)"`) 111 dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`) 112 113 comments = flag.Bool("comments", false, "generate table.go comments, for debugging") 114 subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging") 115 url = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead") 116 v = flag.Bool("v", false, "verbose output (to stderr)") 117 version = flag.String("version", "", "the effective_tld_names.dat version") 118 ) 119 120 func main() { 121 if err := main1(); err != nil { 122 fmt.Fprintln(os.Stderr, err) 123 os.Exit(1) 124 } 125 } 126 127 func main1() error { 128 flag.Parse() 129 if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 { 130 return fmt.Errorf("not enough bits to encode the nodes table") 131 } 132 if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 { 133 return fmt.Errorf("not enough bits to encode the children table") 134 } 135 if *version == "" { 136 if *url != defaultURL { 137 return fmt.Errorf("-version was not specified, and the -url is not the default one") 138 } 139 sha, date, err := gitCommit() 140 if err != nil { 141 return err 142 } 143 *version = fmt.Sprintf("publicsuffix.org's public_suffix_list.dat, git revision %s (%s)", sha, date) 144 } 145 var r io.Reader = os.Stdin 146 if *url != "" { 147 res, err := http.Get(*url) 148 if err != nil { 149 return err 150 } 151 if res.StatusCode != http.StatusOK { 152 return fmt.Errorf("bad GET status for %s: %d", *url, res.Status) 153 } 154 r = res.Body 155 defer res.Body.Close() 156 } 157 158 var root node 159 icann := false 160 br := bufio.NewReader(r) 161 for { 162 s, err := br.ReadString('\n') 163 if err != nil { 164 if err == io.EOF { 165 break 166 } 167 return err 168 } 169 s = strings.TrimSpace(s) 170 if strings.Contains(s, "BEGIN ICANN DOMAINS") { 171 if len(rules) != 0 { 172 return fmt.Errorf(`expected no rules before "BEGIN ICANN DOMAINS"`) 173 } 174 icann = true 175 continue 176 } 177 if strings.Contains(s, "END ICANN DOMAINS") { 178 icann, numICANNRules = false, len(rules) 179 continue 180 } 181 if s == "" || strings.HasPrefix(s, "//") { 182 continue 183 } 184 s, err = idna.ToASCII(s) 185 if err != nil { 186 return err 187 } 188 if !validSuffixRE.MatchString(s) { 189 return fmt.Errorf("bad publicsuffix.org list data: %q", s) 190 } 191 192 if *subset { 193 switch { 194 case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"): 195 case s == "ak.us" || strings.HasSuffix(s, ".ak.us"): 196 case s == "ao" || strings.HasSuffix(s, ".ao"): 197 case s == "ar" || strings.HasSuffix(s, ".ar"): 198 case s == "arpa" || strings.HasSuffix(s, ".arpa"): 199 case s == "cy" || strings.HasSuffix(s, ".cy"): 200 case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"): 201 case s == "jp": 202 case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"): 203 case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"): 204 case s == "om" || strings.HasSuffix(s, ".om"): 205 case s == "uk" || strings.HasSuffix(s, ".uk"): 206 case s == "uk.com" || strings.HasSuffix(s, ".uk.com"): 207 case s == "tw" || strings.HasSuffix(s, ".tw"): 208 case s == "zw" || strings.HasSuffix(s, ".zw"): 209 case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"): 210 // xn--p1ai is Russian-Cyrillic "рф". 211 default: 212 continue 213 } 214 } 215 216 rules = append(rules, s) 217 218 nt, wildcard := nodeTypeNormal, false 219 switch { 220 case strings.HasPrefix(s, "*."): 221 s, nt = s[2:], nodeTypeParentOnly 222 wildcard = true 223 case strings.HasPrefix(s, "!"): 224 s, nt = s[1:], nodeTypeException 225 } 226 labels := strings.Split(s, ".") 227 for n, i := &root, len(labels)-1; i >= 0; i-- { 228 label := labels[i] 229 n = n.child(label) 230 if i == 0 { 231 if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly { 232 n.nodeType = nt 233 } 234 n.icann = n.icann && icann 235 n.wildcard = n.wildcard || wildcard 236 } 237 labelsMap[label] = true 238 } 239 } 240 labelsList = make([]string, 0, len(labelsMap)) 241 for label := range labelsMap { 242 labelsList = append(labelsList, label) 243 } 244 sort.Strings(labelsList) 245 246 if err := generate(printReal, &root, "table.go"); err != nil { 247 return err 248 } 249 if err := generate(printTest, &root, "table_test.go"); err != nil { 250 return err 251 } 252 return nil 253 } 254 255 func generate(p func(io.Writer, *node) error, root *node, filename string) error { 256 buf := new(bytes.Buffer) 257 if err := p(buf, root); err != nil { 258 return err 259 } 260 b, err := format.Source(buf.Bytes()) 261 if err != nil { 262 return err 263 } 264 return ioutil.WriteFile(filename, b, 0644) 265 } 266 267 func gitCommit() (sha, date string, retErr error) { 268 res, err := http.Get(gitCommitURL) 269 if err != nil { 270 return "", "", err 271 } 272 if res.StatusCode != http.StatusOK { 273 return "", "", fmt.Errorf("bad GET status for %s: %d", gitCommitURL, res.Status) 274 } 275 defer res.Body.Close() 276 b, err := ioutil.ReadAll(res.Body) 277 if err != nil { 278 return "", "", err 279 } 280 if m := shaRE.FindSubmatch(b); m != nil { 281 sha = string(m[1]) 282 } 283 if m := dateRE.FindSubmatch(b); m != nil { 284 date = string(m[1]) 285 } 286 if sha == "" || date == "" { 287 retErr = fmt.Errorf("could not find commit SHA and date in %s", gitCommitURL) 288 } 289 return sha, date, retErr 290 } 291 292 func printTest(w io.Writer, n *node) error { 293 fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n") 294 fmt.Fprintf(w, "package publicsuffix\n\nconst numICANNRules = %d\n\nvar rules = [...]string{\n", numICANNRules) 295 for _, rule := range rules { 296 fmt.Fprintf(w, "%q,\n", rule) 297 } 298 fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n") 299 if err := n.walk(w, printNodeLabel); err != nil { 300 return err 301 } 302 fmt.Fprintf(w, "}\n") 303 return nil 304 } 305 306 func printReal(w io.Writer, n *node) error { 307 const header = `// generated by go run gen.go; DO NOT EDIT 308 309 package publicsuffix 310 311 const version = %q 312 313 const ( 314 nodesBitsChildren = %d 315 nodesBitsICANN = %d 316 nodesBitsTextOffset = %d 317 nodesBitsTextLength = %d 318 319 childrenBitsWildcard = %d 320 childrenBitsNodeType = %d 321 childrenBitsHi = %d 322 childrenBitsLo = %d 323 ) 324 325 const ( 326 nodeTypeNormal = %d 327 nodeTypeException = %d 328 nodeTypeParentOnly = %d 329 ) 330 331 // numTLD is the number of top level domains. 332 const numTLD = %d 333 334 ` 335 fmt.Fprintf(w, header, *version, 336 nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength, 337 childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo, 338 nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children)) 339 340 text := combineText(labelsList) 341 if text == "" { 342 return fmt.Errorf("internal error: makeText returned no text") 343 } 344 for _, label := range labelsList { 345 offset, length := strings.Index(text, label), len(label) 346 if offset < 0 { 347 return fmt.Errorf("internal error: could not find %q in text %q", label, text) 348 } 349 maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length) 350 if offset >= 1<<nodesBitsTextOffset { 351 return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset) 352 } 353 if length >= 1<<nodesBitsTextLength { 354 return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length) 355 } 356 labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length) 357 } 358 fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ") 359 for len(text) > 0 { 360 n, plus := len(text), "" 361 if n > 64 { 362 n, plus = 64, " +" 363 } 364 fmt.Fprintf(w, "%q%s\n", text[:n], plus) 365 text = text[n:] 366 } 367 368 if err := n.walk(w, assignIndexes); err != nil { 369 return err 370 } 371 372 fmt.Fprintf(w, ` 373 374 // nodes is the list of nodes. Each node is represented as a uint32, which 375 // encodes the node's children, wildcard bit and node type (as an index into 376 // the children array), ICANN bit and text. 377 // 378 // If the table was generated with the -comments flag, there is a //-comment 379 // after each node's data. In it is the nodes-array indexes of the children, 380 // formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The 381 // nodeType is printed as + for normal, ! for exception, and o for parent-only 382 // nodes that have children but don't match a domain label in their own right. 383 // An I denotes an ICANN domain. 384 // 385 // The layout within the uint32, from MSB to LSB, is: 386 // [%2d bits] unused 387 // [%2d bits] children index 388 // [%2d bits] ICANN bit 389 // [%2d bits] text index 390 // [%2d bits] text length 391 var nodes = [...]uint32{ 392 `, 393 32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength, 394 nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength) 395 if err := n.walk(w, printNode); err != nil { 396 return err 397 } 398 fmt.Fprintf(w, `} 399 400 // children is the list of nodes' children, the parent's wildcard bit and the 401 // parent's node type. If a node has no children then their children index 402 // will be in the range [0, 6), depending on the wildcard bit and node type. 403 // 404 // The layout within the uint32, from MSB to LSB, is: 405 // [%2d bits] unused 406 // [%2d bits] wildcard bit 407 // [%2d bits] node type 408 // [%2d bits] high nodes index (exclusive) of children 409 // [%2d bits] low nodes index (inclusive) of children 410 var children=[...]uint32{ 411 `, 412 32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo, 413 childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo) 414 for i, c := range childrenEncoding { 415 s := "---------------" 416 lo := c & (1<<childrenBitsLo - 1) 417 hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1) 418 if lo != hi { 419 s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi) 420 } 421 nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1) 422 wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0 423 if *comments { 424 fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n", 425 c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType)) 426 } else { 427 fmt.Fprintf(w, "0x%x,\n", c) 428 } 429 } 430 fmt.Fprintf(w, "}\n\n") 431 fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1) 432 fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1) 433 fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1) 434 fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1) 435 fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1) 436 return nil 437 } 438 439 type node struct { 440 label string 441 nodeType int 442 icann bool 443 wildcard bool 444 // nodesIndex and childrenIndex are the index of this node in the nodes 445 // and the index of its children offset/length in the children arrays. 446 nodesIndex, childrenIndex int 447 // firstChild is the index of this node's first child, or zero if this 448 // node has no children. 449 firstChild int 450 // children are the node's children, in strictly increasing node label order. 451 children []*node 452 } 453 454 func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error { 455 if err := f(w, n); err != nil { 456 return err 457 } 458 for _, c := range n.children { 459 if err := c.walk(w, f); err != nil { 460 return err 461 } 462 } 463 return nil 464 } 465 466 // child returns the child of n with the given label. The child is created if 467 // it did not exist beforehand. 468 func (n *node) child(label string) *node { 469 for _, c := range n.children { 470 if c.label == label { 471 return c 472 } 473 } 474 c := &node{ 475 label: label, 476 nodeType: nodeTypeParentOnly, 477 icann: true, 478 } 479 n.children = append(n.children, c) 480 sort.Sort(byLabel(n.children)) 481 return c 482 } 483 484 type byLabel []*node 485 486 func (b byLabel) Len() int { return len(b) } 487 func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 488 func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label } 489 490 var nextNodesIndex int 491 492 // childrenEncoding are the encoded entries in the generated children array. 493 // All these pre-defined entries have no children. 494 var childrenEncoding = []uint32{ 495 0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal. 496 1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException. 497 2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly. 498 4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal. 499 5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException. 500 6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly. 501 } 502 503 var firstCallToAssignIndexes = true 504 505 func assignIndexes(w io.Writer, n *node) error { 506 if len(n.children) != 0 { 507 // Assign nodesIndex. 508 n.firstChild = nextNodesIndex 509 for _, c := range n.children { 510 c.nodesIndex = nextNodesIndex 511 nextNodesIndex++ 512 } 513 514 // The root node's children is implicit. 515 if firstCallToAssignIndexes { 516 firstCallToAssignIndexes = false 517 return nil 518 } 519 520 // Assign childrenIndex. 521 maxChildren = max(maxChildren, len(childrenEncoding)) 522 if len(childrenEncoding) >= 1<<nodesBitsChildren { 523 return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding)) 524 } 525 n.childrenIndex = len(childrenEncoding) 526 lo := uint32(n.firstChild) 527 hi := lo + uint32(len(n.children)) 528 maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi) 529 if lo >= 1<<childrenBitsLo { 530 return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo) 531 } 532 if hi >= 1<<childrenBitsHi { 533 return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi) 534 } 535 enc := hi<<childrenBitsLo | lo 536 enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi) 537 if n.wildcard { 538 enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType) 539 } 540 childrenEncoding = append(childrenEncoding, enc) 541 } else { 542 n.childrenIndex = n.nodeType 543 if n.wildcard { 544 n.childrenIndex += numNodeType 545 } 546 } 547 return nil 548 } 549 550 func printNode(w io.Writer, n *node) error { 551 for _, c := range n.children { 552 s := "---------------" 553 if len(c.children) != 0 { 554 s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children)) 555 } 556 encoding := labelEncoding[c.label] 557 if c.icann { 558 encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset) 559 } 560 encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN) 561 if *comments { 562 fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n", 563 encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard), 564 nodeTypeStr(c.nodeType), icannStr(c.icann), c.label, 565 ) 566 } else { 567 fmt.Fprintf(w, "0x%x,\n", encoding) 568 } 569 } 570 return nil 571 } 572 573 func printNodeLabel(w io.Writer, n *node) error { 574 for _, c := range n.children { 575 fmt.Fprintf(w, "%q,\n", c.label) 576 } 577 return nil 578 } 579 580 func icannStr(icann bool) string { 581 if icann { 582 return "I" 583 } 584 return " " 585 } 586 587 func wildcardStr(wildcard bool) string { 588 if wildcard { 589 return "*" 590 } 591 return " " 592 } 593 594 // combineText combines all the strings in labelsList to form one giant string. 595 // Overlapping strings will be merged: "arpa" and "parliament" could yield 596 // "arparliament". 597 func combineText(labelsList []string) string { 598 beforeLength := 0 599 for _, s := range labelsList { 600 beforeLength += len(s) 601 } 602 603 text := crush(removeSubstrings(labelsList)) 604 if *v { 605 fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text)) 606 } 607 return text 608 } 609 610 type byLength []string 611 612 func (s byLength) Len() int { return len(s) } 613 func (s byLength) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 614 func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) } 615 616 // removeSubstrings returns a copy of its input with any strings removed 617 // that are substrings of other provided strings. 618 func removeSubstrings(input []string) []string { 619 // Make a copy of input. 620 ss := append(make([]string, 0, len(input)), input...) 621 sort.Sort(byLength(ss)) 622 623 for i, shortString := range ss { 624 // For each string, only consider strings higher than it in sort order, i.e. 625 // of equal length or greater. 626 for _, longString := range ss[i+1:] { 627 if strings.Contains(longString, shortString) { 628 ss[i] = "" 629 break 630 } 631 } 632 } 633 634 // Remove the empty strings. 635 sort.Strings(ss) 636 for len(ss) > 0 && ss[0] == "" { 637 ss = ss[1:] 638 } 639 return ss 640 } 641 642 // crush combines a list of strings, taking advantage of overlaps. It returns a 643 // single string that contains each input string as a substring. 644 func crush(ss []string) string { 645 maxLabelLen := 0 646 for _, s := range ss { 647 if maxLabelLen < len(s) { 648 maxLabelLen = len(s) 649 } 650 } 651 652 for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- { 653 prefixes := makePrefixMap(ss, prefixLen) 654 for i, s := range ss { 655 if len(s) <= prefixLen { 656 continue 657 } 658 mergeLabel(ss, i, prefixLen, prefixes) 659 } 660 } 661 662 return strings.Join(ss, "") 663 } 664 665 // mergeLabel merges the label at ss[i] with the first available matching label 666 // in prefixMap, where the last "prefixLen" characters in ss[i] match the first 667 // "prefixLen" characters in the matching label. 668 // It will merge ss[i] repeatedly until no more matches are available. 669 // All matching labels merged into ss[i] are replaced by "". 670 func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) { 671 s := ss[i] 672 suffix := s[len(s)-prefixLen:] 673 for _, j := range prefixes[suffix] { 674 // Empty strings mean "already used." Also avoid merging with self. 675 if ss[j] == "" || i == j { 676 continue 677 } 678 if *v { 679 fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n", 680 prefixLen, i, j, ss[i], ss[j], suffix) 681 } 682 ss[i] += ss[j][prefixLen:] 683 ss[j] = "" 684 // ss[i] has a new suffix, so merge again if possible. 685 // Note: we only have to merge again at the same prefix length. Shorter 686 // prefix lengths will be handled in the next iteration of crush's for loop. 687 // Can there be matches for longer prefix lengths, introduced by the merge? 688 // I believe that any such matches would by necessity have been eliminated 689 // during substring removal or merged at a higher prefix length. For 690 // instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde" 691 // would yield "abcde", which could be merged with "bcdef." However, in 692 // practice "cde" would already have been elimintated by removeSubstrings. 693 mergeLabel(ss, i, prefixLen, prefixes) 694 return 695 } 696 } 697 698 // prefixMap maps from a prefix to a list of strings containing that prefix. The 699 // list of strings is represented as indexes into a slice of strings stored 700 // elsewhere. 701 type prefixMap map[string][]int 702 703 // makePrefixMap constructs a prefixMap from a slice of strings. 704 func makePrefixMap(ss []string, prefixLen int) prefixMap { 705 prefixes := make(prefixMap) 706 for i, s := range ss { 707 // We use < rather than <= because if a label matches on a prefix equal to 708 // its full length, that's actually a substring match handled by 709 // removeSubstrings. 710 if prefixLen < len(s) { 711 prefix := s[:prefixLen] 712 prefixes[prefix] = append(prefixes[prefix], i) 713 } 714 } 715 716 return prefixes 717 }