gitee.com/ks-custle/core-gm@v0.0.0-20230922171213-b83bdd97b62c/net/publicsuffix/gen.go (about) 1 // Copyright 2012 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:build ignore 6 // +build ignore 7 8 package main 9 10 // This program generates table.go and table_test.go based on the authoritative 11 // public suffix list at https://publicsuffix.org/list/effective_tld_names.dat 12 // 13 // The version is derived from 14 // https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat 15 // and a human-readable form is at 16 // https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat 17 // 18 // To fetch a particular git revision, such as 5c70ccd250, pass 19 // -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat" 20 // and -version "an explicit version string". 21 22 import ( 23 "bufio" 24 "bytes" 25 "flag" 26 "fmt" 27 http "gitee.com/ks-custle/core-gm/gmhttp" 28 "go/format" 29 "io" 30 "io/ioutil" 31 "os" 32 "regexp" 33 "sort" 34 "strings" 35 36 "gitee.com/ks-custle/core-gm/net/idna" 37 ) 38 39 const ( 40 // These sum of these four values must be no greater than 32. 41 nodesBitsChildren = 10 42 nodesBitsICANN = 1 43 nodesBitsTextOffset = 15 44 nodesBitsTextLength = 6 45 46 // These sum of these four values must be no greater than 32. 47 childrenBitsWildcard = 1 48 childrenBitsNodeType = 2 49 childrenBitsHi = 14 50 childrenBitsLo = 14 51 ) 52 53 var ( 54 maxChildren int 55 maxTextOffset int 56 maxTextLength int 57 maxHi uint32 58 maxLo uint32 59 ) 60 61 func max(a, b int) int { 62 if a < b { 63 return b 64 } 65 return a 66 } 67 68 func u32max(a, b uint32) uint32 { 69 if a < b { 70 return b 71 } 72 return a 73 } 74 75 const ( 76 nodeTypeNormal = 0 77 nodeTypeException = 1 78 nodeTypeParentOnly = 2 79 numNodeType = 3 80 ) 81 82 func nodeTypeStr(n int) string { 83 switch n { 84 case nodeTypeNormal: 85 return "+" 86 case nodeTypeException: 87 return "!" 88 case nodeTypeParentOnly: 89 return "o" 90 } 91 panic("unreachable") 92 } 93 94 const ( 95 defaultURL = "https://publicsuffix.org/list/effective_tld_names.dat" 96 gitCommitURL = "https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat" 97 ) 98 99 var ( 100 labelEncoding = map[string]uint32{} 101 labelsList = []string{} 102 labelsMap = map[string]bool{} 103 rules = []string{} 104 numICANNRules = 0 105 106 // validSuffixRE is used to check that the entries in the public suffix 107 // list are in canonical form (after Punycode encoding). Specifically, 108 // capital letters are not allowed. 109 validSuffixRE = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`) 110 111 shaRE = regexp.MustCompile(`"sha":"([^"]+)"`) 112 dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`) 113 114 comments = flag.Bool("comments", false, "generate table.go comments, for debugging") 115 subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging") 116 url = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead") 117 v = flag.Bool("v", false, "verbose output (to stderr)") 118 version = flag.String("version", "", "the effective_tld_names.dat version") 119 ) 120 121 func main() { 122 if err := main1(); err != nil { 123 fmt.Fprintln(os.Stderr, err) 124 os.Exit(1) 125 } 126 } 127 128 func main1() error { 129 flag.Parse() 130 if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 { 131 return fmt.Errorf("not enough bits to encode the nodes table") 132 } 133 if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 { 134 return fmt.Errorf("not enough bits to encode the children table") 135 } 136 if *version == "" { 137 if *url != defaultURL { 138 return fmt.Errorf("-version was not specified, and the -url is not the default one") 139 } 140 sha, date, err := gitCommit() 141 if err != nil { 142 return err 143 } 144 *version = fmt.Sprintf("publicsuffix.org's public_suffix_list.dat, git revision %s (%s)", sha, date) 145 } 146 var r io.Reader = os.Stdin 147 if *url != "" { 148 res, err := http.Get(*url) 149 if err != nil { 150 return err 151 } 152 if res.StatusCode != http.StatusOK { 153 return fmt.Errorf("bad GET status for %s: %s", *url, res.Status) 154 } 155 r = res.Body 156 defer res.Body.Close() 157 } 158 159 var root node 160 icann := false 161 br := bufio.NewReader(r) 162 for { 163 s, err := br.ReadString('\n') 164 if err != nil { 165 if err == io.EOF { 166 break 167 } 168 return err 169 } 170 s = strings.TrimSpace(s) 171 if strings.Contains(s, "BEGIN ICANN DOMAINS") { 172 if len(rules) != 0 { 173 return fmt.Errorf(`expected no rules before "BEGIN ICANN DOMAINS"`) 174 } 175 icann = true 176 continue 177 } 178 if strings.Contains(s, "END ICANN DOMAINS") { 179 icann, numICANNRules = false, len(rules) 180 continue 181 } 182 if s == "" || strings.HasPrefix(s, "//") { 183 continue 184 } 185 s, err = idna.ToASCII(s) 186 if err != nil { 187 return err 188 } 189 if !validSuffixRE.MatchString(s) { 190 return fmt.Errorf("bad publicsuffix.org list data: %q", s) 191 } 192 193 if *subset { 194 switch { 195 case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"): 196 case s == "ak.us" || strings.HasSuffix(s, ".ak.us"): 197 case s == "ao" || strings.HasSuffix(s, ".ao"): 198 case s == "ar" || strings.HasSuffix(s, ".ar"): 199 case s == "arpa" || strings.HasSuffix(s, ".arpa"): 200 case s == "cy" || strings.HasSuffix(s, ".cy"): 201 case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"): 202 case s == "jp": 203 case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"): 204 case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"): 205 case s == "om" || strings.HasSuffix(s, ".om"): 206 case s == "uk" || strings.HasSuffix(s, ".uk"): 207 case s == "uk.com" || strings.HasSuffix(s, ".uk.com"): 208 case s == "tw" || strings.HasSuffix(s, ".tw"): 209 case s == "zw" || strings.HasSuffix(s, ".zw"): 210 case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"): 211 // xn--p1ai is Russian-Cyrillic "рф". 212 default: 213 continue 214 } 215 } 216 217 rules = append(rules, s) 218 219 nt, wildcard := nodeTypeNormal, false 220 switch { 221 case strings.HasPrefix(s, "*."): 222 s, nt = s[2:], nodeTypeParentOnly 223 wildcard = true 224 case strings.HasPrefix(s, "!"): 225 s, nt = s[1:], nodeTypeException 226 } 227 labels := strings.Split(s, ".") 228 for n, i := &root, len(labels)-1; i >= 0; i-- { 229 label := labels[i] 230 n = n.child(label) 231 if i == 0 { 232 if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly { 233 n.nodeType = nt 234 } 235 n.icann = n.icann && icann 236 n.wildcard = n.wildcard || wildcard 237 } 238 labelsMap[label] = true 239 } 240 } 241 labelsList = make([]string, 0, len(labelsMap)) 242 for label := range labelsMap { 243 labelsList = append(labelsList, label) 244 } 245 sort.Strings(labelsList) 246 247 if err := generate(printReal, &root, "table.go"); err != nil { 248 return err 249 } 250 if err := generate(printTest, &root, "table_test.go"); err != nil { 251 return err 252 } 253 return nil 254 } 255 256 func generate(p func(io.Writer, *node) error, root *node, filename string) error { 257 buf := new(bytes.Buffer) 258 if err := p(buf, root); err != nil { 259 return err 260 } 261 b, err := format.Source(buf.Bytes()) 262 if err != nil { 263 return err 264 } 265 return ioutil.WriteFile(filename, b, 0644) 266 } 267 268 func gitCommit() (sha, date string, retErr error) { 269 res, err := http.Get(gitCommitURL) 270 if err != nil { 271 return "", "", err 272 } 273 if res.StatusCode != http.StatusOK { 274 return "", "", fmt.Errorf("bad GET status for %s: %s", gitCommitURL, res.Status) 275 } 276 defer res.Body.Close() 277 b, err := ioutil.ReadAll(res.Body) 278 if err != nil { 279 return "", "", err 280 } 281 if m := shaRE.FindSubmatch(b); m != nil { 282 sha = string(m[1]) 283 } 284 if m := dateRE.FindSubmatch(b); m != nil { 285 date = string(m[1]) 286 } 287 if sha == "" || date == "" { 288 retErr = fmt.Errorf("could not find commit SHA and date in %s", gitCommitURL) 289 } 290 return sha, date, retErr 291 } 292 293 func printTest(w io.Writer, n *node) error { 294 fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n") 295 fmt.Fprintf(w, "package publicsuffix\n\nconst numICANNRules = %d\n\nvar rules = [...]string{\n", numICANNRules) 296 for _, rule := range rules { 297 fmt.Fprintf(w, "%q,\n", rule) 298 } 299 fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n") 300 if err := n.walk(w, printNodeLabel); err != nil { 301 return err 302 } 303 fmt.Fprintf(w, "}\n") 304 return nil 305 } 306 307 func printReal(w io.Writer, n *node) error { 308 const header = `// generated by go run gen.go; DO NOT EDIT 309 310 package publicsuffix 311 312 const version = %q 313 314 const ( 315 nodesBitsChildren = %d 316 nodesBitsICANN = %d 317 nodesBitsTextOffset = %d 318 nodesBitsTextLength = %d 319 320 childrenBitsWildcard = %d 321 childrenBitsNodeType = %d 322 childrenBitsHi = %d 323 childrenBitsLo = %d 324 ) 325 326 const ( 327 nodeTypeNormal = %d 328 nodeTypeException = %d 329 nodeTypeParentOnly = %d 330 ) 331 332 // numTLD is the number of top level domains. 333 const numTLD = %d 334 335 ` 336 fmt.Fprintf(w, header, *version, 337 nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength, 338 childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo, 339 nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children)) 340 341 text := combineText(labelsList) 342 if text == "" { 343 return fmt.Errorf("internal error: makeText returned no text") 344 } 345 for _, label := range labelsList { 346 offset, length := strings.Index(text, label), len(label) 347 if offset < 0 { 348 return fmt.Errorf("internal error: could not find %q in text %q", label, text) 349 } 350 maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length) 351 if offset >= 1<<nodesBitsTextOffset { 352 return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset) 353 } 354 if length >= 1<<nodesBitsTextLength { 355 return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length) 356 } 357 labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length) 358 } 359 fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ") 360 for len(text) > 0 { 361 n, plus := len(text), "" 362 if n > 64 { 363 n, plus = 64, " +" 364 } 365 fmt.Fprintf(w, "%q%s\n", text[:n], plus) 366 text = text[n:] 367 } 368 369 if err := n.walk(w, assignIndexes); err != nil { 370 return err 371 } 372 373 fmt.Fprintf(w, ` 374 375 // nodes is the list of nodes. Each node is represented as a uint32, which 376 // encodes the node's children, wildcard bit and node type (as an index into 377 // the children array), ICANN bit and text. 378 // 379 // If the table was generated with the -comments flag, there is a //-comment 380 // after each node's data. In it is the nodes-array indexes of the children, 381 // formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The 382 // nodeType is printed as + for normal, ! for exception, and o for parent-only 383 // nodes that have children but don't match a domain label in their own right. 384 // An I denotes an ICANN domain. 385 // 386 // The layout within the uint32, from MSB to LSB, is: 387 // [%2d bits] unused 388 // [%2d bits] children index 389 // [%2d bits] ICANN bit 390 // [%2d bits] text index 391 // [%2d bits] text length 392 var nodes = [...]uint32{ 393 `, 394 32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength, 395 nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength) 396 if err := n.walk(w, printNode); err != nil { 397 return err 398 } 399 fmt.Fprintf(w, `} 400 401 // children is the list of nodes' children, the parent's wildcard bit and the 402 // parent's node type. If a node has no children then their children index 403 // will be in the range [0, 6), depending on the wildcard bit and node type. 404 // 405 // The layout within the uint32, from MSB to LSB, is: 406 // [%2d bits] unused 407 // [%2d bits] wildcard bit 408 // [%2d bits] node type 409 // [%2d bits] high nodes index (exclusive) of children 410 // [%2d bits] low nodes index (inclusive) of children 411 var children=[...]uint32{ 412 `, 413 32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo, 414 childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo) 415 for i, c := range childrenEncoding { 416 s := "---------------" 417 lo := c & (1<<childrenBitsLo - 1) 418 hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1) 419 if lo != hi { 420 s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi) 421 } 422 nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1) 423 wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0 424 if *comments { 425 fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n", 426 c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType)) 427 } else { 428 fmt.Fprintf(w, "0x%x,\n", c) 429 } 430 } 431 fmt.Fprintf(w, "}\n\n") 432 fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1) 433 fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1) 434 fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1) 435 fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1) 436 fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1) 437 return nil 438 } 439 440 type node struct { 441 label string 442 nodeType int 443 icann bool 444 wildcard bool 445 // nodesIndex and childrenIndex are the index of this node in the nodes 446 // and the index of its children offset/length in the children arrays. 447 nodesIndex, childrenIndex int 448 // firstChild is the index of this node's first child, or zero if this 449 // node has no children. 450 firstChild int 451 // children are the node's children, in strictly increasing node label order. 452 children []*node 453 } 454 455 func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error { 456 if err := f(w, n); err != nil { 457 return err 458 } 459 for _, c := range n.children { 460 if err := c.walk(w, f); err != nil { 461 return err 462 } 463 } 464 return nil 465 } 466 467 // child returns the child of n with the given label. The child is created if 468 // it did not exist beforehand. 469 func (n *node) child(label string) *node { 470 for _, c := range n.children { 471 if c.label == label { 472 return c 473 } 474 } 475 c := &node{ 476 label: label, 477 nodeType: nodeTypeParentOnly, 478 icann: true, 479 } 480 n.children = append(n.children, c) 481 sort.Sort(byLabel(n.children)) 482 return c 483 } 484 485 type byLabel []*node 486 487 func (b byLabel) Len() int { return len(b) } 488 func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 489 func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label } 490 491 var nextNodesIndex int 492 493 // childrenEncoding are the encoded entries in the generated children array. 494 // All these pre-defined entries have no children. 495 var childrenEncoding = []uint32{ 496 0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal. 497 1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException. 498 2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly. 499 4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal. 500 5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException. 501 6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly. 502 } 503 504 var firstCallToAssignIndexes = true 505 506 func assignIndexes(w io.Writer, n *node) error { 507 if len(n.children) != 0 { 508 // Assign nodesIndex. 509 n.firstChild = nextNodesIndex 510 for _, c := range n.children { 511 c.nodesIndex = nextNodesIndex 512 nextNodesIndex++ 513 } 514 515 // The root node's children is implicit. 516 if firstCallToAssignIndexes { 517 firstCallToAssignIndexes = false 518 return nil 519 } 520 521 // Assign childrenIndex. 522 maxChildren = max(maxChildren, len(childrenEncoding)) 523 if len(childrenEncoding) >= 1<<nodesBitsChildren { 524 return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding)) 525 } 526 n.childrenIndex = len(childrenEncoding) 527 lo := uint32(n.firstChild) 528 hi := lo + uint32(len(n.children)) 529 maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi) 530 if lo >= 1<<childrenBitsLo { 531 return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo) 532 } 533 if hi >= 1<<childrenBitsHi { 534 return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi) 535 } 536 enc := hi<<childrenBitsLo | lo 537 enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi) 538 if n.wildcard { 539 enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType) 540 } 541 childrenEncoding = append(childrenEncoding, enc) 542 } else { 543 n.childrenIndex = n.nodeType 544 if n.wildcard { 545 n.childrenIndex += numNodeType 546 } 547 } 548 return nil 549 } 550 551 func printNode(w io.Writer, n *node) error { 552 for _, c := range n.children { 553 s := "---------------" 554 if len(c.children) != 0 { 555 s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children)) 556 } 557 encoding := labelEncoding[c.label] 558 if c.icann { 559 encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset) 560 } 561 encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN) 562 if *comments { 563 fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n", 564 encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard), 565 nodeTypeStr(c.nodeType), icannStr(c.icann), c.label, 566 ) 567 } else { 568 fmt.Fprintf(w, "0x%x,\n", encoding) 569 } 570 } 571 return nil 572 } 573 574 func printNodeLabel(w io.Writer, n *node) error { 575 for _, c := range n.children { 576 fmt.Fprintf(w, "%q,\n", c.label) 577 } 578 return nil 579 } 580 581 func icannStr(icann bool) string { 582 if icann { 583 return "I" 584 } 585 return " " 586 } 587 588 func wildcardStr(wildcard bool) string { 589 if wildcard { 590 return "*" 591 } 592 return " " 593 } 594 595 // combineText combines all the strings in labelsList to form one giant string. 596 // Overlapping strings will be merged: "arpa" and "parliament" could yield 597 // "arparliament". 598 func combineText(labelsList []string) string { 599 beforeLength := 0 600 for _, s := range labelsList { 601 beforeLength += len(s) 602 } 603 604 text := crush(removeSubstrings(labelsList)) 605 if *v { 606 fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text)) 607 } 608 return text 609 } 610 611 type byLength []string 612 613 func (s byLength) Len() int { return len(s) } 614 func (s byLength) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 615 func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) } 616 617 // removeSubstrings returns a copy of its input with any strings removed 618 // that are substrings of other provided strings. 619 func removeSubstrings(input []string) []string { 620 // Make a copy of input. 621 ss := append(make([]string, 0, len(input)), input...) 622 sort.Sort(byLength(ss)) 623 624 for i, shortString := range ss { 625 // For each string, only consider strings higher than it in sort order, i.e. 626 // of equal length or greater. 627 for _, longString := range ss[i+1:] { 628 if strings.Contains(longString, shortString) { 629 ss[i] = "" 630 break 631 } 632 } 633 } 634 635 // Remove the empty strings. 636 sort.Strings(ss) 637 for len(ss) > 0 && ss[0] == "" { 638 ss = ss[1:] 639 } 640 return ss 641 } 642 643 // crush combines a list of strings, taking advantage of overlaps. It returns a 644 // single string that contains each input string as a substring. 645 func crush(ss []string) string { 646 maxLabelLen := 0 647 for _, s := range ss { 648 if maxLabelLen < len(s) { 649 maxLabelLen = len(s) 650 } 651 } 652 653 for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- { 654 prefixes := makePrefixMap(ss, prefixLen) 655 for i, s := range ss { 656 if len(s) <= prefixLen { 657 continue 658 } 659 mergeLabel(ss, i, prefixLen, prefixes) 660 } 661 } 662 663 return strings.Join(ss, "") 664 } 665 666 // mergeLabel merges the label at ss[i] with the first available matching label 667 // in prefixMap, where the last "prefixLen" characters in ss[i] match the first 668 // "prefixLen" characters in the matching label. 669 // It will merge ss[i] repeatedly until no more matches are available. 670 // All matching labels merged into ss[i] are replaced by "". 671 func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) { 672 s := ss[i] 673 suffix := s[len(s)-prefixLen:] 674 for _, j := range prefixes[suffix] { 675 // Empty strings mean "already used." Also avoid merging with self. 676 if ss[j] == "" || i == j { 677 continue 678 } 679 if *v { 680 fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n", 681 prefixLen, i, j, ss[i], ss[j], suffix) 682 } 683 ss[i] += ss[j][prefixLen:] 684 ss[j] = "" 685 // ss[i] has a new suffix, so merge again if possible. 686 // Note: we only have to merge again at the same prefix length. Shorter 687 // prefix lengths will be handled in the next iteration of crush's for loop. 688 // Can there be matches for longer prefix lengths, introduced by the merge? 689 // I believe that any such matches would by necessity have been eliminated 690 // during substring removal or merged at a higher prefix length. For 691 // instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde" 692 // would yield "abcde", which could be merged with "bcdef." However, in 693 // practice "cde" would already have been elimintated by removeSubstrings. 694 mergeLabel(ss, i, prefixLen, prefixes) 695 return 696 } 697 } 698 699 // prefixMap maps from a prefix to a list of strings containing that prefix. The 700 // list of strings is represented as indexes into a slice of strings stored 701 // elsewhere. 702 type prefixMap map[string][]int 703 704 // makePrefixMap constructs a prefixMap from a slice of strings. 705 func makePrefixMap(ss []string, prefixLen int) prefixMap { 706 prefixes := make(prefixMap) 707 for i, s := range ss { 708 // We use < rather than <= because if a label matches on a prefix equal to 709 // its full length, that's actually a substring match handled by 710 // removeSubstrings. 711 if prefixLen < len(s) { 712 prefix := s[:prefixLen] 713 prefixes[prefix] = append(prefixes[prefix], i) 714 } 715 } 716 717 return prefixes 718 }