github.com/MangoDowner/go-gm@v0.0.0-20180818020936-8baa2bd4408c/misc/linkcheck/linkcheck.go

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively, noting the URLs and URL fragments it
// has seen, and prints a report of missing links at the end.
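//
// A typical invocation, assuming godoc is serving the site locally on
// the port matching the -root default below:
//
//	godoc -http=:6060 &
//	linkcheck -verbose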
package main

import (
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #)
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

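// aRx matches the target of an <a href=...> link whose URL begins
// with "/", i.e. links local to the site; absolute URLs are ignored.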
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // url no fragment -> sources
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

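// localLinks returns the deduplicated site-relative links found in
// body, skipping anything under /src/.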
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	mv := aRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, ref)
		}
	}
	return
}

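// idRx matches the value of an id= attribute, which declares an anchor
// that a URL #fragment can refer to.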
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

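// pageIDs returns all anchor ids declared in body.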
func pageIDs(body string) (ids []string) {
	mv := idRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ids = append(ids, m[1])
	}
	return
}

// crawl registers url as needing to be crawled, if it hasn't been
// already. url may contain a #fragment; the fragment is then noted as
// needing to exist on the target page.
func crawl(url string, sourceURL string) {
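	// Skip the release history page.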
	if strings.Contains(url, "/devel/release") {
		return
	}
	mu.Lock()
	defer mu.Unlock()
	var frag string
	if i := strings.Index(url, "#"); i >= 0 {
		frag = url[i+1:]
		url = url[:i]
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	wg.Add(1)
	go func() {
		urlq <- url
	}()
}

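// addProblem records a problem with url, along with the pages that
// linked to it. In verbose mode the problem is also logged immediately.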
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	if *verbose {
		log.Print(msg)
	}
	problems = append(problems, msg)
}

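// crawlLoop runs in its own goroutine, fetching each URL that arrives
// on urlq and recording any fetch error as a problem.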
func crawlLoop() {
	for url := range urlq {
		if err := doCrawl(url); err != nil {
			addProblem(url, err.Error())
		}
		// Count the fetch as done only after any problem has been
		// recorded, so main's wg.Wait cannot race with addProblem.
		wg.Done()
	}
}

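// doCrawl fetches url, queues a crawl of every local link on the page,
// and records which fragment anchors (ids) the page defines.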
func doCrawl(url string) error {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	// Close the body on every return path, including redirects and
	// non-200 responses.
	defer res.Body.Close()
	// Handle redirects.
	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %v", err)
		}
		if !strings.HasPrefix(newURL.String(), *root) {
			// Skip off-site redirects.
			return nil
		}
		crawl(newURL.String(), url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return fmt.Errorf("reading %s body: %v", url, err)
	}
	if *verbose {
		log.Printf("Len of %s: %d", url, len(slurp))
	}
	body := string(slurp)
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf("  links to %s", ref)
		}
		dest := *root + ref
		linkSources[dest] = append(linkSources[dest], url)
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
	return nil
}

func main() {
	flag.Parse()

	go crawlLoop()
	crawl(*root, "")

	wg.Wait()
	close(urlq)
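	// All fetches are done; check that every fragment some page linked
	// to was actually declared as an id on the target page.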
	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}