github.com/varialus/godfly@v0.0.0-20130904042352-1934f9f095ab/misc/linkcheck/linkcheck.go

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively, noting the URLs and URL fragments it
// has seen, and prints a report of missing links at the end.
package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose output")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #).
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

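// aRx matches anchor tags whose href is a rooted path (one beginning
// with "/"), capturing the path. For example, <a href="/pkg/fmt/">
// yields "/pkg/fmt/". Absolute URLs to other hosts are never matched.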
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by the crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // URL without fragment -> pages that link to it
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

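// localLinks returns the unique rooted links (paths beginning with "/")
// found in body, ignoring links into the /src/ tree.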
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	mv := aRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			// Skip links into the source tree.
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, ref)
		}
	}
	return
}

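// idRx matches id attributes, capturing their values; for example,
// id="pkg-overview" yields "pkg-overview".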
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

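// pageIDs returns the values of all id attributes found in body.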
func pageIDs(body string) (ids []string) {
	mv := idRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ids = append(ids, m[1])
	}
	return
}

// crawl registers url as needing to be fetched. url may contain a
// #fragment, in which case the fragment is noted as needing to exist
// on the target page. URLs containing "/devel/release" are skipped.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}
	mu.Lock()
	defer mu.Unlock()
	var frag string
	if i := strings.Index(url, "#"); i >= 0 {
		frag = url[i+1:]
		url = url[:i]
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	wg.Add(1)
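	// Send on a new goroutine: urlq is unbuffered and crawlLoop, its
	// only receiver, may itself be the caller of crawl, so a direct
	// send here could deadlock (and would block while holding mu).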
	go func() {
		urlq <- url
	}()
}

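// addProblem records and logs a problem found at url.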
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	log.Print(msg)
	problems = append(problems, msg)
}

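// crawlLoop is the sole worker: it fetches each URL received on urlq,
// records the links and fragment IDs found on the page, and queues
// newly discovered local links for crawling.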
func crawlLoop() {
	for url := range urlq {
		res, err := http.Get(url)
		if err != nil {
			addProblem(url, fmt.Sprintf("Error fetching: %v", err))
			wg.Done()
			continue
		}
		if res.StatusCode != 200 {
			addProblem(url, fmt.Sprintf("Status code = %d", res.StatusCode))
			wg.Done()
			continue
		}
		slurp, err := ioutil.ReadAll(res.Body)
		res.Body.Close()
		if err != nil {
			log.Fatalf("Error reading %s body: %v", url, err)
		}
		if *verbose {
			log.Printf("Len of %s: %d", url, len(slurp))
		}
		body := string(slurp)
		for _, ref := range localLinks(body) {
			if *verbose {
				log.Printf("  links to %s", ref)
			}
			dest := *root + ref
			linkSources[dest] = append(linkSources[dest], url)
			crawl(dest, url)
		}
		for _, id := range pageIDs(body) {
			if *verbose {
				log.Printf(" url %s has #%s", url, id)
			}
			fragExists[urlFrag{url, id}] = true
		}

		wg.Done()
	}
}

func main() {
	flag.Parse()

	go crawlLoop()
	// Seed the crawl with the root page and the Go 1.1 release notes.
	crawl(*root, "")
	crawl(*root+"/doc/go1.1.html", "")

	// Wait for all outstanding fetches to finish, then stop the crawl loop.
	wg.Wait()
	close(urlq)
	// Report any fragments that were linked to but never found.
	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
}