github.com/varialus/godfly@v0.0.0-20130904042352-1934f9f095ab/misc/linkcheck/linkcheck.go

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively, notes the URLs and URL fragments
// it has seen, and prints a report of missing links at the end.
package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #)
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // url no fragment -> sources
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

// localLinks returns the distinct local (absolute-path) hrefs found in body,
// skipping links under /src/.
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	mv := aRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, m[1])
		}
	}
	return
}

var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

// pageIDs returns the id attribute values found in body.
func pageIDs(body string) (ids []string) {
	mv := idRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ids = append(ids, m[1])
	}
	return
}
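// exampleMatches is an added illustration, not part of the original tool: it
// shows what localLinks and pageIDs extract from a small, hypothetical HTML
// snippet of the kind godoc serves. It is never called by the crawler.
func exampleMatches() {
	const snippet = `<p><a href="/pkg/fmt/#Println">Println</a></p> <h2 id="Introduction">Introduction</h2>`
	fmt.Println(localLinks(snippet)) // prints: [/pkg/fmt/#Println]
	fmt.Println(pageIDs(snippet))    // prints: [Introduction]
}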
// crawl queues url for fetching if it has not been crawled already.
// url may contain a #fragment, and the fragment is then noted as needing to exist.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}
	mu.Lock()
	defer mu.Unlock()
	var frag string
	if i := strings.Index(url, "#"); i >= 0 {
		frag = url[i+1:]
		url = url[:i]
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	wg.Add(1)
	go func() {
		urlq <- url
	}()
}

// addProblem records and logs a problem found at url.
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	log.Print(msg)
	problems = append(problems, msg)
}

// crawlLoop fetches each URL queued on urlq, records the fragment IDs the
// page defines, and queues any local links the page contains.
func crawlLoop() {
	for url := range urlq {
		res, err := http.Get(url)
		if err != nil {
			addProblem(url, fmt.Sprintf("Error fetching: %v", err))
			wg.Done()
			continue
		}
		if res.StatusCode != 200 {
			addProblem(url, fmt.Sprintf("Status code = %d", res.StatusCode))
			wg.Done()
			continue
		}
		slurp, err := ioutil.ReadAll(res.Body)
		res.Body.Close()
		if err != nil {
			log.Fatalf("Error reading %s body: %v", url, err)
		}
		if *verbose {
			log.Printf("Len of %s: %d", url, len(slurp))
		}
		body := string(slurp)
		for _, ref := range localLinks(body) {
			if *verbose {
				log.Printf(" links to %s", ref)
			}
			dest := *root + ref
			linkSources[dest] = append(linkSources[dest], url)
			crawl(dest, url)
		}
		for _, id := range pageIDs(body) {
			if *verbose {
				log.Printf(" url %s has #%s", url, id)
			}
			fragExists[urlFrag{url, id}] = true
		}

		wg.Done()
	}
}

func main() {
	flag.Parse()

	go crawlLoop()
	crawl(*root, "")
	crawl(*root+"/doc/go1.1.html", "")

	wg.Wait()
	close(urlq)
	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
}
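// Example invocation (an added sketch, not part of the original file; it
// assumes a local godoc server is already serving the website on the port
// that -root defaults to):
//
//	godoc -http=:6060 &
//	go run linkcheck.go -root=http://localhost:6060 -verbose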