github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/waybackwalker/main.go (about) 1 package main 2 3 // TODO: add some error tolerance (eg wayback machine produces a server timeout 408 sometimes) 4 // TODO: filter out links to other domains 5 6 import ( 7 "flag" 8 "fmt" 9 "github.com/andybalholm/cascadia" 10 "github.com/bcampbell/arts/util" 11 "golang.org/x/net/html" 12 "net/http" 13 "net/url" 14 "os" 15 "time" 16 ) 17 18 type Options struct { 19 dayFrom, dayTo string 20 } 21 22 func (opts *Options) DayRange() ([]time.Time, error) { 23 from, to, err := opts.parseDays() 24 if err != nil { 25 return nil, err 26 } 27 28 // make sure we're at start of day 29 from = time.Date(from.Year(), from.Month(), from.Day(), 0, 0, 0, 0, time.UTC) 30 31 out := []time.Time{} 32 for day := from; !day.After(to); day = day.AddDate(0, 0, 1) { 33 out = append(out, day) 34 } 35 return out, nil 36 } 37 38 func (opts *Options) parseDays() (time.Time, time.Time, error) { 39 40 const dayFmt = "2006-01-02" 41 z := time.Time{} 42 43 var from, to time.Time 44 var err error 45 if opts.dayFrom == "" { 46 return z, z, fmt.Errorf("'from' day required") 47 } 48 from, err = time.Parse(dayFmt, opts.dayFrom) 49 if err != nil { 50 return z, z, fmt.Errorf("bad 'from' day (%s)", err) 51 } 52 53 if opts.dayTo == "" { 54 return z, z, fmt.Errorf("'to' day required") 55 } 56 to, err = time.Parse(dayFmt, opts.dayTo) 57 if err != nil { 58 return z, z, fmt.Errorf("bad 'to' day (%s)", err) 59 } 60 61 if to.Before(from) { 62 return z, z, fmt.Errorf("bad date range ('from' is after 'to')") 63 } 64 65 return from, to, nil 66 } 67 68 func main() { 69 flag.Usage = func() { 70 71 fmt.Fprintf(os.Stderr, "Usage:\n") 72 fmt.Fprintf(os.Stderr, "%s [OPTIONS] URL(s)...\n", os.Args[0]) 73 fmt.Fprintf(os.Stderr, ` 74 Grabs page snapshots from wayback machine for URLs over the given time 75 period, scans them for links, and dumps them out to stdout. 76 77 78 Input URLs can be absolute or relative - relative links will be 79 considered relative to the previous URL in the list. 80 eg: 81 http://www.telegraph.co.uk/ /news/ /sport/ /business/ 82 is just fine. 83 84 options: 85 `) 86 flag.PrintDefaults() 87 } 88 89 opts := Options{} 90 91 flag.StringVar(&opts.dayFrom, "from", "", "from date") 92 flag.StringVar(&opts.dayTo, "to", "", "to date") 93 flag.Parse() 94 95 var err error 96 if flag.NArg() < 1 { 97 fmt.Fprintf(os.Stderr, "ERROR: missing URL(s)\n") 98 flag.Usage() 99 os.Exit(1) 100 } 101 102 err = doit(&opts, flag.Args()) 103 if err != nil { 104 fmt.Fprintf(os.Stderr, "ERROR: %s\n", err) 105 os.Exit(1) 106 } 107 108 os.Exit(0) 109 } 110 111 // expand a list of URLs, using the previous URL in the list as the context for the next 112 func expandURLs(origURLs []string) ([]string, error) { 113 prev := &url.URL{} 114 cooked := make([]string, len(origURLs)) 115 for i, origURL := range origURLs { 116 parsed, err := prev.Parse(origURL) 117 if err != nil { 118 return nil, fmt.Errorf("bad URL '%s'", origURL) 119 } 120 121 if !parsed.IsAbs() { 122 return nil, fmt.Errorf("URL not absolute (and can't be guessed from previous) '%s'", origURL) 123 } 124 prev = parsed 125 cooked[i] = parsed.String() 126 } 127 return cooked, nil 128 } 129 130 func doit(opts *Options, urls []string) error { 131 urls, err := expandURLs(urls) 132 if err != nil { 133 return err 134 } 135 136 days, err := opts.DayRange() 137 if err != nil { 138 return err 139 } 140 141 client := &http.Client{ 142 Transport: util.NewPoliteTripper(), 143 } 144 145 for _, day := range days { 146 timeStamp := day.Format("20060102") 147 for _, u := range urls { 148 err := doPage(client, u, timeStamp) 149 if err != nil { 150 return err 151 } 152 } 153 154 } 155 return nil 156 } 157 158 func doPage(client *http.Client, u string, when string) error { 159 linkSel := cascadia.MustCompile("a") 160 161 // the "id_" suffix asks for the original html. Without this wayback machine 162 // will rewrite all the links to go through itself for easy browsing. 163 // This will redirect (302) to the nearest memento to our requested timestamp. 164 page := fmt.Sprintf("http://web.archive.org/web/%sid_/%s", when, u) 165 root, err := fetchAndParse(client, page) 166 if err != nil { 167 return fmt.Errorf("%s failed: %s\n", page, err) 168 } 169 links, err := grabLinks(root, linkSel, u) 170 if err != nil { 171 return fmt.Errorf("%s error: %s\n", page, err) 172 } 173 for _, l := range links { 174 fmt.Println(l) 175 } 176 177 return nil 178 } 179 180 func fetchAndParse(client *http.Client, u string) (*html.Node, error) { 181 req, err := http.NewRequest("GET", u, nil) 182 if err != nil { 183 return nil, err 184 } 185 // NOTE: FT.com always returns 403 if no Accept header is present. 186 // Seems like a reasonable thing to send anyway... 187 req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") 188 189 fmt.Fprintf(os.Stderr, "fetch %s\n", u) 190 191 resp, err := client.Do(req) 192 if err != nil { 193 return nil, err 194 } 195 defer resp.Body.Close() 196 if resp.StatusCode < 200 || resp.StatusCode >= 300 { 197 err = fmt.Errorf("HTTP code %d (%s)", resp.StatusCode, u) 198 return nil, err 199 } 200 201 return html.Parse(resp.Body) 202 } 203 204 // GetAttr retrieved the value of an attribute on a node. 205 // Returns empty string if attribute doesn't exist. 206 func GetAttr(n *html.Node, attr string) string { 207 for _, a := range n.Attr { 208 if a.Key == attr { 209 return a.Val 210 } 211 } 212 return "" 213 } 214 215 func grabLinks(root *html.Node, linkSel cascadia.Selector, baseURL string) ([]string, error) { 216 u, err := url.Parse(baseURL) 217 if err != nil { 218 return nil, err 219 } 220 221 out := []string{} 222 for _, a := range linkSel.MatchAll(root) { 223 link, err := getAbsHref(a, u) 224 if err != nil { 225 fmt.Fprintf(os.Stderr, "%s BAD link: '%s'\n", baseURL, err) 226 continue 227 } 228 out = append(out, link) 229 } 230 return out, nil 231 } 232 233 func getAbsHref(anchor *html.Node, baseURL *url.URL) (string, error) { 234 h := GetAttr(anchor, "href") 235 absURL, err := baseURL.Parse(h) 236 if err != nil { 237 return "", fmt.Errorf("bad href (%s): %s", h, err) 238 } 239 return absURL.String(), nil 240 }