github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/waybackwalker/main.go (about)

     1  package main
     2  
     3  // TODO: add some error tolerance (eg wayback machine produces a server timeout 408 sometimes)
     4  // TODO: filter out links to other domains
     5  
     6  import (
     7  	"flag"
     8  	"fmt"
     9  	"github.com/andybalholm/cascadia"
    10  	"github.com/bcampbell/arts/util"
    11  	"golang.org/x/net/html"
    12  	"net/http"
    13  	"net/url"
    14  	"os"
    15  	"time"
    16  )
    17  
// Options holds the command-line configuration: the inclusive date
// range (as "YYYY-MM-DD" strings) over which wayback snapshots are
// requested. Both fields are filled in by flag.StringVar in main.
type Options struct {
	dayFrom, dayTo string
}
    21  
    22  func (opts *Options) DayRange() ([]time.Time, error) {
    23  	from, to, err := opts.parseDays()
    24  	if err != nil {
    25  		return nil, err
    26  	}
    27  
    28  	// make sure we're at start of day
    29  	from = time.Date(from.Year(), from.Month(), from.Day(), 0, 0, 0, 0, time.UTC)
    30  
    31  	out := []time.Time{}
    32  	for day := from; !day.After(to); day = day.AddDate(0, 0, 1) {
    33  		out = append(out, day)
    34  	}
    35  	return out, nil
    36  }
    37  
    38  func (opts *Options) parseDays() (time.Time, time.Time, error) {
    39  
    40  	const dayFmt = "2006-01-02"
    41  	z := time.Time{}
    42  
    43  	var from, to time.Time
    44  	var err error
    45  	if opts.dayFrom == "" {
    46  		return z, z, fmt.Errorf("'from' day required")
    47  	}
    48  	from, err = time.Parse(dayFmt, opts.dayFrom)
    49  	if err != nil {
    50  		return z, z, fmt.Errorf("bad 'from' day (%s)", err)
    51  	}
    52  
    53  	if opts.dayTo == "" {
    54  		return z, z, fmt.Errorf("'to' day required")
    55  	}
    56  	to, err = time.Parse(dayFmt, opts.dayTo)
    57  	if err != nil {
    58  		return z, z, fmt.Errorf("bad 'to' day (%s)", err)
    59  	}
    60  
    61  	if to.Before(from) {
    62  		return z, z, fmt.Errorf("bad date range ('from' is after 'to')")
    63  	}
    64  
    65  	return from, to, nil
    66  }
    67  
    68  func main() {
    69  	flag.Usage = func() {
    70  
    71  		fmt.Fprintf(os.Stderr, "Usage:\n")
    72  		fmt.Fprintf(os.Stderr, "%s [OPTIONS] URL(s)...\n", os.Args[0])
    73  		fmt.Fprintf(os.Stderr, `
    74  Grabs page snapshots from wayback machine for URLs over the given time
    75  period, scans them for links, and dumps them out to stdout.
    76  
    77  
    78  Input URLs can be absolute or relative - relative links will be
    79  considered relative to the previous URL in the list.
    80  eg:
    81     http://www.telegraph.co.uk/ /news/ /sport/ /business/
    82  is just fine.
    83  
    84  options:
    85  `)
    86  		flag.PrintDefaults()
    87  	}
    88  
    89  	opts := Options{}
    90  
    91  	flag.StringVar(&opts.dayFrom, "from", "", "from date")
    92  	flag.StringVar(&opts.dayTo, "to", "", "to date")
    93  	flag.Parse()
    94  
    95  	var err error
    96  	if flag.NArg() < 1 {
    97  		fmt.Fprintf(os.Stderr, "ERROR: missing URL(s)\n")
    98  		flag.Usage()
    99  		os.Exit(1)
   100  	}
   101  
   102  	err = doit(&opts, flag.Args())
   103  	if err != nil {
   104  		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
   105  		os.Exit(1)
   106  	}
   107  
   108  	os.Exit(0)
   109  }
   110  
   111  // expand a list of URLs, using the previous URL in the list as the context for the next
   112  func expandURLs(origURLs []string) ([]string, error) {
   113  	prev := &url.URL{}
   114  	cooked := make([]string, len(origURLs))
   115  	for i, origURL := range origURLs {
   116  		parsed, err := prev.Parse(origURL)
   117  		if err != nil {
   118  			return nil, fmt.Errorf("bad URL '%s'", origURL)
   119  		}
   120  
   121  		if !parsed.IsAbs() {
   122  			return nil, fmt.Errorf("URL not absolute (and can't be guessed from previous) '%s'", origURL)
   123  		}
   124  		prev = parsed
   125  		cooked[i] = parsed.String()
   126  	}
   127  	return cooked, nil
   128  }
   129  
   130  func doit(opts *Options, urls []string) error {
   131  	urls, err := expandURLs(urls)
   132  	if err != nil {
   133  		return err
   134  	}
   135  
   136  	days, err := opts.DayRange()
   137  	if err != nil {
   138  		return err
   139  	}
   140  
   141  	client := &http.Client{
   142  		Transport: util.NewPoliteTripper(),
   143  	}
   144  
   145  	for _, day := range days {
   146  		timeStamp := day.Format("20060102")
   147  		for _, u := range urls {
   148  			err := doPage(client, u, timeStamp)
   149  			if err != nil {
   150  				return err
   151  			}
   152  		}
   153  
   154  	}
   155  	return nil
   156  }
   157  
   158  func doPage(client *http.Client, u string, when string) error {
   159  	linkSel := cascadia.MustCompile("a")
   160  
   161  	// the "id_" suffix asks for the original html. Without this wayback machine
   162  	// will rewrite all the links to go through itself for easy browsing.
   163  	// This will redirect (302) to the nearest memento to our requested timestamp.
   164  	page := fmt.Sprintf("http://web.archive.org/web/%sid_/%s", when, u)
   165  	root, err := fetchAndParse(client, page)
   166  	if err != nil {
   167  		return fmt.Errorf("%s failed: %s\n", page, err)
   168  	}
   169  	links, err := grabLinks(root, linkSel, u)
   170  	if err != nil {
   171  		return fmt.Errorf("%s error: %s\n", page, err)
   172  	}
   173  	for _, l := range links {
   174  		fmt.Println(l)
   175  	}
   176  
   177  	return nil
   178  }
   179  
   180  func fetchAndParse(client *http.Client, u string) (*html.Node, error) {
   181  	req, err := http.NewRequest("GET", u, nil)
   182  	if err != nil {
   183  		return nil, err
   184  	}
   185  	// NOTE: FT.com always returns 403 if no Accept header is present.
   186  	// Seems like a reasonable thing to send anyway...
   187  	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
   188  
   189  	fmt.Fprintf(os.Stderr, "fetch %s\n", u)
   190  
   191  	resp, err := client.Do(req)
   192  	if err != nil {
   193  		return nil, err
   194  	}
   195  	defer resp.Body.Close()
   196  	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
   197  		err = fmt.Errorf("HTTP code %d (%s)", resp.StatusCode, u)
   198  		return nil, err
   199  	}
   200  
   201  	return html.Parse(resp.Body)
   202  }
   203  
   204  // GetAttr retrieved the value of an attribute on a node.
   205  // Returns empty string if attribute doesn't exist.
   206  func GetAttr(n *html.Node, attr string) string {
   207  	for _, a := range n.Attr {
   208  		if a.Key == attr {
   209  			return a.Val
   210  		}
   211  	}
   212  	return ""
   213  }
   214  
   215  func grabLinks(root *html.Node, linkSel cascadia.Selector, baseURL string) ([]string, error) {
   216  	u, err := url.Parse(baseURL)
   217  	if err != nil {
   218  		return nil, err
   219  	}
   220  
   221  	out := []string{}
   222  	for _, a := range linkSel.MatchAll(root) {
   223  		link, err := getAbsHref(a, u)
   224  		if err != nil {
   225  			fmt.Fprintf(os.Stderr, "%s BAD link: '%s'\n", baseURL, err)
   226  			continue
   227  		}
   228  		out = append(out, link)
   229  	}
   230  	return out, nil
   231  }
   232  
   233  func getAbsHref(anchor *html.Node, baseURL *url.URL) (string, error) {
   234  	h := GetAttr(anchor, "href")
   235  	absURL, err := baseURL.Parse(h)
   236  	if err != nil {
   237  		return "", fmt.Errorf("bad href (%s): %s", h, err)
   238  	}
   239  	return absURL.String(), nil
   240  }