github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/backfill/main.go

package main

// hacky little tool to try and grab old articles from a site

import (
	"flag"
	"fmt"
	"github.com/andybalholm/cascadia"
	"github.com/bcampbell/arts/util"
	"golang.org/x/net/html"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"os"
	"sort"
	"strings"
	//	"github.com/bcampbell/scrapeomat/paywall"
	"time"
)

type Options struct {
	dayFrom, dayTo string
	nPages         int
	nStart         int
	//	list           bool // list scrapers then exit
}

func (opts *Options) DayRange() ([]time.Time, error) {
	from, to, err := opts.parseDays()
	if err != nil {
		return nil, err
	}

	if from.IsZero() {
		return nil, fmt.Errorf("missing 'from' day")
	}
	if to.IsZero() {
		return nil, fmt.Errorf("missing 'to' day")
	}

	// make sure we're at start of day
	from = time.Date(from.Year(), from.Month(), from.Day(), 0, 0, 0, 0, time.UTC)

	out := []time.Time{}
	for day := from; !day.After(to); day = day.AddDate(0, 0, 1) {
		out = append(out, day)
	}
	return out, nil
}
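
// Hypothetical usage sketch (not part of the original tool), showing what
// DayRange produces:
//
//	opts := &Options{dayFrom: "2014-09-01", dayTo: "2014-09-03"}
//	days, err := opts.DayRange()
//	// on success, days holds 2014-09-01, 2014-09-02 and 2014-09-03,
//	// each at 00:00 UTC (the range is inclusive at both ends)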

func (opts *Options) parseDays() (time.Time, time.Time, error) {
	z := time.Time{}

	from := z
	to := z
	var err error
	if opts.dayFrom != "" {
		from, err = time.Parse(dayFmt, opts.dayFrom)
		if err != nil {
			return z, z, fmt.Errorf("bad 'from' day (%s)", err)
		}
	}

	if opts.dayTo != "" {
		to, err = time.Parse(dayFmt, opts.dayTo)
		if err != nil {
			return z, z, fmt.Errorf("bad 'to' day (%s)", err)
		}

		if !from.IsZero() && to.Before(from) {
			return z, z, fmt.Errorf("bad date range ('from' is after 'to')")
		}
	}

	return from, to, nil
}

var scrapers = map[string]func(*Options) error{
	"ft":                DoFT,
	"bbc":               DoBBCNews,
	"thetimes":          DoTheTimes,
	"dailystar":         DoDailyStar,
	"telegraph":         DoTelegraph,
	"croydonadvertiser": DoCroydonAdvertiser,
	"viceuk":            DoViceUK,
	"eluniversal":       DoElUniversal,
	"milenio":           DoMilenio,
	"excelsior":         DoExcelsior,
	"jornada":           DoJornada,
	"sdpnoticias":       DoSDPNoticias,
	//"thesun": DoTheSun,
}

func main() {
	flag.Usage = func() {
		sites := []string{}
		for site := range scrapers {
			sites = append(sites, site)
		}
		sort.Strings(sites)

		fmt.Fprintf(os.Stderr, "Usage:\n")
		fmt.Fprintf(os.Stderr, "%s [OPTIONS] %s\n", os.Args[0], strings.Join(sites, "|"))
		fmt.Fprintf(os.Stderr, "Grab older articles from various sites, dumping the URLs to stdout\n")
		flag.PrintDefaults()
	}

	opts := Options{}

	flag.IntVar(&opts.nPages, "n", 0, "max number of search result pages to fetch")
	flag.IntVar(&opts.nStart, "s", 0, "start value (page, whatever)")
	flag.StringVar(&opts.dayFrom, "from", "", "from date (YYYY-MM-DD)")
	flag.StringVar(&opts.dayTo, "to", "", "to date (YYYY-MM-DD)")
	//flag.BoolVar(&opts.list, "l", false, "list available backfill scrapers, then exit")
	flag.Parse()

	if flag.NArg() < 1 {
		fmt.Fprintf(os.Stderr, "ERROR: missing publication\n")
		flag.Usage()
		os.Exit(1)
	}

	site := flag.Arg(0)
	scraper := scrapers[site]
	if scraper == nil {
		fmt.Fprintf(os.Stderr, "ERROR: unknown publication '%s'\n", site)
		os.Exit(1)
	}

	err := scraper(&opts)
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
		os.Exit(1)
	}

	os.Exit(0)
}
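
// Example invocation (a hypothetical sketch; binary name, dates and output
// redirection are illustrative, the flags and publication argument are as
// defined above):
//
//	backfill -from 2014-09-01 -to 2014-09-03 dailystar > urls.txt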

// GetAttr retrieves the value of an attribute on a node.
// Returns an empty string if the attribute doesn't exist.
func GetAttr(n *html.Node, attr string) string {
	for _, a := range n.Attr {
		if a.Key == attr {
			return a.Val
		}
	}
	return ""
}
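
// For instance (hypothetical sketch): given a node n parsed from
// <a href="/story">...</a>, GetAttr(n, "href") returns "/story", while
// GetAttr(n, "class") returns "".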

// DoTheSun handles The Sun and the Scottish Sun.
// Nasty and hacky, and needs lots of manual intervention:
// set high to the total number of articles and manually set up the URL for
// The Sun or the Scottish Sun. Their search links are all ajaxy, so we can't
// just issue a search and autoclick the 'next page' link. Instead we iterate
// through the results 10 at a time using the minimal HTML returned by
// /search/showMoreAction.do.
func DoTheSun(opts *Options) error {
	linkSel := cascadia.MustCompile("li h3 a")

	// need to log in
	jar, err := cookiejar.New(nil)
	if err != nil {
		return err
	}
	client := &http.Client{
		Transport: util.NewPoliteTripper(),
		Jar:       jar,
	}

	//high := 3180
	high := 850
	for offset := 0; offset < high; offset += 10 {
		//u := "http://www.thesun.co.uk/search/showMoreAction.do?pubName=sol&querystring=the&navigators=publication_name:The+Sun&offset=" + fmt.Sprintf("%d", offset) + "&hits=10&sortby=relevance&from=20140828&to=20140917&th=3180"
		u := "http://www.thesun.co.uk/search/showMoreAction.do?pubName=sol&querystring=the&navigators=publication_name:The+Scottish+Sun&offset=" + fmt.Sprintf("%d", offset) + "&hits=10&sortby=date&from=20140828&to=20140917&th=850"

		root, err := fetchAndParse(client, u)
		if err != nil {
			return err
		}
		baseURL, err := url.Parse(u)
		if err != nil {
			return err
		}
		for _, a := range linkSel.MatchAll(root) {
			fmt.Fprintln(os.Stderr, ".")
			href := GetAttr(a, "href")
			absURL, err := baseURL.Parse(href)
			if err != nil {
				fmt.Fprintf(os.Stderr, "skip %s\n", href)
				continue
			}
			fmt.Println(absURL)
		}
	}
	return nil
}

func fetchAndParse(client *http.Client, u string) (*html.Node, error) {
	req, err := http.NewRequest("GET", u, nil)
	if err != nil {
		return nil, err
	}
	// NOTE: FT.com always returns 403 if no Accept header is present.
	// Seems like a reasonable thing to send anyway...
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")

	fmt.Fprintf(os.Stderr, "fetch %s\n", u)

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return nil, fmt.Errorf("HTTP code %d (%s)", resp.StatusCode, u)
	}

	return html.Parse(resp.Body)
}
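
// Hypothetical usage sketch of fetchAndParse (URL and selector are
// illustrative only):
//
//	client := &http.Client{Transport: util.NewPoliteTripper()}
//	root, err := fetchAndParse(client, "http://example.com/archive")
//	if err != nil { /* handle error */ }
//	for _, a := range cascadia.MustCompile("a").MatchAll(root) {
//		fmt.Println(GetAttr(a, "href"))
//	}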

func DoFT(opts *Options) error {
	if opts.dayFrom == "" || opts.dayTo == "" {
		return fmt.Errorf("date range required for FT")
	}

	// FT limits the number of pages or results you can iterate through,
	// so perform a separate search for each day.
	days, err := genDateRange(opts.dayFrom, opts.dayTo)
	if err != nil {
		return fmt.Errorf("bad date range: %s", err)
	}
	for _, day := range days {
		dayFrom := day.Format(dayFmt)
		dayTo := dayFrom

		searchURL := "http://search.ft.com/search?q=&t=all&rpp=100&fa=people%2Corganisations%2Cregions%2Csections%2Ctopics%2Ccategory%2Cbrand&s=-initialPublishDateTime&f=initialPublishDateTime[" + dayFrom + "T00%3A00%3A00%2C" + dayTo + "T23%3A59%3A59]"
		s := &Searcher{
			SearchURL: searchURL,
			Params:    url.Values{
				/*
					"q":   []string{""},    // querystring
					"rpp": []string{"100"}, // results-per-page
				*/
			},
			PageParam:     "p",
			ResultLinkSel: cascadia.MustCompile(".results .result h3 a"),
			//		NoMoreResultsSel: cascadia.MustCompile(".results .result-list .empty"),
			NPages: 8, // should be enough to cover a day!
		}

		// The 'next' link doesn't show up here (but does in firefox).
		// Maybe pretending to be a real browser and sending more headers would help?
		//nextPageSel: cascadia.MustCompile(".pagination .next a")
		// So, for now, just iterate page by page until no more results.

		err := s.Run(os.Stdout)
		if err != nil {
			return err
		}
	}
	return nil
}
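
// Searcher is defined elsewhere in this package. As used in this file, it
// takes a SearchURL plus Params, pages through results either via a PageParam
// query parameter or a NextPageSel 'next' link, collects article links
// matching ResultLinkSel (for up to NPages pages), and Run writes the
// resulting absolute URLs to the given writer. This summary is inferred from
// the call sites here, not from the Searcher implementation itself.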

func DoTheTimes(opts *Options) error {
	// The Times search doesn't do stopwords, so a search for 'a' does the trick nicely ;-)
	s := &Searcher{
		SearchURL:     "http://www.thetimes.co.uk/search?q=a&sort=date_published&sortorder=desc",
		Params:        url.Values{},
		PageParam:     "p",
		ResultLinkSel: cascadia.MustCompile(".SearchResultList h2.Item-headline a"),
		NPages:        opts.nPages,
	}

	return s.Run(os.Stdout)
}

func DoCroydonAdvertiser(opts *Options) error {
	s := &Searcher{
		SearchURL:     "http://www.croydonadvertiser.co.uk/search/search.html?searchType=&searchPhrase=&where=&orderByOption=dateDesc",
		Params:        url.Values{},
		NextPageSel:   cascadia.MustCompile(`.search-results a[rel="next"]`),
		ResultLinkSel: cascadia.MustCompile(".search-results .channel-list-item a"),
		NPages:        opts.nPages,
	}

	return s.Run(os.Stdout)
}

func DoTheCourier(opts *Options) error {
	// No specific date range, but you can get the results for the last month/year/week.
	linkSel := cascadia.MustCompile(".search-page-results-list .article-title a")
	nextPageSel := cascadia.MustCompile(".search-page-pagination a.next")

	client := &http.Client{Transport: util.NewPoliteTripper()}

	// annoying stopwords in place, so do a bunch of generic-as-possible searches...
	terms := []string{"up", "its", "from", "could", "said", "scotland", "england"}

	for _, term := range terms {
		// q=query term, s=sort order, p=time period
		u := "http://www.thecourier.co.uk/search?q=" + term + "&d=&s=mostRecent&a=&p=pastMonth"
		for {
			root, err := fetchAndParse(client, u)
			if err != nil {
				return err
			}
			baseURL, err := url.Parse(u)
			if err != nil {
				return err
			}
			cnt := 0
			for _, a := range linkSel.MatchAll(root) {
				href := GetAttr(a, "href")
				absURL, err := baseURL.Parse(href)
				if err != nil {
					fmt.Fprintf(os.Stderr, "skip %s\n", href)
					continue
				}
				cnt++
				fmt.Println(absURL)
			}

			n := nextPageSel.MatchFirst(root)
			if n == nil {
				fmt.Fprintf(os.Stderr, "fin.\n")
				break
			}

			absNext, err := baseURL.Parse(GetAttr(n, "href"))
			if err != nil {
				return err
			}
			u = absNext.String()
			//	fmt.Fprintf(os.Stderr, "NEXT %s\n", u)
		}
	}
	return nil
}

const dayFmt = "2006-01-02"

// genDateRange returns one time.Time per day from dayFrom to dayTo inclusive
// ('to' defaults to today if empty).
// TODO: kill this (Options.DayRange covers most of it)
func genDateRange(dayFrom, dayTo string) ([]time.Time, error) {
	var from, to time.Time
	if dayFrom == "" {
		return nil, fmt.Errorf("'from' day required")
	}
	from, err := time.Parse(dayFmt, dayFrom)
	if err != nil {
		return nil, err
	}

	if dayTo == "" {
		now := time.Now()
		to = time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC)
	} else {
		to, err = time.Parse(dayFmt, dayTo)
		if err != nil {
			return nil, err
		}
	}

	if to.Before(from) {
		return nil, fmt.Errorf("'to' day is before 'from' day")
	}

	out := []time.Time{}
	end := to.AddDate(0, 0, 1)
	for day := from; day.Before(end); day = day.AddDate(0, 0, 1) {
		out = append(out, day)
	}
	return out, nil
}

// The daily star has handy archive pages, one per day:
// http://www.dailystar.co.uk/sitearchive/YYYY/M/D
func DoDailyStar(opts *Options) error {
	days, err := genDateRange(opts.dayFrom, opts.dayTo)
	if err != nil {
		return err
	}
	client := &http.Client{
		Transport: util.NewPoliteTripper(),
	}
	linkSel := cascadia.MustCompile(".sitemap li a")
	for _, day := range days {
		page := fmt.Sprintf("http://www.dailystar.co.uk/sitearchive/%d/%d/%d",
			day.Year(), day.Month(), day.Day())
		root, err := fetchAndParse(client, page)
		if err != nil {
			return fmt.Errorf("%s failed: %s", page, err)
		}
		links, err := grabLinks(root, linkSel, page)
		if err != nil {
			return fmt.Errorf("%s error: %s", page, err)
		}
		for _, l := range links {
			fmt.Println(l)
		}
	}

	return nil
}

// grabLinks returns the absolute URLs of all links under root matched by
// linkSel, resolved against baseURL. Unparseable hrefs are reported to
// stderr and skipped.
func grabLinks(root *html.Node, linkSel cascadia.Selector, baseURL string) ([]string, error) {
	u, err := url.Parse(baseURL)
	if err != nil {
		return nil, err
	}

	out := []string{}
	for _, a := range linkSel.MatchAll(root) {
		link, err := href(a, u)
		if err != nil {
			fmt.Fprintf(os.Stderr, "%s BAD link: '%s'\n", baseURL, err)
			continue
		}
		out = append(out, link)
	}
	return out, nil
}

// href returns the absolute URL of an anchor's href attribute, resolved
// against baseURL.
func href(anchor *html.Node, baseURL *url.URL) (string, error) {
	h := GetAttr(anchor, "href")
	absURL, err := baseURL.Parse(h)
	if err != nil {
		return "", fmt.Errorf("bad href (%s): %s", h, err)
	}
	return absURL.String(), nil
}
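
// Hypothetical sketch tying fetchAndParse and grabLinks together (the URL
// and selector are illustrative only):
//
//	page := "http://example.com/sitearchive/2014/9/1"
//	root, err := fetchAndParse(client, page)
//	if err != nil { /* handle error */ }
//	links, err := grabLinks(root, cascadia.MustCompile(".sitemap li a"), page)
//	// on success, links holds the absolute article URLs from that page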

func DoBBCNews(opts *Options) error {
	// BBC has a search facility, but doesn't seem to have an option to
	// sort by date... so it's a matter of stepping through huge numbers of results
	// in the hope that we pick up what we need. Sigh.

	// TODO: could discard articles outside desired date range... but probably not
	// worth the effort
	/*
		dFrom, err := time.Parse("2006-01-02", dayFrom)
		if err != nil {
			return err
		}
		dTo, err := time.Parse("2006-01-02", dayTo)
		if err != nil {
			return err
		}
		dTo.AddDate(0, 0, 1)
		discardCnt := 0
	*/
	artSel := cascadia.MustCompile(`article`)
	linkSel := cascadia.MustCompile(`h1 a`)
	dateSel := cascadia.MustCompile(`time`)

	client := &http.Client{Transport: util.NewPoliteTripper()}

	const MAXPAGE = 1000
	for pageNum := 1; pageNum <= MAXPAGE; pageNum++ {
		// first page:       http://www.bbc.co.uk/search?q=the&sa_f=search-serp&filter=news
		// subsequent pages: http://www.bbc.co.uk/search/more?page=2&q=the&sa_f=search-serp&filter=news
		page := fmt.Sprintf(`http://www.bbc.co.uk/search/more?page=%d&q=the&sa_f=search-serp&filter=news`, pageNum)

		baseURL, err := url.Parse(page)
		if err != nil {
			return err
		}

		root, err := fetchAndParse(client, page)
		if err != nil {
			return fmt.Errorf("%s failed: %s", page, err)
		}

		for _, art := range artSel.MatchAll(root) {
			d := dateSel.MatchFirst(art)
			a := linkSel.MatchFirst(art)

			if d == nil {
				return fmt.Errorf("%s: missing date", page)
			}
			if a == nil {
				return fmt.Errorf("%s: missing link", page)
			}

			artURL, err := href(a, baseURL)
			if err != nil {
				return fmt.Errorf("%s error: %s", page, err)
			}

			/*
				// TODO: date range filtering here...
				dt, err := time.Parse(time.RFC3339, GetAttr(d, "datetime"))
				if err != nil {
					return err
				}

				//if (dt.Equal(dFrom)||dt.After(dFrom)) && dt.Before(dTo) {...}
			*/

			fmt.Println(artURL)
		}
		//html.Render(os.Stdout, root)
		//fmt.Printf("\n")
	}
	return nil
}