github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/scraper.go

package main

import (
	"errors"
	"fmt"
	"github.com/bcampbell/arts/arts"
	"github.com/bcampbell/arts/util"
	"github.com/bcampbell/biscuit"
	"github.com/bcampbell/scrapeomat/arc"
	"github.com/bcampbell/scrapeomat/discover"
	"github.com/bcampbell/scrapeomat/paywall"
	"github.com/bcampbell/scrapeomat/store"
	"io/ioutil"
	"log"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"os"
	"time"
)

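// ScrapeStats holds timing and counters for a single scraper run.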
type ScrapeStats struct {
	Start      time.Time
	End        time.Time
	ErrorCount int
	FetchCount int

	StashCount int
}

// TODO: factor out a scraper interface, to handle both generic and custom scrapers
// Name() string
// Discover(c *http.Client) ([]string, error)
// DoRun(db *store.Store, c *http.Client) error
// DoRunFromList(arts []string, db *store.Store, c *http.Client) error

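// Scraper handles scraping for a single site: it discovers article URLs,
// fetches them and stashes the extracted articles in the store.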
type Scraper struct {
	Name       string
	Conf       *ScraperConf
	discoverer *discover.Discoverer
	errorLog   *log.Logger
	infoLog    *log.Logger
	archiveDir string
	stats      ScrapeStats
	runPeriod  time.Duration
	client     *http.Client
	quit       chan struct{}
}

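// ScraperConf is the per-site configuration: the discovery settings plus
// cookie handling and an optional publication code.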
type ScraperConf struct {
	discover.DiscovererDef
	Cookies    bool
	CookieFile string
	PubCode    string
}

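// ErrQuit is returned by a run when a quit has been requested (see Stop).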
var ErrQuit = errors.New("quit requested")

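// NewScraper creates a Scraper from its config, setting up logging, the
// discoverer and an HTTP client (with a cookie jar if cookies are enabled).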
func NewScraper(name string, conf *ScraperConf, verbosity int, archiveDir string) (*Scraper, error) {
	scraper := Scraper{
		Name:       name,
		Conf:       conf,
		archiveDir: archiveDir,
		runPeriod:  3 * time.Hour,
		quit:       make(chan struct{}, 1),
	}

	scraper.errorLog = log.New(os.Stderr, "ERR "+name+": ", 0)
	if verbosity > 0 {
		scraper.infoLog = log.New(os.Stderr, "INF "+name+": ", 0)
	} else {
		scraper.infoLog = log.New(ioutil.Discard, "", 0)
	}

	// set up discoverer
	disc, err := discover.NewDiscoverer(conf.DiscovererDef)
	if err != nil {
		return nil, err
	}
	disc.ErrorLog = scraper.errorLog
	if verbosity > 1 {
		disc.InfoLog = scraper.infoLog
	}
	scraper.discoverer = disc

	// create the http client
	// use politetripper to avoid hammering servers
	var c *http.Client
	transport := util.NewPoliteTripper()
	transport.PerHostDelay = 1 * time.Second

	if conf.Cookies || (conf.CookieFile != "") {
		jar, err := cookiejar.New(nil)
		if err != nil {
			return nil, err
		}
		// If CookieFile set, load cookies here
		if conf.CookieFile != "" {

			cookieFile, err := os.Open(conf.CookieFile)
			if err != nil {
				return nil, err
			}
			defer cookieFile.Close()
			cookies, err := biscuit.ReadCookies(cookieFile)
			if err != nil {
				return nil, err
			}
			// TODO: use another cookie jar that lets us bulk-load without
			// filtering by URL (SetCookies() kind of assumes you're handling a
			// http response and want to filter dodgy cookies)
			host, err := url.Parse(conf.URL)
			if err != nil {
				return nil, err
			}
			jar.SetCookies(host, cookies)
		}
		c = &http.Client{
			Transport: transport,
			Jar:       jar,
		}

	} else {
		c = &http.Client{
			Transport: transport,
		}
	}
	scraper.client = c

	return &scraper, nil
}

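// Login performs the site's paywall login, if one is registered for this
// scraper (it's a no-op otherwise).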
func (scraper *Scraper) Login() error {
	login := paywall.GetLogin(scraper.Name)
	if login != nil {
		scraper.infoLog.Printf("Logging in\n")
		err := login(scraper.client)
		if err != nil {
			return fmt.Errorf("login failed (%s)", err)
		}
	}
	return nil
}

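// Discover runs the discoverer to find candidate article URLs on the site,
// returning them as strings.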
func (scraper *Scraper) Discover() ([]string, error) {
	disc := scraper.discoverer

	artLinks, err := disc.Run(scraper.client, scraper.quit)
	if err == discover.ErrQuit {
		return nil, ErrQuit
	}
	if err != nil {
		return nil, err
	}

	foundArts := make([]string, 0, len(artLinks))
	for l := range artLinks {
		foundArts = append(foundArts, l.String())
	}
	return foundArts, nil
}

// Start runs the scraper, then re-runs it at regular intervals until a quit
// is requested via Stop.
func (scraper *Scraper) Start(db store.Store) {
	for {
		lastRun := time.Now()
		err := scraper.DoRun(db)
		if err == ErrQuit {
			scraper.infoLog.Printf("Quit requested!\n")
			return
		}
		if err != nil {
			scraper.errorLog.Printf("run aborted: %s", err)
		}

		nextRun := lastRun.Add(scraper.runPeriod)
		delay := nextRun.Sub(time.Now())
		scraper.infoLog.Printf("next run at %s (sleeping for %s)\n", nextRun.Format(time.RFC3339), delay)
		// wait for next run, or a quit request
		select {
		case <-scraper.quit:
			scraper.infoLog.Printf("Quit requested!\n")
			return
		case <-time.After(delay):
			scraper.infoLog.Printf("Wakeup!\n")
		}
	}
}

// Stop asks the scraper to stop at the next opportunity.
func (scraper *Scraper) Stop() {
	scraper.quit <- struct{}{}
}

// DoRun performs a single scraper run: log in (if required), discover
// article URLs, then fetch and stash any that aren't already in the store.
func (scraper *Scraper) DoRun(db store.Store) error {

	scraper.infoLog.Printf("start run\n")
	// reset the stats
	scraper.stats = ScrapeStats{}
	scraper.stats.Start = time.Now()
	defer func() {
		stats := &scraper.stats
		stats.End = time.Now()
		elapsed := stats.End.Sub(stats.Start)
		scraper.infoLog.Printf("run finished in %s (%d new articles, %d errors)\n", elapsed, stats.StashCount, stats.ErrorCount)
	}()

	err := scraper.Login()
	if err != nil {
		return err
	}

	foundArts, err := scraper.Discover()
	if err != nil {
		return err
	}

	newArts, err := db.WhichAreNew(foundArts)
	if err != nil {
		return fmt.Errorf("WhichAreNew() failed: %s", err)
	}

	stats := scraper.discoverer.Stats
	scraper.infoLog.Printf("found %d articles, %d new (%d pages fetched, %d errors)\n",
		len(foundArts), len(newArts), stats.FetchCount, stats.ErrorCount)

	return scraper.FetchAndStash(newArts, db, false)
}

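// uniq removes duplicate strings from a slice (order is not preserved).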
func uniq(in []string) []string {
	foo := map[string]struct{}{}
	for _, s := range in {
		foo[s] = struct{}{}
	}
	out := make([]string, 0, len(foo))
	for s := range foo {
		out = append(out, s)
	}
	return out
}

// DoRunFromList performs a single scraper run using the given list of
// article URLs instead of invoking discovery.
func (scraper *Scraper) DoRunFromList(arts []string, db store.Store, updateMode bool) error {

	scraper.infoLog.Printf("start run from list\n")
	// reset the stats
	scraper.stats = ScrapeStats{}
	scraper.stats.Start = time.Now()
	defer func() {
		stats := &scraper.stats
		stats.End = time.Now()
		elapsed := stats.End.Sub(stats.Start)
		scraper.infoLog.Printf("finished in %s (%d new articles, %d errors)\n", elapsed, stats.StashCount, stats.ErrorCount)
	}()

	// use base url from the discovery config
	baseURL := scraper.discoverer.StartURL

	// process/reject urls using site rules
	cookedArts := []string{}
	rejectCnt := 0
	for _, artURL := range arts {
		cooked, err := scraper.discoverer.CookArticleURL(&baseURL, artURL)
		if err != nil {
			scraper.infoLog.Printf("Reject %s (%s)\n", artURL, err)
			rejectCnt++
			continue
		}
		cookedArts = append(cookedArts, cooked.String())
	}

	// remove any dupes
	cookedArts = uniq(cookedArts)

	var err error
	var newArts []string
	if !updateMode {
		newArts, err = db.WhichAreNew(cookedArts)
		if err != nil {
			return fmt.Errorf("WhichAreNew() failed: %s", err)
		}
	} else {
		// all of 'em
		newArts = cookedArts
	}

	scraper.infoLog.Printf("%d articles, %d rejected\n",
		len(newArts), rejectCnt)

	err = scraper.Login()
	if err != nil {
		return err
	}

	return scraper.FetchAndStash(newArts, db, updateMode)
}

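// checkQuit reports whether a quit has been requested, without blocking.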
func (scraper *Scraper) checkQuit() bool {
	select {
	case <-scraper.quit:
		return true
	default:
		return false
	}
}

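// FetchAndStash scrapes each of the given article URLs and stashes the
// results in db. Articles already in the store are skipped unless updateMode
// is set, and the run is aborted if too many errors accumulate.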
func (scraper *Scraper) FetchAndStash(newArts []string, db store.Store, updateMode bool) error {
	//scraper.infoLog.Printf("Start scraping\n")

	// fetch and extract 'em
	for _, artURL := range newArts {
		if scraper.checkQuit() {
			return ErrQuit
		}

		//		scraper.infoLog.Printf("fetch/scrape %s", artURL)
		art, err := scraper.ScrapeArt(artURL)
		if err != nil {
			scraper.errorLog.Printf("%s\n", err)
			scraper.stats.ErrorCount += 1
			if scraper.stats.ErrorCount > 100+len(newArts)/10 {
				return fmt.Errorf("too many errors (%d)", scraper.stats.ErrorCount)
			}
			continue
		}

		// TODO: wrap in transaction...
		// check the urls - we might already have it
		var ids []int
		ids, err = db.FindURLs(art.URLs)
		if err == nil {
			if len(ids) == 1 {
				art.ID = ids[0]
			}
			if len(ids) > 1 {
				err = fmt.Errorf("resolves to %d articles", len(ids))
			}
		}

		if err == nil {
			if art.ID != 0 && !updateMode {
				scraper.errorLog.Printf("already got %s (id %d)\n", artURL, art.ID)
				// TODO: add missing URLs!!!
				continue
			}
			_, err = db.Stash(art)
		}
		if err != nil {
			scraper.errorLog.Printf("stash failure on: %s (on %s)\n", err, artURL)
			scraper.stats.ErrorCount += 1
			if scraper.stats.ErrorCount > 100+len(newArts)/10 {
				return fmt.Errorf("too many errors (%d)", scraper.stats.ErrorCount)
			}
			continue
		}
		scraper.stats.StashCount += 1
		scraper.infoLog.Printf("scraped %s (%d chars)\n", artURL, len(art.Content))
	}
	return nil
}

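// ScrapeArt fetches a single article, archives the raw HTTP response under
// the archive dir, and runs extraction to produce a store.Article.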
func (scraper *Scraper) ScrapeArt(artURL string) (*store.Article, error) {
	// FETCH
	fetchTime := time.Now()
	req, err := http.NewRequest("GET", artURL, nil)
	if err != nil {
		return nil, err
	}
	// NOTE: FT.com always returns 403 if no Accept header is present.
	// Seems like a reasonable thing to send anyway...
	//req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	req.Header.Set("Accept", "*/*")
	if scraper.Conf.UserAgent != "" {
		req.Header.Set("User-Agent", scraper.Conf.UserAgent)
	}

	// other possible headers we might want to fiddle with:
	//req.Header.Set("User-Agent", `Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0`)
	//req.Header.Set("Referrer", "http://...")
	//req.Header.Set("Accept-Language", "en-US,en;q=0.5")

	resp, err := scraper.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// ARCHIVE
	err = arc.ArchiveResponse(scraper.archiveDir, resp, artURL, fetchTime)
	if err != nil {
		return nil, err
	}

	// EXTRACT
	if resp.StatusCode != 200 {
		return nil, fmt.Errorf("HTTP error: %s (%s)", resp.Status, artURL)
	}

	rawHTML, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	scraped, err := arts.ExtractFromHTML(rawHTML, artURL)
	if err != nil {
		return nil, err
	}

	art := store.ConvertArticle(scraped)

	if scraper.Conf.PubCode != "" {
		art.Publication.Code = scraper.Conf.PubCode
	} else {
		art.Publication.Code = scraper.Name
	}
	// TODO: set publication code here!
	return art, nil
}