github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/bulkscrape/scrape.go (about)

     1  package main
     2  
     3  import (
     4  	"fmt"
     5  	"github.com/bcampbell/arts/arts"
     6  	"github.com/bcampbell/arts/util"
     7  	"github.com/bcampbell/scrapeomat/store"
     8  	"io/ioutil"
     9  	"net/http"
    10  	"os"
    11  	"time"
    12  )
    13  
    14  type ScrapeStats struct {
    15  	Start      time.Time
    16  	End        time.Time
    17  	ErrorCount int
    18  	FetchCount int
    19  	StashCount int
    20  }
    21  
    22  func buildHTTPClient() *http.Client {
    23  	// create the http client
    24  	// use politetripper to avoid hammering servers
    25  	transport := util.NewPoliteTripper()
    26  	transport.PerHostDelay = 1 * time.Second
    27  	return &http.Client{
    28  		Transport: transport,
    29  	}
    30  }
    31  
    32  func ScrapeArticles(artURLs []string, db store.Store) error {
    33  
    34  	client := buildHTTPClient()
    35  
    36  	// reset the stats
    37  	stats := ScrapeStats{}
    38  	stats.Start = time.Now()
    39  	defer func() {
    40  		stats.End = time.Now()
    41  		elapsed := stats.End.Sub(stats.Start)
    42  		defer fmt.Printf("finished in %s (%d new articles, %d errors)\n", elapsed, stats.StashCount, stats.ErrorCount)
    43  	}()
    44  
    45  	newArts, err := db.WhichAreNew(artURLs)
    46  	if err != nil {
    47  		return err
    48  	}
    49  
    50  	for _, artURL := range newArts {
    51  		// grab and stash
    52  		fmt.Printf("%s\n", artURL)
    53  		art, err := scrape(client, artURL)
    54  		if err == nil {
    55  			stats.FetchCount++
    56  			var stashed bool
    57  			stashed, err = stash(art, db)
    58  			if err == nil && stashed {
    59  				stats.StashCount++
    60  			}
    61  		}
    62  
    63  		if err != nil {
    64  			fmt.Fprintf(os.Stderr, "ERR: %s\n", err)
    65  			stats.ErrorCount++
    66  			if !opts.noErrBailout {
    67  				// bail out if errors get out of hand
    68  				if stats.ErrorCount > 100+len(newArts)/10 {
    69  					return fmt.Errorf("too many errors (%d)", stats.ErrorCount)
    70  				}
    71  			}
    72  		}
    73  	}
    74  	return nil
    75  }
    76  
    77  func scrape(client *http.Client, artURL string) (*store.Article, error) {
    78  	// FETCH
    79  
    80  	//fetchTime := time.Now()
    81  	req, err := http.NewRequest("GET", artURL, nil)
    82  	if err != nil {
    83  		return nil, err
    84  	}
    85  	// NOTE: some sites always returns 403 if no Accept header is present. (ft.com)
    86  	// Seems like a reasonable thing to send anyway...
    87  	//req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
    88  	req.Header.Set("Accept", "*/*")
    89  
    90  	// NOTE: Johnson press seems to return 403's if User-Agent is not correct format?
    91  	// In pre-1.15 golang, default was borked.
    92  	// see https://github.com/golang/go/issues/9792
    93  	req.Header.Set("User-Agent", "steno/0.1")
    94  
    95  	resp, err := client.Do(req)
    96  	if err != nil {
    97  		return nil, err
    98  	}
    99  	defer resp.Body.Close()
   100  
   101  	// TODO: could archive to .warc file here
   102  
   103  	// EXTRACT
   104  	if resp.StatusCode != 200 {
   105  		return nil, fmt.Errorf("HTTP error: %s (%s)", resp.Status, artURL)
   106  	}
   107  
   108  	rawHTML, err := ioutil.ReadAll(resp.Body)
   109  	if err != nil {
   110  		return nil, err
   111  	}
   112  
   113  	scraped, err := arts.ExtractFromHTML(rawHTML, artURL)
   114  	if err != nil {
   115  		return nil, err
   116  	}
   117  
   118  	art := store.ConvertArticle(scraped)
   119  	return art, nil
   120  }
   121  
   122  // stash returns true if the article was added to db.
   123  // Returns false if we already had it.
   124  func stash(art *store.Article, db store.Store) (bool, error) {
   125  	// load into db.
   126  	// check the urls - we might already have it
   127  	ids, err := db.FindURLs(art.URLs)
   128  	if err != nil {
   129  		return false, err
   130  	}
   131  	if len(ids) > 1 {
   132  		return false, fmt.Errorf("resolves to %d articles", len(ids))
   133  	}
   134  	if len(ids) == 1 {
   135  		fmt.Fprintf(os.Stderr, "SKIP (already in db): %s\n", art.CanonicalURL)
   136  		return false, nil
   137  	}
   138  	_, err = db.Stash(art)
   139  	if err != nil {
   140  		return false, err
   141  	}
   142  	return true, nil
   143  }