github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/bulkscrape/scrape.go (about) 1 package main 2 3 import ( 4 "fmt" 5 "github.com/bcampbell/arts/arts" 6 "github.com/bcampbell/arts/util" 7 "github.com/bcampbell/scrapeomat/store" 8 "io/ioutil" 9 "net/http" 10 "os" 11 "time" 12 ) 13 14 type ScrapeStats struct { 15 Start time.Time 16 End time.Time 17 ErrorCount int 18 FetchCount int 19 StashCount int 20 } 21 22 func buildHTTPClient() *http.Client { 23 // create the http client 24 // use politetripper to avoid hammering servers 25 transport := util.NewPoliteTripper() 26 transport.PerHostDelay = 1 * time.Second 27 return &http.Client{ 28 Transport: transport, 29 } 30 } 31 32 func ScrapeArticles(artURLs []string, db store.Store) error { 33 34 client := buildHTTPClient() 35 36 // reset the stats 37 stats := ScrapeStats{} 38 stats.Start = time.Now() 39 defer func() { 40 stats.End = time.Now() 41 elapsed := stats.End.Sub(stats.Start) 42 defer fmt.Printf("finished in %s (%d new articles, %d errors)\n", elapsed, stats.StashCount, stats.ErrorCount) 43 }() 44 45 newArts, err := db.WhichAreNew(artURLs) 46 if err != nil { 47 return err 48 } 49 50 for _, artURL := range newArts { 51 // grab and stash 52 fmt.Printf("%s\n", artURL) 53 art, err := scrape(client, artURL) 54 if err == nil { 55 stats.FetchCount++ 56 var stashed bool 57 stashed, err = stash(art, db) 58 if err == nil && stashed { 59 stats.StashCount++ 60 } 61 } 62 63 if err != nil { 64 fmt.Fprintf(os.Stderr, "ERR: %s\n", err) 65 stats.ErrorCount++ 66 if !opts.noErrBailout { 67 // bail out if errors get out of hand 68 if stats.ErrorCount > 100+len(newArts)/10 { 69 return fmt.Errorf("too many errors (%d)", stats.ErrorCount) 70 } 71 } 72 } 73 } 74 return nil 75 } 76 77 func scrape(client *http.Client, artURL string) (*store.Article, error) { 78 // FETCH 79 80 //fetchTime := time.Now() 81 req, err := http.NewRequest("GET", artURL, nil) 82 if err != nil { 83 return nil, err 84 } 85 // NOTE: some sites always returns 403 if no Accept header is present. (ft.com) 86 // Seems like a reasonable thing to send anyway... 87 //req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") 88 req.Header.Set("Accept", "*/*") 89 90 // NOTE: Johnson press seems to return 403's if User-Agent is not correct format? 91 // In pre-1.15 golang, default was borked. 92 // see https://github.com/golang/go/issues/9792 93 req.Header.Set("User-Agent", "steno/0.1") 94 95 resp, err := client.Do(req) 96 if err != nil { 97 return nil, err 98 } 99 defer resp.Body.Close() 100 101 // TODO: could archive to .warc file here 102 103 // EXTRACT 104 if resp.StatusCode != 200 { 105 return nil, fmt.Errorf("HTTP error: %s (%s)", resp.Status, artURL) 106 } 107 108 rawHTML, err := ioutil.ReadAll(resp.Body) 109 if err != nil { 110 return nil, err 111 } 112 113 scraped, err := arts.ExtractFromHTML(rawHTML, artURL) 114 if err != nil { 115 return nil, err 116 } 117 118 art := store.ConvertArticle(scraped) 119 return art, nil 120 } 121 122 // stash returns true if the article was added to db. 123 // Returns false if we already had it. 124 func stash(art *store.Article, db store.Store) (bool, error) { 125 // load into db. 126 // check the urls - we might already have it 127 ids, err := db.FindURLs(art.URLs) 128 if err != nil { 129 return false, err 130 } 131 if len(ids) > 1 { 132 return false, fmt.Errorf("resolves to %d articles", len(ids)) 133 } 134 if len(ids) == 1 { 135 fmt.Fprintf(os.Stderr, "SKIP (already in db): %s\n", art.CanonicalURL) 136 return false, nil 137 } 138 _, err = db.Stash(art) 139 if err != nil { 140 return false, err 141 } 142 return true, nil 143 }