github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/scraper.go

package main

import (
	"errors"
	"fmt"
	"github.com/bcampbell/arts/arts"
	"github.com/bcampbell/arts/util"
	"github.com/bcampbell/biscuit"
	"github.com/bcampbell/scrapeomat/arc"
	"github.com/bcampbell/scrapeomat/discover"
	"github.com/bcampbell/scrapeomat/paywall"
	"github.com/bcampbell/scrapeomat/store"
	"io/ioutil"
	"log"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"os"
	"time"
)

type ScrapeStats struct {
	Start      time.Time
	End        time.Time
	ErrorCount int
	FetchCount int

	StashCount int
}

// TODO: factor out a scraper interface, to handle both generic and custom scrapers
// Name() string
// Discover(c *http.Client) ([]string, error)
// DoRun(db *store.Store, c *http.Client) error
// DoRunFromList(arts []string, db *store.Store, c *http.Client) error
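// One possible shape for the interface the TODO above describes — a sketch
// only. The name ArticleScraper is illustrative and the signatures simply
// mirror the TODO; nothing in this package declares or implements it today,
// and the existing *Scraper methods differ slightly (they take store.Store
// by value and reuse the scraper's own client).
type ArticleScraper interface {
	Name() string
	Discover(c *http.Client) ([]string, error)
	DoRun(db *store.Store, c *http.Client) error
	DoRunFromList(arts []string, db *store.Store, c *http.Client) error
}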
type Scraper struct {
	Name       string
	Conf       *ScraperConf
	discoverer *discover.Discoverer
	errorLog   *log.Logger
	infoLog    *log.Logger
	archiveDir string
	stats      ScrapeStats
	runPeriod  time.Duration
	client     *http.Client
	quit       chan struct{}
}

type ScraperConf struct {
	discover.DiscovererDef
	Cookies    bool
	CookieFile string
	PubCode    string
}

var ErrQuit = errors.New("quit requested")

func NewScraper(name string, conf *ScraperConf, verbosity int, archiveDir string) (*Scraper, error) {
	scraper := Scraper{
		Name:       name,
		Conf:       conf,
		archiveDir: archiveDir,
		runPeriod:  3 * time.Hour,
		quit:       make(chan struct{}, 1),
	}

	scraper.errorLog = log.New(os.Stderr, "ERR "+name+": ", 0)
	if verbosity > 0 {
		scraper.infoLog = log.New(os.Stderr, "INF "+name+": ", 0)
	} else {
		scraper.infoLog = log.New(ioutil.Discard, "", 0)
	}

	// set up discoverer
	disc, err := discover.NewDiscoverer(conf.DiscovererDef)
	if err != nil {
		return nil, err
	}
	disc.ErrorLog = scraper.errorLog
	if verbosity > 1 {
		disc.InfoLog = scraper.infoLog
	}
	scraper.discoverer = disc

	// create the http client
	// use politetripper to avoid hammering servers
	var c *http.Client
	transport := util.NewPoliteTripper()
	transport.PerHostDelay = 1 * time.Second

	if conf.Cookies || (conf.CookieFile != "") {
		jar, err := cookiejar.New(nil)
		if err != nil {
			return nil, err
		}
		// If CookieFile set, load cookies here
		if conf.CookieFile != "" {

			cookieFile, err := os.Open(conf.CookieFile)
			if err != nil {
				return nil, err
			}
			defer cookieFile.Close()
			cookies, err := biscuit.ReadCookies(cookieFile)
			if err != nil {
				return nil, err
			}
			// TODO: use another cookie jar that lets us bulk-load without
			// filtering by URL (SetCookies() kind of assumes you're handling a
			// http response and want to filter dodgy cookies)
			host, err := url.Parse(conf.URL)
			if err != nil {
				return nil, err
			}
			jar.SetCookies(host, cookies)
		}
		c = &http.Client{
			Transport: transport,
			Jar:       jar,
		}

	} else {
		c = &http.Client{
			Transport: transport,
		}
	}
	scraper.client = c

	return &scraper, nil
}
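// The TODO inside NewScraper mentions wanting a jar that can be bulk-loaded
// without SetCookies filtering by URL. A minimal sketch of that idea follows.
// bulkJar is a hypothetical name, not something the package provides: it hands
// every loaded cookie to every host and ignores domain, path and expiry
// matching, so it is only sensible when a scraper talks to a single site.
type bulkJar struct {
	cookies []*http.Cookie
}

// SetCookies stores the cookies regardless of which URL they are associated with.
func (j *bulkJar) SetCookies(u *url.URL, cookies []*http.Cookie) {
	j.cookies = append(j.cookies, cookies...)
}

// Cookies returns every stored cookie, whatever the request URL.
func (j *bulkJar) Cookies(u *url.URL) []*http.Cookie {
	return j.cookies
}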
func (scraper *Scraper) Login() error {
	login := paywall.GetLogin(scraper.Name)
	if login != nil {
		scraper.infoLog.Printf("Logging in\n")
		err := login(scraper.client)
		if err != nil {
			return fmt.Errorf("Login failed (%s)\n", err)
		}
	}
	return nil
}

func (scraper *Scraper) Discover() ([]string, error) {
	disc := scraper.discoverer

	artLinks, err := disc.Run(scraper.client, scraper.quit)
	if err == discover.ErrQuit {
		return nil, ErrQuit
	}
	if err != nil {
		return nil, err
	}

	foundArts := make([]string, 0, len(artLinks))
	for l := range artLinks {
		foundArts = append(foundArts, l.String())
	}
	return foundArts, nil
}

// start the scraper, running it at regular intervals
func (scraper *Scraper) Start(db store.Store) {
	for {
		lastRun := time.Now()
		err := scraper.DoRun(db)
		if err == ErrQuit {
			scraper.infoLog.Printf("Quit requested!\n")
			return
		}
		if err != nil {
			scraper.errorLog.Printf("run aborted: %s", err)
		}

		nextRun := lastRun.Add(scraper.runPeriod)
		delay := nextRun.Sub(time.Now())
		scraper.infoLog.Printf("next run at %s (sleeping for %s)\n", nextRun.Format(time.RFC3339), delay)
		// wait for next run, or a quit request
		select {
		case <-scraper.quit:
			scraper.infoLog.Printf("Quit requested!\n")
			return
		case <-time.After(delay):
			scraper.infoLog.Printf("Wakeup!\n")
		}
	}
}

// stop the scraper, at the next opportunity
func (scraper *Scraper) Stop() {
	scraper.quit <- struct{}{}
}
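// Typical wiring, sketched from the methods above. How the surrounding
// package main obtains conf, db and archiveDir is not shown here, so those
// names are placeholders:
//
//	scraper, err := NewScraper("somepub", conf, 1, archiveDir)
//	if err != nil { ... }
//	go scraper.Start(db) // repeats DoRun every runPeriod until Stop is called
//	...
//	scraper.Stop() // ask the run loop to quit at the next opportunity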
// perform a single scraper run
func (scraper *Scraper) DoRun(db store.Store) error {

	scraper.infoLog.Printf("start run\n")
	// reset the stats
	scraper.stats = ScrapeStats{}
	scraper.stats.Start = time.Now()
	defer func() {
		stats := &scraper.stats
		stats.End = time.Now()
		elapsed := stats.End.Sub(stats.Start)
		defer scraper.infoLog.Printf("run finished in %s (%d new articles, %d errors)\n", elapsed, stats.StashCount, stats.ErrorCount)
	}()

	err := scraper.Login()
	if err != nil {
		return err
	}

	foundArts, err := scraper.Discover()
	if err != nil {
		return err
	}

	newArts, err := db.WhichAreNew(foundArts)
	if err != nil {
		return fmt.Errorf("WhichAreNew() failed: %s", err)
	}

	stats := scraper.discoverer.Stats
	scraper.infoLog.Printf("found %d articles, %d new (%d pages fetched, %d errors)\n",
		len(foundArts), len(newArts), stats.FetchCount, stats.ErrorCount)

	return scraper.FetchAndStash(newArts, db, false)
}

func uniq(in []string) []string {
	foo := map[string]struct{}{}
	for _, s := range in {
		foo[s] = struct{}{}
	}
	out := make([]string, 0, len(foo))
	for s := range foo {
		out = append(out, s)
	}
	return out
}

// perform a single scraper run, using a list of article URLs instead of invoking the discovery
func (scraper *Scraper) DoRunFromList(arts []string, db store.Store, updateMode bool) error {

	scraper.infoLog.Printf("start run from list\n")
	// reset the stats
	scraper.stats = ScrapeStats{}
	scraper.stats.Start = time.Now()
	defer func() {
		stats := &scraper.stats
		stats.End = time.Now()
		elapsed := stats.End.Sub(stats.Start)
		defer scraper.infoLog.Printf("finished in %s (%d new articles, %d errors)\n", elapsed, stats.StashCount, stats.ErrorCount)
	}()

	// use base url from the discovery config
	baseURL := scraper.discoverer.StartURL

	// process/reject urls using site rules
	cookedArts := []string{}
	rejectCnt := 0
	for _, artURL := range arts {
		cooked, err := scraper.discoverer.CookArticleURL(&baseURL, artURL)
		if err != nil {
			scraper.infoLog.Printf("Reject %s (%s)\n", artURL, err)
			rejectCnt++
			continue
		}
		cookedArts = append(cookedArts, cooked.String())
	}

	// remove any dupes
	cookedArts = uniq(cookedArts)

	var err error
	var newArts []string
	if !updateMode {
		newArts, err = db.WhichAreNew(cookedArts)
		if err != nil {
			return fmt.Errorf("WhichAreNew() failed: %s", err)
		}
	} else {
		// all of `em
		newArts = cookedArts
	}

	scraper.infoLog.Printf("%d articles, %d rejected\n",
		len(newArts), rejectCnt)

	err = scraper.Login()
	if err != nil {
		return err
	}

	return scraper.FetchAndStash(newArts, db, updateMode)
}

func (scraper *Scraper) checkQuit() bool {
	select {
	case <-scraper.quit:
		return true
	default:
		return false
	}
}

func (scraper *Scraper) FetchAndStash(newArts []string, db store.Store, updateMode bool) error {
	//scraper.infoLog.Printf("Start scraping\n")

	// fetch and extract 'em
	for _, artURL := range newArts {
		if scraper.checkQuit() {
			return ErrQuit
		}

		// scraper.infoLog.Printf("fetch/scrape %s", artURL)
		art, err := scraper.ScrapeArt(artURL)
		if err != nil {
			scraper.errorLog.Printf("%s\n", err)
			scraper.stats.ErrorCount += 1
			if scraper.stats.ErrorCount > 100+len(newArts)/10 {
				return fmt.Errorf("too many errors (%d)", scraper.stats.ErrorCount)
			}
			continue
		}

		// TODO: wrap in transaction...
		// check the urls - we might already have it
		var ids []int
		ids, err = db.FindURLs(art.URLs)
		if err == nil {
			if len(ids) == 1 {
				art.ID = ids[0]
			}
			if len(ids) > 1 {
				err = fmt.Errorf("resolves to %d articles", len(ids))
			}
		}

		if err == nil {
			if art.ID != 0 && !updateMode {
				scraper.errorLog.Printf("already got %s (id %d)\n", artURL, art.ID)
				// TODO: add missing URLs!!!
				continue
			}
			_, err = db.Stash(art)
		}
		if err != nil {
			scraper.errorLog.Printf("stash failure on: %s (on %s)\n", err, artURL)
			scraper.stats.ErrorCount += 1
			if scraper.stats.ErrorCount > 100+len(newArts)/10 {
				return fmt.Errorf("too many errors (%d)", scraper.stats.ErrorCount)
			}
			continue
		}
		scraper.stats.StashCount += 1
		scraper.infoLog.Printf("scraped %s (%d chars)\n", artURL, len(art.Content))
	}
	return nil
}

func (scraper *Scraper) ScrapeArt(artURL string) (*store.Article, error) {
	// FETCH
	fetchTime := time.Now()
	req, err := http.NewRequest("GET", artURL, nil)
	if err != nil {
		return nil, err
	}
	// NOTE: FT.com always returns 403 if no Accept header is present.
	// Seems like a reasonable thing to send anyway...
	//req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	req.Header.Set("Accept", "*/*")
	if scraper.Conf.UserAgent != "" {
		req.Header.Set("User-Agent", scraper.Conf.UserAgent)
	}

	// other possible headers we might want to fiddle with:
	//req.Header.Set("User-Agent", `Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0`)
	//req.Header.Set("Referrer", "http://...")
	//req.Header.Set("Accept-Language", "en-US,en;q=0.5")

	resp, err := scraper.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// ARCHIVE
	err = arc.ArchiveResponse(scraper.archiveDir, resp, artURL, fetchTime)
	if err != nil {
		return nil, err
	}

	// EXTRACT
	if resp.StatusCode != 200 {
		return nil, fmt.Errorf("HTTP error: %s (%s)", resp.Status, artURL)
	}

	rawHTML, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	scraped, err := arts.ExtractFromHTML(rawHTML, artURL)
	if err != nil {
		return nil, err
	}

	art := store.ConvertArticle(scraped)

	if scraper.Conf.PubCode != "" {
		art.Publication.Code = scraper.Conf.PubCode
	} else {
		art.Publication.Code = scraper.Name
	}
	// TODO: set publication code here!
	return art, nil
}