github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/backfill/main.go

package main

// hacky little tool to try and grab old articles from a site

import (
	"flag"
	"fmt"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"os"
	"strings"
	"time"

	"github.com/andybalholm/cascadia"
	"github.com/bcampbell/arts/util"
	"golang.org/x/net/html"
	// "github.com/bcampbell/scrapeomat/paywall"
)

type Options struct {
	dayFrom, dayTo string
	nPages         int
	nStart         int
	// list bool // list scrapers then exit
}

func (opts *Options) DayRange() ([]time.Time, error) {
	from, to, err := opts.parseDays()
	if err != nil {
		return nil, err
	}

	if from.IsZero() {
		return nil, fmt.Errorf("missing 'from' day")
	}
	if to.IsZero() {
		return nil, fmt.Errorf("missing 'to' day")
	}

	// make sure we're at the start of the day
	from = time.Date(from.Year(), from.Month(), from.Day(), 0, 0, 0, 0, time.UTC)

	out := []time.Time{}
	for day := from; !day.After(to); day = day.AddDate(0, 0, 1) {
		out = append(out, day)
	}
	return out, nil
}

func (opts *Options) parseDays() (time.Time, time.Time, error) {
	const dayFmt = "2006-01-02"
	z := time.Time{}

	from := z
	to := z
	var err error
	if opts.dayFrom != "" {
		from, err = time.Parse(dayFmt, opts.dayFrom)
		if err != nil {
			return z, z, fmt.Errorf("bad 'from' day (%s)", err)
		}
	}

	if opts.dayTo != "" {
		to, err = time.Parse(dayFmt, opts.dayTo)
		if err != nil {
			return z, z, fmt.Errorf("bad 'to' day (%s)", err)
		}

		if !from.IsZero() && to.Before(from) {
			return z, z, fmt.Errorf("bad date range ('from' is after 'to')")
		}
	}

	return from, to, nil
}

var scrapers = map[string]func(*Options) error{
	"ft":                DoFT,
	"bbc":               DoBBCNews,
	"thetimes":          DoTheTimes,
	"dailystar":         DoDailyStar,
	"telegraph":         DoTelegraph,
	"croydonadvertiser": DoCroydonAdvertiser,
	"viceuk":            DoViceUK,
	"eluniversal":       DoElUniversal,
	"milenio":           DoMilenio,
	"excelsior":         DoExcelsior,
	"jornada":           DoJornada,
	"sdpnoticias":       DoSDPNoticias,
	//"thesun": DoTheSun,
}

func main() {
	flag.Usage = func() {
		sites := []string{}
		for site := range scrapers {
			sites = append(sites, site)
		}

		fmt.Fprintf(os.Stderr, "Usage:\n")
		fmt.Fprintf(os.Stderr, "%s [OPTIONS] %s\n", os.Args[0], strings.Join(sites, "|"))
		fmt.Fprintf(os.Stderr, "Grab older articles from various sites, dumping the URLs out to stdout\n")
		flag.PrintDefaults()
	}

	opts := Options{}

	flag.IntVar(&opts.nPages, "n", 0, "max num of search result pages to fetch")
	flag.IntVar(&opts.nStart, "s", 0, "start value (page, whatever)")
	flag.StringVar(&opts.dayFrom, "from", "", "from date")
	flag.StringVar(&opts.dayTo, "to", "", "to date")
	//flag.BoolVar(&opts.list, "l", false, "list available backfill scrapers, then exit")
	flag.Parse()

	if flag.NArg() < 1 {
		fmt.Fprintf(os.Stderr, "ERROR: missing publication\n")
		flag.Usage()
		os.Exit(1)
	}

	site := flag.Arg(0)
	scraper := scrapers[site]
	if scraper == nil {
		fmt.Fprintf(os.Stderr, "ERROR: unknown publication '%s'\n", site)
		os.Exit(1)
	}

	err := scraper(&opts)
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
		os.Exit(1)
	}

	os.Exit(0)
}
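
// exampleDayRange is a hedged, illustrative sketch (not wired into main)
// showing how Options.DayRange expands an inclusive -from/-to pair into
// one time.Time per day at UTC midnight.
func exampleDayRange() {
	opts := Options{dayFrom: "2014-08-28", dayTo: "2014-08-30"}
	days, err := opts.DayRange()
	if err != nil {
		fmt.Fprintf(os.Stderr, "unexpected: %s\n", err)
		return
	}
	for _, day := range days {
		// prints 2014-08-28, 2014-08-29 and 2014-08-30 in turn
		fmt.Println(day.Format("2006-01-02"))
	}
}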

// GetAttr retrieves the value of an attribute on a node.
// It returns an empty string if the attribute doesn't exist.
func GetAttr(n *html.Node, attr string) string {
	for _, a := range n.Attr {
		if a.Key == attr {
			return a.Val
		}
	}
	return ""
}

// DoTheSun handles The Sun and the Scottish Sun.
// Nasty and hacky, and needs lots of manual intervention:
// set high to the number of articles and manually set up the URL for
// The Sun or the Scottish Sun. Their search links are all ajaxy, so we
// can't just issue a search and autoclick the 'next page' link. Instead
// we iterate through the results 10 at a time using the minimal HTML
// returned by /search/showMoreAction.do.
func DoTheSun(opts *Options) error {
	linkSel := cascadia.MustCompile("li h3 a")

	// need to log in
	jar, err := cookiejar.New(nil)
	if err != nil {
		return err
	}
	client := &http.Client{
		Transport: util.NewPoliteTripper(),
		Jar:       jar,
	}

	//high := 3180
	high := 850
	for offset := 0; offset < high; offset += 10 {
		//u := "http://www.thesun.co.uk/search/showMoreAction.do?pubName=sol&querystring=the&navigators=publication_name:The+Sun&offset=" + fmt.Sprintf("%d", offset) + "&hits=10&sortby=relevance&from=20140828&to=20140917&th=3180"
		u := "http://www.thesun.co.uk/search/showMoreAction.do?pubName=sol&querystring=the&navigators=publication_name:The+Scottish+Sun&offset=" + fmt.Sprintf("%d", offset) + "&hits=10&sortby=date&from=20140828&to=20140917&th=850"

		root, err := fetchAndParse(client, u)
		if err != nil {
			return err
		}
		baseURL, err := url.Parse(u)
		if err != nil {
			return err
		}
		for _, a := range linkSel.MatchAll(root) {
			fmt.Fprintln(os.Stderr, ".")
			href := GetAttr(a, "href")
			absURL, err := baseURL.Parse(href)
			if err != nil {
				fmt.Fprintf(os.Stderr, "skip %s\n", href)
				continue
			}
			fmt.Println(absURL)
		}
	}
	return nil
}
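
// buildSunSearchURL is a hedged sketch of composing the showMoreAction.do
// query with url.Values instead of the string concatenation used in
// DoTheSun above. The parameter names and values mirror that hardcoded URL
// (and would still need the same manual tweaking per run); url.Values
// percent-encodes slightly differently (e.g. ':' becomes %3A), which the
// server should treat identically.
func buildSunSearchURL(offset, high int) string {
	v := url.Values{}
	v.Set("pubName", "sol")
	v.Set("querystring", "the")
	v.Set("navigators", "publication_name:The Scottish Sun")
	v.Set("offset", fmt.Sprintf("%d", offset))
	v.Set("hits", "10")
	v.Set("sortby", "date")
	v.Set("from", "20140828")
	v.Set("to", "20140917")
	v.Set("th", fmt.Sprintf("%d", high))
	return "http://www.thesun.co.uk/search/showMoreAction.do?" + v.Encode()
}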

func fetchAndParse(client *http.Client, u string) (*html.Node, error) {
	req, err := http.NewRequest("GET", u, nil)
	if err != nil {
		return nil, err
	}
	// NOTE: FT.com always returns 403 if no Accept header is present.
	// Seems like a reasonable thing to send anyway...
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")

	fmt.Fprintf(os.Stderr, "fetch %s\n", u)

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return nil, fmt.Errorf("HTTP code %d (%s)", resp.StatusCode, u)
	}

	return html.Parse(resp.Body)
}

func DoFT(opts *Options) error {
	if opts.dayFrom == "" || opts.dayTo == "" {
		return fmt.Errorf("date range required for FT")
	}

	// FT limits the number of pages or results you can iterate through,
	// so perform a separate search for each day.
	days, err := genDateRange(opts.dayFrom, opts.dayTo)
	if err != nil {
		return fmt.Errorf("bad date range: %s", err)
	}
	for _, day := range days {
		dayFrom := day.Format(dayFmt)
		dayTo := dayFrom

		searchURL := "http://search.ft.com/search?q=&t=all&rpp=100&fa=people%2Corganisations%2Cregions%2Csections%2Ctopics%2Ccategory%2Cbrand&s=-initialPublishDateTime&f=initialPublishDateTime[" + dayFrom + "T00%3A00%3A00%2C" + dayTo + "T23%3A59%3A59]"
		s := &Searcher{
			SearchURL: searchURL,
			Params: url.Values{
			/*
				"q":   []string{""},    // querystring
				"rpp": []string{"100"}, // results-per-page
			*/
			},
			PageParam:     "p",
			ResultLinkSel: cascadia.MustCompile(".results .result h3 a"),
			// NoMoreResultsSel: cascadia.MustCompile(".results .result-list .empty"),
			NPages: 8, // should be enough to cover a day!
		}

		// The next link doesn't show up here (but does in firefox).
		// Maybe pretending to be a real browser and sending more headers would help?
		//nextPageSel: cascadia.MustCompile(".pagination .next a")
		// So, for now, just iterate page by page until no more results.

		err := s.Run(os.Stdout)
		if err != nil {
			return err
		}
	}
	return nil
}

func DoTheTimes(opts *Options) error {
	// The Times search doesn't do stopwords, so a search for 'a' does the trick nicely ;-)
	s := &Searcher{
		SearchURL:     "http://www.thetimes.co.uk/search?q=a&sort=date_published&sortorder=desc",
		Params:        url.Values{},
		PageParam:     "p",
		ResultLinkSel: cascadia.MustCompile(".SearchResultList h2.Item-headline a"),
		NPages:        opts.nPages,
	}

	return s.Run(os.Stdout)
}

func DoCroydonAdvertiser(opts *Options) error {
	s := &Searcher{
		SearchURL:     "http://www.croydonadvertiser.co.uk/search/search.html?searchType=&searchPhrase=&where=&orderByOption=dateDesc",
		Params:        url.Values{},
		NextPageSel:   cascadia.MustCompile(`.search-results a[rel="next"]`),
		ResultLinkSel: cascadia.MustCompile(".search-results .channel-list-item a"),
		NPages:        opts.nPages,
	}

	return s.Run(os.Stdout)
}
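
// followNextLinks is a hedged sketch of the pagination pattern that
// Searcher's NextPageSel mode presumably implements (Searcher itself is
// defined elsewhere in this package): fetch a page, print the result
// links, then follow the rel="next"-style link until none remains or
// maxPages is hit (0 meaning no limit). It reuses fetchAndParse,
// grabLinks and GetAttr from this file.
func followNextLinks(client *http.Client, startURL string, resultSel, nextSel cascadia.Selector, maxPages int) error {
	u := startURL
	for page := 0; maxPages == 0 || page < maxPages; page++ {
		root, err := fetchAndParse(client, u)
		if err != nil {
			return err
		}
		links, err := grabLinks(root, resultSel, u)
		if err != nil {
			return err
		}
		for _, l := range links {
			fmt.Println(l)
		}

		n := nextSel.MatchFirst(root)
		if n == nil {
			return nil // no more pages
		}
		base, err := url.Parse(u)
		if err != nil {
			return err
		}
		next, err := base.Parse(GetAttr(n, "href"))
		if err != nil {
			return err
		}
		u = next.String()
	}
	return nil
}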

func DoTheCourier(opts *Options) error {
	// No specific date range, but you can get the results for the last month/year/week.
	linkSel := cascadia.MustCompile(".search-page-results-list .article-title a")
	nextPageSel := cascadia.MustCompile(".search-page-pagination a.next")

	client := &http.Client{Transport: util.NewPoliteTripper()}

	// Annoying stopwords in place, so do a bunch of generic-as-possible searches...
	terms := []string{"up", "its", "from", "could", "said", "scotland", "england"}

	for _, term := range terms {
		u := "http://www.thecourier.co.uk/search?q=" + term + "&d=&s=mostRecent&a=&p=pastMonth"
		for {
			// rpp = results per page
			// fa = facets?
			// s = sort
			root, err := fetchAndParse(client, u)
			if err != nil {
				return err
			}
			baseURL, err := url.Parse(u)
			if err != nil {
				return err
			}
			cnt := 0
			for _, a := range linkSel.MatchAll(root) {
				href := GetAttr(a, "href")
				absURL, err := baseURL.Parse(href)
				if err != nil {
					fmt.Fprintf(os.Stderr, "skip %s\n", href)
					continue
				}
				cnt++
				fmt.Println(absURL)
			}

			n := nextPageSel.MatchFirst(root)
			if n == nil {
				fmt.Fprintf(os.Stderr, "fin.\n")
				break
			}

			absNext, err := baseURL.Parse(GetAttr(n, "href"))
			if err != nil {
				return err
			}
			u = absNext.String()
			// fmt.Fprintf(os.Stderr, "NEXT %s\n", u)
		}
	}
	return nil
}

const dayFmt = "2006-01-02"

// genDateRange expands an inclusive from/to day range (as "YYYY-MM-DD"
// strings) into one time.Time per day. If dayTo is empty, today is used.
// TODO: kill this
func genDateRange(dayFrom, dayTo string) ([]time.Time, error) {
	var from, to time.Time
	if dayFrom == "" {
		return nil, fmt.Errorf("'from' day required")
	}
	from, err := time.Parse(dayFmt, dayFrom)
	if err != nil {
		return nil, err
	}

	if dayTo == "" {
		now := time.Now()
		to = time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC)
	} else {
		to, err = time.Parse(dayFmt, dayTo)
		if err != nil {
			return nil, err
		}
	}

	if to.Before(from) {
		return nil, fmt.Errorf("'to' day is before 'from' day")
	}

	out := []time.Time{}
	end := to.AddDate(0, 0, 1)
	for day := from; day.Before(end); day = day.AddDate(0, 0, 1) {
		out = append(out, day)
	}
	return out, nil
}

// The Daily Star has handy archive pages, one per day:
// http://www.dailystar.co.uk/sitearchive/YYYY/M/D
func DoDailyStar(opts *Options) error {
	days, err := genDateRange(opts.dayFrom, opts.dayTo)
	if err != nil {
		return err
	}
	client := &http.Client{
		Transport: util.NewPoliteTripper(),
	}
	linkSel := cascadia.MustCompile(".sitemap li a")
	for _, day := range days {
		page := fmt.Sprintf("http://www.dailystar.co.uk/sitearchive/%d/%d/%d",
			day.Year(), day.Month(), day.Day())
		root, err := fetchAndParse(client, page)
		if err != nil {
			return fmt.Errorf("%s failed: %s", page, err)
		}
		links, err := grabLinks(root, linkSel, page)
		if err != nil {
			return fmt.Errorf("%s error: %s", page, err)
		}
		for _, l := range links {
			fmt.Println(l)
		}
	}

	return nil
}

// grabLinks returns the absolute URLs of all links under root matched by
// linkSel, resolved against baseURL. Bad links are skipped with a warning.
func grabLinks(root *html.Node, linkSel cascadia.Selector, baseURL string) ([]string, error) {
	u, err := url.Parse(baseURL)
	if err != nil {
		return nil, err
	}

	out := []string{}
	for _, a := range linkSel.MatchAll(root) {
		link, err := href(a, u)
		if err != nil {
			fmt.Fprintf(os.Stderr, "%s BAD link: '%s'\n", baseURL, err)
			continue
		}
		out = append(out, link)
	}
	return out, nil
}

// href returns the href attribute of anchor, resolved against baseURL.
func href(anchor *html.Node, baseURL *url.URL) (string, error) {
	h := GetAttr(anchor, "href")
	absURL, err := baseURL.Parse(h)
	if err != nil {
		return "", fmt.Errorf("bad href (%s): %s", h, err)
	}
	return absURL.String(), nil
}
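
// exampleResolve is a hedged, illustrative sketch (not called anywhere) of
// the relative-URL resolution grabLinks and href rely on: a root-relative
// href resolved against an archive page URL yields an absolute article URL.
// The URLs here are made up for illustration.
func exampleResolve() {
	base, _ := url.Parse("http://www.dailystar.co.uk/sitearchive/2014/8/28")
	abs, _ := base.Parse("/news/story123")
	fmt.Println(abs) // http://www.dailystar.co.uk/news/story123
}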

func DoBBCNews(opts *Options) error {
	// The BBC has a search facility, but doesn't seem to have an option to
	// sort by date... so it's a matter of stepping through huge numbers of
	// results in the hope that we pick up what we need. Sigh.

	// TODO: could discard articles outside the desired date range... but
	// probably not worth the effort.

	/*
		dFrom, err := time.Parse("2006-01-02", dayFrom)
		if err != nil {
			return err
		}
		dTo, err := time.Parse("2006-01-02", dayTo)
		if err != nil {
			return err
		}
		dTo = dTo.AddDate(0, 0, 1)
		discardCnt := 0
	*/
	artSel := cascadia.MustCompile(`article`)
	linkSel := cascadia.MustCompile(`h1 a`)
	dateSel := cascadia.MustCompile(`time`)

	client := &http.Client{Transport: util.NewPoliteTripper()}

	const MAXPAGE = 1000
	for pageNum := 1; pageNum <= MAXPAGE; pageNum++ {
		// http://www.bbc.co.uk/search?q=the&sa_f=search-serp&filter=news
		// http://www.bbc.co.uk/search/more?page=2&q=the&sa_f=search-serp&filter=news
		page := fmt.Sprintf(`http://www.bbc.co.uk/search/more?page=%d&q=the&sa_f=search-serp&filter=news`, pageNum)

		baseURL, err := url.Parse(page)
		if err != nil {
			return err
		}

		root, err := fetchAndParse(client, page)
		if err != nil {
			return fmt.Errorf("%s failed: %s", page, err)
		}

		for _, art := range artSel.MatchAll(root) {
			d := dateSel.MatchFirst(art)
			a := linkSel.MatchFirst(art)

			if d == nil {
				return fmt.Errorf("%s: missing date", page)
			}
			if a == nil {
				return fmt.Errorf("%s: missing link", page)
			}

			artURL, err := href(a, baseURL)
			if err != nil {
				return fmt.Errorf("%s error: %s", page, err)
			}

			/*
				// TODO: date range filtering here...
				dt, err := time.Parse(time.RFC3339, GetAttr(d, "datetime"))
				if err != nil {
					return err
				}

				//if (dt.Equal(dFrom)||dt.After(dFrom)) && dt.Before(dTo) {...}
			*/

			fmt.Println(artURL)
		}
		//html.Render(os.Stdout, root)
		//fmt.Printf("\n")
	}
	return nil
}
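
// withinRange is a hedged sketch of the date filtering that the commented-out
// TODO in DoBBCNews describes: keep an article if the RFC3339 datetime on its
// <time> element falls within [from, to). It is deliberately not wired in,
// matching the comment above that it's probably not worth the effort.
func withinRange(dateNode *html.Node, from, to time.Time) (bool, error) {
	dt, err := time.Parse(time.RFC3339, GetAttr(dateNode, "datetime"))
	if err != nil {
		return false, err
	}
	return !dt.Before(from) && dt.Before(to), nil
}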