github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/backfill/jornada.go (about) 1 package main 2 3 import ( 4 "fmt" 5 "github.com/andybalholm/cascadia" 6 "github.com/bcampbell/arts/util" 7 "net/http" 8 "os" 9 ) 10 11 // archive pages, form: 12 // http://www.jornada.unam.mx/YYYY/MM/DD/section 13 14 func DoJornada(opts *Options) error { 15 16 linkSel := cascadia.MustCompile(`#section-cont a`) 17 18 sections := []string{"", "opinion", "politica", "economia", "mundo", "estados", "capital", "sociedad", "deportes", "cultura", "espectaculos"} 19 20 days, err := opts.DayRange() 21 if err != nil { 22 return err 23 } 24 25 client := &http.Client{Transport: util.NewPoliteTripper()} 26 27 for _, day := range days { 28 for _, section := range sections { 29 u := fmt.Sprintf("http://www.jornada.unam.mx/%04d/%02d/%02d/%s", day.Year(), day.Month(), day.Day(), section) 30 31 doc, err := fetchAndParse(client, u) 32 if err != nil { 33 fmt.Fprintf(os.Stderr, "SKIP: %s\n", err) 34 continue 35 } 36 37 links, err := grabLinks(doc, linkSel, u) 38 if err != nil { 39 return err 40 } 41 42 for _, l := range links { 43 fmt.Println(l) 44 } 45 46 } 47 // explicitly add the per-day editorials 48 fmt.Printf("http://www.jornada.unam.mx/%04d/%02d/%02d/edito\n", day.Year(), day.Month(), day.Day()) 49 fmt.Printf("http://www.jornada.unam.mx/%04d/%02d/%02d/correo\n", day.Year(), day.Month(), day.Day()) 50 } 51 return nil 52 }