github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/backfill/jornada.go (about)

     1  package main
     2  
     3  import (
     4  	"fmt"
     5  	"github.com/andybalholm/cascadia"
     6  	"github.com/bcampbell/arts/util"
     7  	"net/http"
     8  	"os"
     9  )
    10  
    11  // archive pages, form:
    12  // http://www.jornada.unam.mx/YYYY/MM/DD/section
    13  
    14  func DoJornada(opts *Options) error {
    15  
    16  	linkSel := cascadia.MustCompile(`#section-cont a`)
    17  
    18  	sections := []string{"", "opinion", "politica", "economia", "mundo", "estados", "capital", "sociedad", "deportes", "cultura", "espectaculos"}
    19  
    20  	days, err := opts.DayRange()
    21  	if err != nil {
    22  		return err
    23  	}
    24  
    25  	client := &http.Client{Transport: util.NewPoliteTripper()}
    26  
    27  	for _, day := range days {
    28  		for _, section := range sections {
    29  			u := fmt.Sprintf("http://www.jornada.unam.mx/%04d/%02d/%02d/%s", day.Year(), day.Month(), day.Day(), section)
    30  
    31  			doc, err := fetchAndParse(client, u)
    32  			if err != nil {
    33  				fmt.Fprintf(os.Stderr, "SKIP: %s\n", err)
    34  				continue
    35  			}
    36  
    37  			links, err := grabLinks(doc, linkSel, u)
    38  			if err != nil {
    39  				return err
    40  			}
    41  
    42  			for _, l := range links {
    43  				fmt.Println(l)
    44  			}
    45  
    46  		}
    47  		// explicitly add the per-day editorials
    48  		fmt.Printf("http://www.jornada.unam.mx/%04d/%02d/%02d/edito\n", day.Year(), day.Month(), day.Day())
    49  		fmt.Printf("http://www.jornada.unam.mx/%04d/%02d/%02d/correo\n", day.Year(), day.Month(), day.Day())
    50  	}
    51  	return nil
    52  }