github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/backfill/excelsior.go (about)

     1  package main
     2  
     3  import (
     4      "fmt"
     5      "github.com/andybalholm/cascadia"
     6      "github.com/bcampbell/arts/util"
     7      "github.com/bcampbell/htmlutil"
     8  //  "golang.org/x/net/html"
     9      "net/http"
    10      "net/url"
    11      "strconv"
    12      "regexp"
    13  //  "os"
    14  //  "strings"
    15  //  "time"
    16  )
    17  
    18  // search page straightforward enough, but only returns first 10000 articles
    19  // so to go back further have to split inro sections
    20  //
    21  // http://www.excelsior.com.mx/buscador?b=una&f={%22periodo%22%3A365%2C%22tipo%22%3A%22articulo%22%2C%22seccion%22%3A%22nacional%22}&p=1000
    22  func DoExcelsior(opts *Options) error {
    23  
    24      filts := []string{}
    25      sections := []string{ "nacional","global","dinero","comunidad","adrenalina",
    26          "funcion","hacker","expresiones" }
    27  //    types = :=[]string{ "articulo", "columna" }
    28      for _,section := range sections {
    29          f := fmt.Sprintf(`{"periodo":365,"tipo":"articulo","seccion":"%s"}`, section)
    30          filts = append(filts,f)
    31      }
    32      filts = append(filts,`{"periodo":365,"tipo":"columna"}`)
    33  
    34  
    35  
    36      //_,dayTo,err := opts.parseDays()
    37  
    38      client := &http.Client{
    39          Transport: util.NewPoliteTripper(),
    40      }
    41      resultSel := cascadia.MustCompile("#imx-resultados-lista li")
    42      linkSel := cascadia.MustCompile("h3 a")
    43      dateSel := cascadia.MustCompile(".imx-nota-fecha")
    44  
    45      dayPat := regexp.MustCompile( `(\d{1,2})/(\d{1,2})/(\d{4})`)
    46  
    47      maxPage := 1000;    // 10 per page - clips out at page 1000 :-(
    48  
    49      for _,filt := range filts {
    50  
    51          for page := 1; page<=maxPage; page++ {
    52  
    53              v := url.Values{}
    54              v.Set("b","una")
    55              v.Set("f",filt)
    56              v.Set("p",strconv.Itoa(page))
    57  
    58              u := "http://www.excelsior.com.mx/buscador?" + v.Encode()
    59  
    60              root, err := fetchAndParse(client, u)
    61              if err != nil {
    62                  return fmt.Errorf("%s failed: %s\n", page, err)
    63              }
    64  
    65              for _,item := range resultSel.MatchAll(root) {
    66                  link := linkSel.MatchFirst(item)
    67                  dt := dateSel.MatchFirst(item)
    68                  href := GetAttr(link,"href")
    69  
    70  
    71                  m := dayPat.FindStringSubmatch( htmlutil.TextContent(dt))
    72  
    73  //                nDay,_ := strconv.Atoi(m[1])
    74  //                nMonth,_ := strconv.Atoi(m[2])
    75  //                nYear,_ := strconv.Atoi(m[3)
    76  
    77                  // cheese out - only want 2017
    78                  if m[3] == "2016" {
    79                      page = maxPage
    80                      continue
    81                  }
    82  
    83                  fmt.Println(href)
    84              }
    85          }
    86      }
    87  
    88      return nil
    89  }
    90