github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/backfill/milenio.go (about)

     1  package main
     2  
     3  import (
     4  	"fmt"
     5  	"github.com/andybalholm/cascadia"
     6  	"github.com/bcampbell/arts/util"
     7  	"golang.org/x/net/html"
     8  	"net/http"
     9      "net/url"
    10      "encoding/json"
    11      "strconv"
    12      "strings"
    13      "io/ioutil"
    14      "bytes"
    15      "os"
    16      "time"
    17  )
    18  
    19  // use search ajax:
    20  // http://www.milenio.com/milappservices/search.json?term=una&orderby=desc&contentType=&page=2&limit=100&seccion=&iniDate=2017-01-01&endDate=2017-01-05
    21  
    22  // ({"data":{"results":"","count":""},"error":0,"message":""})
    23  // where results contains html snippet
    24  
    25  
    26  
    27  func DoMilenio(opts *Options) error {
    28  
    29      var raw struct {
    30          Data struct {
    31              Results string `json:"results"`
    32              /*Count int   `json:"count"`*/
    33          } `json:"data"`
    34  /*        Error int   `json:"error"`
    35          Message string `json:"message"`*/
    36      }
    37  
    38      if opts.dayFrom == "" || opts.dayTo=="" {
    39          return fmt.Errorf("date range required")
    40      }
    41  
    42  
    43  	linkSel := cascadia.MustCompile(`.md-listing-item h3 a.lnk`)
    44  
    45  	client := &http.Client{Transport: util.NewPoliteTripper()}
    46  
    47  
    48      // do it day by day. server gets slow for big ranges
    49      /*
    50      days,err := opts.DayRange()
    51      if err != nil {
    52          return err
    53      }
    54      for _,day := range days {
    55  */
    56          for page:=1; ; page++ {
    57              v := url.Values{}
    58              v.Set("term", "una")
    59              v.Set("orderby", "desc")
    60              v.Set("contentType", "")
    61              v.Set("page", strconv.Itoa(page))
    62              v.Set("limit", "100")  // max is 200?
    63              v.Set("seccion","")
    64              /*
    65              v.Set("iniDate", day.Format("2006-01-02"))
    66              v.Set("endDate", day.Format("2006-01-02"))
    67              */
    68              v.Set("iniDate", opts.dayFrom)
    69              v.Set("endDate", opts.dayTo)
    70  
    71              u := "http://www.milenio.com/milappservices/search.json?" + v.Encode()
    72  
    73              fmt.Fprintln(os.Stderr,"FETCH ", u)
    74              req, err := http.NewRequest("GET", u, nil)
    75              if err != nil {
    76                  return err
    77              }
    78              
    79              b := []byte{}
    80              retries := 0
    81              skip := false
    82              for {
    83  
    84                  resp, err := client.Do(req)
    85                  if err != nil {
    86                      return err
    87                  }
    88                  b, err = ioutil.ReadAll(resp.Body)
    89                  resp.Body.Close()
    90                  if err != nil {
    91                      return err
    92                  }
    93  
    94                  if (resp.StatusCode == 200 ) {
    95                      break
    96                  }
    97  
    98                  retries++
    99                  if retries >10 {
   100                      fmt.Fprintf(os.Stderr,"SKIP %s - too many reties. skip.", u)
   101                      skip = true;
   102                      break
   103                  }
   104                  fmt.Fprintf(os.Stderr,"HTTP %d: %s - retry in 5s",resp.StatusCode, u)
   105                  time.Sleep(5*time.Second)
   106              }
   107              if skip {
   108                  continue
   109              }
   110  
   111              // kill annoying wrapper
   112              b = bytes.TrimSpace(b)
   113              b = bytes.TrimPrefix(b, []byte("("))
   114              b = bytes.TrimSuffix(b, []byte(")"))
   115  
   116              err = json.Unmarshal(b, &raw)
   117              if err != nil {
   118                  return fmt.Errorf("json err, page %d: %s",page,err)
   119              }
   120  
   121      //        fmt.Printf("%q\n", raw);
   122  
   123              if(raw.Data.Results=="") {
   124                  break
   125              }
   126  
   127              root,err := html.Parse( strings.NewReader(raw.Data.Results))
   128              if err != nil {
   129                  return fmt.Errorf("html parse err, page %d: %s",page,err)
   130              }
   131  
   132  
   133              links, err := grabLinks(root, linkSel, u)
   134              if err != nil {
   135                  return err
   136              }
   137  
   138              for _, l := range links {
   139                  fmt.Println(l)
   140              }
   141          }
   142          /*
   143      }
   144      */
   145  	return nil
   146  }