github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/backfill/milenio.go

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/andybalholm/cascadia"
	"github.com/bcampbell/arts/util"
	"golang.org/x/net/html"
)

// Use the search AJAX endpoint:
// http://www.milenio.com/milappservices/search.json?term=una&orderby=desc&contentType=&page=2&limit=100&seccion=&iniDate=2017-01-01&endDate=2017-01-05
//
// The response is wrapped in parentheses:
// ({"data":{"results":"","count":""},"error":0,"message":""})
// where "results" contains an HTML snippet listing the matching articles.

// DoMilenio pages through the Milenio search endpoint over the configured
// date range and prints the URL of each article found.
func DoMilenio(opts *Options) error {

	var raw struct {
		Data struct {
			Results string `json:"results"`
			/*Count int `json:"count"`*/
		} `json:"data"`
		/*Error   int    `json:"error"`
		Message string `json:"message"`*/
	}

	if opts.dayFrom == "" || opts.dayTo == "" {
		return fmt.Errorf("date range required")
	}

	linkSel := cascadia.MustCompile(`.md-listing-item h3 a.lnk`)

	client := &http.Client{Transport: util.NewPoliteTripper()}

	// Could do it day by day - the server gets slow for big ranges:
	/*
		days, err := opts.DayRange()
		if err != nil {
			return err
		}
		for _, day := range days {
	*/
	for page := 1; ; page++ {
		v := url.Values{}
		v.Set("term", "una")
		v.Set("orderby", "desc")
		v.Set("contentType", "")
		v.Set("page", strconv.Itoa(page))
		v.Set("limit", "100") // max is 200?
		v.Set("seccion", "")
		/*
			v.Set("iniDate", day.Format("2006-01-02"))
			v.Set("endDate", day.Format("2006-01-02"))
		*/
		v.Set("iniDate", opts.dayFrom)
		v.Set("endDate", opts.dayTo)

		u := "http://www.milenio.com/milappservices/search.json?" + v.Encode()

		fmt.Fprintln(os.Stderr, "FETCH", u)
		req, err := http.NewRequest("GET", u, nil)
		if err != nil {
			return err
		}

		// Fetch the page, retrying non-200 responses up to 10 times
		// before giving up and skipping it.
		var b []byte
		retries := 0
		skip := false
		for {
			resp, err := client.Do(req)
			if err != nil {
				return err
			}
			b, err = ioutil.ReadAll(resp.Body)
			resp.Body.Close()
			if err != nil {
				return err
			}

			if resp.StatusCode == 200 {
				break
			}

			retries++
			if retries > 10 {
				fmt.Fprintf(os.Stderr, "SKIP %s - too many retries\n", u)
				skip = true
				break
			}
			fmt.Fprintf(os.Stderr, "HTTP %d: %s - retry in 5s\n", resp.StatusCode, u)
			time.Sleep(5 * time.Second)
		}
		if skip {
			continue
		}

		// Kill the annoying wrapper around the JSON payload.
		b = bytes.TrimSpace(b)
		b = bytes.TrimPrefix(b, []byte("("))
		b = bytes.TrimSuffix(b, []byte(")"))

		err = json.Unmarshal(b, &raw)
		if err != nil {
			return fmt.Errorf("json err, page %d: %s", page, err)
		}

		// fmt.Printf("%q\n", raw)

		// An empty results snippet means we've run out of pages.
		if raw.Data.Results == "" {
			break
		}

		root, err := html.Parse(strings.NewReader(raw.Data.Results))
		if err != nil {
			return fmt.Errorf("html parse err, page %d: %s", page, err)
		}

		links, err := grabLinks(root, linkSel, u)
		if err != nil {
			return err
		}

		for _, l := range links {
			fmt.Println(l)
		}
	}
	/*
		}
	*/
	return nil
}
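
// grabLinks is called above but defined elsewhere in this package. What
// follows is a minimal sketch of what such a helper might look like, assuming
// it collects the anchors matched by the selector and resolves each href
// against the URL of the page the snippet came from; the name, signature and
// behaviour here are assumptions for illustration, not the repo's actual
// implementation. Resolving against a base URL matters because the hrefs in
// the returned HTML snippet may be site-relative rather than absolute.
func grabLinksSketch(root *html.Node, sel cascadia.Selector, baseURL string) ([]string, error) {
	base, err := url.Parse(baseURL)
	if err != nil {
		return nil, err
	}
	out := []string{}
	// MatchAll returns every node under root matched by the selector.
	for _, a := range sel.MatchAll(root) {
		for _, attr := range a.Attr {
			if attr.Key != "href" {
				continue
			}
			h, err := url.Parse(attr.Val)
			if err != nil {
				continue // skip malformed hrefs
			}
			// Turn relative links into absolute ones.
			out = append(out, base.ResolveReference(h).String())
		}
	}
	return out, nil
}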