// github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/backfill/excelsior.go

package main

import (
	"fmt"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	// "os"
	// "strings"
	// "time"

	"github.com/andybalholm/cascadia"
	"github.com/bcampbell/arts/util"
	"github.com/bcampbell/htmlutil"
	// "golang.org/x/net/html"
)

// The search page is straightforward enough, but only returns the first 10000
// articles, so to go back further we have to split the query into sections.
//
// http://www.excelsior.com.mx/buscador?b=una&f={%22periodo%22%3A365%2C%22tipo%22%3A%22articulo%22%2C%22seccion%22%3A%22nacional%22}&p=1000
func DoExcelsior(opts *Options) error {

	// Build one search filter per section, plus one for columns.
	filts := []string{}
	sections := []string{"nacional", "global", "dinero", "comunidad", "adrenalina",
		"funcion", "hacker", "expresiones"}
	// types := []string{"articulo", "columna"}
	for _, section := range sections {
		f := fmt.Sprintf(`{"periodo":365,"tipo":"articulo","seccion":"%s"}`, section)
		filts = append(filts, f)
	}
	filts = append(filts, `{"periodo":365,"tipo":"columna"}`)

	// _, dayTo, err := opts.parseDays()

	client := &http.Client{
		Transport: util.NewPoliteTripper(),
	}
	resultSel := cascadia.MustCompile("#imx-resultados-lista li")
	linkSel := cascadia.MustCompile("h3 a")
	dateSel := cascadia.MustCompile(".imx-nota-fecha")

	// result dates appear as dd/mm/yyyy
	dayPat := regexp.MustCompile(`(\d{1,2})/(\d{1,2})/(\d{4})`)

	maxPage := 1000 // 10 per page - clips out at page 1000 :-(

	for _, filt := range filts {

		for page := 1; page <= maxPage; page++ {

			v := url.Values{}
			v.Set("b", "una")
			v.Set("f", filt)
			v.Set("p", strconv.Itoa(page))

			u := "http://www.excelsior.com.mx/buscador?" + v.Encode()

			root, err := fetchAndParse(client, u)
			if err != nil {
				return fmt.Errorf("page %d failed: %s", page, err)
			}

			for _, item := range resultSel.MatchAll(root) {
				link := linkSel.MatchFirst(item)
				dt := dateSel.MatchFirst(item)
				if link == nil || dt == nil {
					continue // skip malformed result entries
				}
				href := GetAttr(link, "href")

				m := dayPat.FindStringSubmatch(htmlutil.TextContent(dt))
				if m == nil {
					continue // no recognisable dd/mm/yyyy date in this entry
				}

				// nDay, _ := strconv.Atoi(m[1])
				// nMonth, _ := strconv.Atoi(m[2])
				// nYear, _ := strconv.Atoi(m[3])
				// (see parseResultDate below for a sketch of this conversion)

				// cheese out - only want 2017, so once we hit 2016 jump to the
				// last page to end the paging loop for this filter
				if m[3] == "2016" {
					page = maxPage
					continue
				}

				fmt.Println(href)
			}
		}
	}

	return nil
}
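
// The commented-out strconv.Atoi calls above hint at converting the captured
// dd/mm/yyyy fields into integers. This is a minimal sketch of that step,
// assuming the three capture groups produced by dayPat; parseResultDate is a
// hypothetical helper and is not part of the original scrapeomat code.
func parseResultDate(m []string) (day, month, year int, err error) {
	if len(m) != 4 {
		return 0, 0, 0, fmt.Errorf("unexpected date match: %v", m)
	}
	if day, err = strconv.Atoi(m[1]); err != nil {
		return 0, 0, 0, err
	}
	if month, err = strconv.Atoi(m[2]); err != nil {
		return 0, 0, 0, err
	}
	if year, err = strconv.Atoi(m[3]); err != nil {
		return 0, 0, 0, err
	}
	return day, month, year, nil
}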