github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/backfill/assorted.go (about) 1 package main 2 3 import ( 4 "encoding/json" 5 "fmt" 6 "github.com/andybalholm/cascadia" 7 "github.com/bcampbell/arts/util" 8 "net/http" 9 "net/url" 10 "os" 11 ) 12 13 // vice section pages are all javascript. There's an api to step back through the articles. 14 func DoViceUK(opts *Options) error { 15 client := &http.Client{ 16 Transport: util.NewPoliteTripper(), 17 } 18 19 type viceArtData struct { 20 URL string `json:"url"` 21 } 22 type viceArt struct { 23 Type string `json:"type"` 24 Data viceArtData `json:"data"` 25 } 26 27 for page := 1; page < (1 + opts.nPages); page++ { 28 u := fmt.Sprintf("https://www.vice.com/api/v1/latest?locale=en_uk&page=%d", page) 29 30 resp, err := client.Get(u) 31 if err != nil { 32 return err 33 } 34 defer resp.Body.Close() 35 36 /* 37 raw, err := ioutil.ReadAll(resp.Body) 38 39 if err != nil { 40 return err 41 } 42 */ 43 dec := json.NewDecoder(resp.Body) 44 var arts []viceArt 45 46 err = dec.Decode(&arts) 47 if err != nil { 48 return err 49 } 50 for _, art := range arts { 51 if art.Type != "articles" { 52 continue 53 } 54 fmt.Println(art.Data.URL) 55 } 56 57 } 58 59 return nil 60 } 61 62 // eg: 63 // https://www.sdpnoticias.com/nacional/list?page=80 64 65 func DoSDPNoticias(opts *Options) error { 66 67 sections := []string{ 68 "nacional", "internacional", "columnas", "deportes", "economia", 69 "sorprendente", "tecnologia", 70 "geek", "estilo-de-vida", 71 "enelshow/television", "enelshow/musica", 72 "enelshow/cine", "enelshow/famosos", 73 "gay", "sexxion", 74 "pitorreo", 75 "local/baja-california-sur", 76 "local/ciudad-de-mexico", 77 "local/chiapas", 78 "local/coahuila", 79 "local/edomex", 80 "local/guadalajara", 81 "local/guerrero", 82 "local/jalisco", 83 "local/monterrey", 84 "local/morelos", 85 "local/nuevo-leon", 86 "local/oaxaca", 87 "local/puebla", 88 "local/quintana-roo", 89 "local/sonora", 90 "local/tamaulipas", 91 "local/veracruz", 92 "estados"} 93 94 for _, section := range sections { 95 s := &Searcher{ 96 SearchURL: fmt.Sprintf("https://www.sdpnoticias.com/%s/list", section), 97 Params: url.Values{}, 98 PageParam: "page", 99 ResultLinkSel: cascadia.MustCompile(".news-listing a"), 100 NPages: opts.nPages, 101 } 102 103 err := s.Run(os.Stdout) 104 if err != nil { 105 106 continue 107 // return err 108 } 109 } 110 return nil 111 }