github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/backfill/assorted.go (about)

     1  package main
     2  
     3  import (
     4  	"encoding/json"
     5  	"fmt"
     6  	"github.com/andybalholm/cascadia"
     7  	"github.com/bcampbell/arts/util"
     8  	"net/http"
     9  	"net/url"
    10  	"os"
    11  )
    12  
    13  // vice section pages are all javascript. There's an api to step back through the articles.
    14  func DoViceUK(opts *Options) error {
    15  	client := &http.Client{
    16  		Transport: util.NewPoliteTripper(),
    17  	}
    18  
    19  	type viceArtData struct {
    20  		URL string `json:"url"`
    21  	}
    22  	type viceArt struct {
    23  		Type string      `json:"type"`
    24  		Data viceArtData `json:"data"`
    25  	}
    26  
    27  	for page := 1; page < (1 + opts.nPages); page++ {
    28  		u := fmt.Sprintf("https://www.vice.com/api/v1/latest?locale=en_uk&page=%d", page)
    29  
    30  		resp, err := client.Get(u)
    31  		if err != nil {
    32  			return err
    33  		}
    34  		defer resp.Body.Close()
    35  
    36  		/*
    37  			raw, err := ioutil.ReadAll(resp.Body)
    38  
    39  			if err != nil {
    40  				return err
    41  			}
    42  		*/
    43  		dec := json.NewDecoder(resp.Body)
    44  		var arts []viceArt
    45  
    46  		err = dec.Decode(&arts)
    47  		if err != nil {
    48  			return err
    49  		}
    50  		for _, art := range arts {
    51  			if art.Type != "articles" {
    52  				continue
    53  			}
    54  			fmt.Println(art.Data.URL)
    55  		}
    56  
    57  	}
    58  
    59  	return nil
    60  }
    61  
    62  // eg:
    63  // https://www.sdpnoticias.com/nacional/list?page=80
    64  
    65  func DoSDPNoticias(opts *Options) error {
    66  
    67  	sections := []string{
    68  		"nacional", "internacional", "columnas", "deportes", "economia",
    69  		"sorprendente", "tecnologia",
    70  		"geek", "estilo-de-vida",
    71  		"enelshow/television", "enelshow/musica",
    72  		"enelshow/cine", "enelshow/famosos",
    73  		"gay", "sexxion",
    74  		"pitorreo",
    75  		"local/baja-california-sur",
    76  		"local/ciudad-de-mexico",
    77  		"local/chiapas",
    78  		"local/coahuila",
    79  		"local/edomex",
    80  		"local/guadalajara",
    81  		"local/guerrero",
    82  		"local/jalisco",
    83  		"local/monterrey",
    84  		"local/morelos",
    85  		"local/nuevo-leon",
    86  		"local/oaxaca",
    87  		"local/puebla",
    88  		"local/quintana-roo",
    89  		"local/sonora",
    90  		"local/tamaulipas",
    91  		"local/veracruz",
    92  		"estados"}
    93  
    94  	for _, section := range sections {
    95  		s := &Searcher{
    96  			SearchURL:     fmt.Sprintf("https://www.sdpnoticias.com/%s/list", section),
    97  			Params:        url.Values{},
    98  			PageParam:     "page",
    99  			ResultLinkSel: cascadia.MustCompile(".news-listing a"),
   100  			NPages:        opts.nPages,
   101  		}
   102  
   103  		err := s.Run(os.Stdout)
   104  		if err != nil {
   105  
   106  			continue
   107  			//		return err
   108  		}
   109  	}
   110  	return nil
   111  }