github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/sitemap/sitemap.go (about)

     1  package main
     2  
     3  /*
     4  TODO:
     5  - specify date range of interest
     6  - skip index files which look like they're outside the date range (often
     7    they'll have a date in the filename)
     8  - handle gzip files
     9  - read robots.txt to get sitemap files
    10  - add support for googlenews extra fields
    11  - factor out for use as a discovery mechanism in scrapeomat
    12  */
    13  
    14  import (
    15  	//	"encoding/csv"
    16  	"encoding/xml"
    17  	"flag"
    18  	"fmt"
    19  	"github.com/bcampbell/arts/util"
    20  	"net/http"
    21  	"os"
    22  	"strings"
    23  )
    24  
// URL is a single <url> entry from a sitemap <urlset> document
// (https://www.sitemaps.org protocol). All fields are kept as the raw
// strings found in the XML; nothing is parsed or validated here.
type URL struct {
	Loc        string `xml:"loc"`        // page URL (required by the protocol)
	ChangeFreq string `xml:"changefreq"` // optional hint, e.g. "daily"
	Priority   string `xml:"priority"`   // optional 0.0-1.0 hint
	LastMod    string `xml:"lastmod"`    // optional W3C datetime string
}
    31  
// URLSet is the <urlset> root element of a leaf sitemap file: a flat
// list of page URLs. (Currently unused directly — see Result, which
// decodes both document forms.)
type URLSet struct {
	URLs []URL `xml:"url"`
}
    35  
// Sitemap is a single <sitemap> entry inside a <sitemapindex> document,
// pointing at a further sitemap file to fetch.
type Sitemap struct {
	Loc     string `xml:"loc"`     // URL of the referenced sitemap file
	LastMod string `xml:"lastmod"` // optional W3C datetime string, kept raw
}
// SitemapIndex is the <sitemapindex> root element of a sitemap index
// file: a list of further sitemap files. (Currently unused directly —
// see Result, which decodes both document forms.)
type SitemapIndex struct {
	Sitemaps []Sitemap `xml:"sitemap"`
}
    43  
// Result is a catch-all decode target for a sitemap XML document.
// A fetched file is either a <urlset> (leaf file listing page URLs) or a
// <sitemapindex> (index listing further sitemap files); decoding both
// element sets into one struct lets a single xml Decode handle either
// form — only one of Sitemaps/URLs will be populated per document.
type Result struct {
	XMLName  xml.Name            // records which root element was seen
	Sitemaps []Sitemap `xml:"sitemap"` // set when the document is a <sitemapindex>
	URLs     []URL     `xml:"url"`     // set when the document is a <urlset>
	/* earlier approach, kept for reference:
	URLSet       []URLSet       `xml:"urlset"`
	SitemapIndex []SitemapIndex `xml:"sitemapindex"`
	*/
}
    52  
    53  func main() {
    54  	flag.Parse()
    55  	for _, sitemapURL := range flag.Args() {
    56  		urls, err := doit(sitemapURL, "")
    57  		if err != nil {
    58  			fmt.Fprintln(os.Stderr, err)
    59  			os.Exit(1)
    60  		}
    61  		for _, u := range urls {
    62  			fmt.Println(u.LastMod, u.Loc)
    63  		}
    64  	}
    65  }
    66  
    67  func doit(sitemapURL string, indent string) ([]URL, error) {
    68  	result := make([]URL, 0)
    69  	politeClient := &http.Client{
    70  		Transport: util.NewPoliteTripper(),
    71  	}
    72  
    73  	resp, err := politeClient.Get(sitemapURL)
    74  	if err != nil {
    75  		return nil, err
    76  	}
    77  	defer resp.Body.Close()
    78  	if resp.StatusCode != 200 {
    79  		return nil, fmt.Errorf("%s%s: http error %d", indent, sitemapURL, resp.StatusCode)
    80  	}
    81  
    82  	dat := Result{}
    83  	dec := xml.NewDecoder(resp.Body)
    84  	err = dec.Decode(&dat)
    85  	if err != nil {
    86  		return nil, err
    87  	}
    88  	resp.Body.Close()
    89  
    90  	result = append(result, dat.URLs...)
    91  	for _, sitemap := range dat.Sitemaps {
    92  
    93  		if !strings.Contains(sitemap.LastMod, "2014") {
    94  			fmt.Fprintf(os.Stderr, "%sSKIP %s (%s)\n", indent, sitemap.Loc, sitemap.LastMod)
    95  			continue
    96  		}
    97  		foo, err := doit(sitemap.Loc, indent+"  ")
    98  		if err != nil {
    99  			return nil, err
   100  		}
   101  		fmt.Fprintf(os.Stderr, "%s%s (%s) => %d urls\n", indent, sitemap.Loc, sitemap.LastMod, len(foo))
   102  		result = append(result, foo...)
   103  	}
   104  	return result, nil
   105  }