github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/sitemapwalker/main.go (about)

     1  package main
     2  
     3  import (
     4  	"compress/gzip"
     5  	"encoding/xml"
     6  	"flag"
     7  	"fmt"
     8  	"github.com/bcampbell/arts/util"
     9  	"io"
    10  	"io/ioutil"
    11  	"net/http"
    12  	"net/url"
    13  	"os"
    14  	"time"
    15  )
    16  
// opts holds the command-line configuration; populated by the flag
// registration and parsing in main().
var opts struct {
	nonrecursive bool // -n: don't follow <sitemap> links
	verbose      bool // -v: progress/stats on stderr

	fromDate      string    // -from: raw YYYY-MM-DD string as given on cmdline
	toDate        string    // -to: raw YYYY-MM-DD string as given on cmdline
	filterSitemap bool      // -s: apply the date filter to <sitemap> lastmod too
	from          time.Time // parsed form of fromDate (zero if -from unset)
	to            time.Time // parsed form of toDate (zero if -to unset)

	maxErrs int // -e: fetch-error budget before bailing out
}
    29  
// sitemapfile is the decode target for a single sitemap XML document.
// A sitemap document's root element is either <sitemapindex> (pointing
// at further sitemap files) or <urlset> (listing page URLs); embedding
// both lets a single xml.Decode handle either kind, leaving the other
// member empty.
type sitemapfile struct {
	SitemapIndex `xml:"sitemapindex"`
	URLset       `xml:"urlset"`
}
    34  
// SitemapIndex mirrors a <sitemapindex> element: a list of child
// sitemap files, each with a location and an optional lastmod stamp.
// LastMod is kept as the raw string and parsed later (see parseLastMod).
type SitemapIndex struct {
	//XMLName xml.Name `xml:"sitemapindex"`
	Sitemap []struct {
		Loc     string `xml:"loc"`     // URL of the child sitemap file
		LastMod string `xml:"lastmod"` // raw timestamp; may be empty
	} `xml:"sitemap"`
}
// URLset mirrors a <urlset> element: the list of page URLs in a leaf
// sitemap file. LastMod is kept as the raw string and parsed later
// (see parseLastMod).
type URLset struct {
	//XMLName xml.Name `xml:"urlset"`
	URL []struct {
		Loc     string `xml:"loc"`     // URL of the page
		LastMod string `xml:"lastmod"` // raw timestamp; may be empty
	} `xml:"url"`
}
    49  
    50  func usage() {
    51  
    52  	fmt.Fprintf(os.Stderr, `Usage: %s [OPTIONS] [URL] ...
    53  Find pages by scanning sitemap files, starting at the url(s) given.
    54  -to and/or -from can be use to give an (inclusive) range.
    55  <url> lastmod entries are rejected if they are outside that range.
    56  <sitemap> lastmod entries are checked against the range only if -s flag is used.
    57  
    58  Options:
    59  `, os.Args[0])
    60  
    61  	flag.PrintDefaults()
    62  }
    63  
// stats accumulates run-wide counters, reported on stderr at exit
// when -v is given.
var stats struct {
	fetchCnt      int // sitemap files fetched (local file or http)
	fetchErrs     int // failed fetches (counted against the -e limit)
	parseErrs     int // Number of pages which failed to parse as XML
	fetchRejected int // <sitemap> entries skipped by the date filter (-s)
	artsAccepted  int // <url> entries printed to stdout
	artsRejected  int // <url> entries rejected by the date filter
}
    72  
    73  //u := "https://www.thesun.co.uk/sitemap.xml?yyyy=2016&mm=06&dd=20"
    74  func main() {
    75  	// use a politetripper to throttle the request frequency
    76  	client := &http.Client{
    77  		Transport: util.NewPoliteTripper(),
    78  	}
    79  
    80  	flag.Usage = usage
    81  	flag.StringVar(&opts.fromDate, "from", "", "ignore links with LastMod before YYYY-MM-DD date")
    82  	flag.StringVar(&opts.toDate, "to", "", "ignore links with LastMod after YYYY-MM-DD date")
    83  	flag.BoolVar(&opts.filterSitemap, "s", false, "apply date filter to <sitemap> lastmod too?")
    84  	flag.BoolVar(&opts.nonrecursive, "n", false, "non-recursive (don't follow <sitemap> links)")
    85  	flag.IntVar(&opts.maxErrs, "e", 10, "maximum errors before bailing out (XML parsing errors don't count)")
    86  	flag.BoolVar(&opts.verbose, "v", false, "verbose")
    87  	flag.Parse()
    88  
    89  	var err error
    90  	if opts.fromDate != "" {
    91  		opts.from, err = time.Parse("2006-01-02", opts.fromDate)
    92  		if err != nil {
    93  			fmt.Fprintf(os.Stderr, "ERROR: bad 'from' date (%s)\n", err)
    94  			os.Exit(1)
    95  		}
    96  	}
    97  	if opts.toDate != "" {
    98  		opts.to, err = time.Parse("2006-01-02", opts.toDate)
    99  		if err != nil {
   100  			fmt.Fprintf(os.Stderr, "ERROR: bad 'to' date (%s)\n", err)
   101  			os.Exit(1)
   102  		}
   103  		opts.to.AddDate(0, 0, 1)
   104  	}
   105  
   106  	if flag.NArg() == 0 {
   107  		fmt.Fprintf(os.Stderr, "ERROR: no files or urls specified\n")
   108  		os.Exit(1)
   109  	}
   110  	// now run upon each supplied file or url
   111  	for _, u := range flag.Args() {
   112  
   113  		err = doit(client, u)
   114  		if err != nil {
   115  			fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
   116  			os.Exit(1)
   117  		}
   118  	}
   119  
   120  	if opts.verbose {
   121  		fmt.Fprintf(os.Stderr, "fetched %d files (%d errors, %d skipped, %d badxml), yielded %d links (%d rejected)\n",
   122  			stats.fetchCnt, stats.fetchErrs, stats.fetchRejected, stats.parseErrs, stats.artsAccepted, stats.artsRejected)
   123  	}
   124  }
   125  
   126  // try a couple of likely formats for LastMod timestamps
   127  func parseLastMod(lastMod string) (time.Time, error) {
   128  	var t time.Time
   129  	var err error
   130  
   131  	fmts := []string{time.RFC3339,
   132  		"2006-01-02T15:04:05Z0700", // eg 2021-04-30T18:10:59Z
   133  		"2006-01-02T15:04Z0700",    // eg 2021-04-30T18:10Z
   134  		"2006-01-02",
   135  	}
   136  	for _, fmt := range fmts {
   137  		t, err = time.Parse(fmt, lastMod)
   138  		if err == nil {
   139  			return t, nil
   140  		}
   141  	}
   142  	return t, err
   143  }
   144  
   145  func handleFetchErr(u string, err error) error {
   146  	stats.fetchErrs++
   147  	fmt.Fprintf(os.Stderr, "ERROR fetching %s - %s\n", u, err)
   148  	if stats.fetchErrs < opts.maxErrs {
   149  		return nil // keep going.
   150  	}
   151  	return fmt.Errorf("Too many errors.")
   152  }
   153  
   154  // fetch and process a single sitemap xml (file or url)
   155  func doit(client *http.Client, u string) error {
   156  	if opts.verbose {
   157  		fmt.Fprintf(os.Stderr, "fetching %s\n", u)
   158  	}
   159  
   160  	foo, err := url.Parse(u)
   161  	if err != nil {
   162  		return handleFetchErr(u, err)
   163  	}
   164  
   165  	var in io.ReadCloser
   166  	if foo.Scheme == "" {
   167  		in, err = os.Open(u)
   168  		if err != nil {
   169  			return handleFetchErr(u, err)
   170  		}
   171  	} else {
   172  		req, err := http.NewRequest("GET", u, nil)
   173  		if err != nil {
   174  			return handleFetchErr(u, err)
   175  		}
   176  		req.Header.Set("Accept", "*/*")
   177  		req.Header.Set("User-Agent", "steno/0.1")
   178  
   179  		resp, err := client.Do(req)
   180  		if err != nil {
   181  			return handleFetchErr(u, err)
   182  		}
   183  
   184  		if resp.StatusCode < 200 || resp.StatusCode >= 300 {
   185  			return handleFetchErr(u, fmt.Errorf("http error %d", resp.StatusCode))
   186  		}
   187  
   188  		// handle gzipped files
   189  		// (net/http handles compressed Content-Encoding, but this is different.
   190  		// some sites have sitemap.xml.gz files, which are delivered to us
   191  		// verbatim, ie encoded.
   192  		// (Might also be worth checking for .gz extension in URL? Meh. Deal
   193  		// with it if we see a case in the wild not covered by Content-Type).
   194  		if resp.Header.Get("Content-Type") == "application/x-gzip" {
   195  			dec, err := gzip.NewReader(resp.Body)
   196  			if err != nil {
   197  				return fmt.Errorf("gunzip failed: %s", err)
   198  			}
   199  			in = ioutil.NopCloser(dec)
   200  		} else {
   201  			in = resp.Body
   202  		}
   203  	}
   204  	defer in.Close()
   205  
   206  	stats.fetchCnt++
   207  
   208  	result, err := parse(in)
   209  	if err != nil {
   210  		stats.parseErrs++
   211  		fmt.Fprintf(os.Stderr, "skipping %s - failed to parse (%s)", u, err)
   212  		return nil // keep going!
   213  	}
   214  
   215  	// dump out article links
   216  	for _, art := range result.URLset.URL {
   217  		accept := true
   218  		if (!opts.from.IsZero() || !opts.to.IsZero()) && art.LastMod != "" {
   219  			var t time.Time
   220  			t, err = parseLastMod(art.LastMod)
   221  			if err == nil {
   222  				//fmt.Fprintf(os.Stderr, "Parsed '%s' -> %v (from: %v to: %v)\n", art.LastMod, t, opts.from, opts.to)
   223  				if !opts.from.IsZero() && t.Before(opts.from) {
   224  					//fmt.Fprintf(os.Stderr, "Reject '%s' (too early)\n", art.LastMod)
   225  					accept = false // too early
   226  				}
   227  				if !opts.to.IsZero() && (t.Equal(opts.to) || t.After(opts.to)) {
   228  					accept = false // too late
   229  					//fmt.Fprintf(os.Stderr, "Reject '%s' (too late)\n", art.LastMod)
   230  				}
   231  			} else {
   232  				fmt.Fprintf(os.Stderr, "WARN: bad LastMod (%s) in %s (rejecting)\n", art.LastMod, u)
   233  				accept = false
   234  			}
   235  
   236  		}
   237  
   238  		if accept {
   239  			stats.artsAccepted++
   240  			fmt.Println(art.Loc)
   241  		} else {
   242  			stats.artsRejected++
   243  		}
   244  
   245  	}
   246  
   247  	// go through any referenced sitemap files
   248  	for _, foo := range result.SitemapIndex.Sitemap {
   249  		if opts.nonrecursive {
   250  			//fmt.Println(foo.Loc)
   251  		} else {
   252  			accept := true
   253  			if opts.filterSitemap && (!opts.from.IsZero() || !opts.to.IsZero()) && foo.LastMod != "" {
   254  				var t time.Time
   255  				t, err = parseLastMod(foo.LastMod)
   256  				if err == nil {
   257  					if !opts.from.IsZero() && t.Before(opts.from) {
   258  						accept = false // too early
   259  					}
   260  					if !opts.to.IsZero() && (t.Equal(opts.to) || t.After(opts.to)) {
   261  						accept = false // too late
   262  					}
   263  				} else {
   264  					fmt.Fprintf(os.Stderr, "WARN: bad LastMod in <sitemap> (%s) in %s (rejecting)\n", foo.LastMod, u)
   265  					accept = false
   266  				}
   267  
   268  			}
   269  
   270  			if accept {
   271  				err := doit(client, foo.Loc)
   272  				if err != nil {
   273  					return err
   274  				}
   275  			} else {
   276  				if opts.verbose {
   277  					fmt.Fprintf(os.Stderr, "skipping <sitemap> %s (lastmod=%s)\n", foo.Loc, foo.LastMod)
   278  				}
   279  				stats.fetchRejected++
   280  			}
   281  		}
   282  	}
   283  	return nil
   284  }
   285  
   286  func parse(in io.Reader) (*sitemapfile, error) {
   287  	dec := xml.NewDecoder(in)
   288  	result := sitemapfile{}
   289  
   290  	err := dec.Decode(&result)
   291  	if err != nil {
   292  		return nil, fmt.Errorf("decode failed: %s", err)
   293  	}
   294  
   295  	return &result, nil
   296  }