github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/sitemapwalker/main.go (about) 1 package main 2 3 import ( 4 "compress/gzip" 5 "encoding/xml" 6 "flag" 7 "fmt" 8 "github.com/bcampbell/arts/util" 9 "io" 10 "io/ioutil" 11 "net/http" 12 "net/url" 13 "os" 14 "time" 15 ) 16 17 var opts struct { 18 nonrecursive bool 19 verbose bool 20 21 fromDate string 22 toDate string 23 filterSitemap bool 24 from time.Time 25 to time.Time 26 27 maxErrs int 28 } 29 30 type sitemapfile struct { 31 SitemapIndex `xml:"sitemapindex"` 32 URLset `xml:"urlset"` 33 } 34 35 type SitemapIndex struct { 36 //XMLName xml.Name `xml:"sitemapindex"` 37 Sitemap []struct { 38 Loc string `xml:"loc"` 39 LastMod string `xml:"lastmod"` 40 } `xml:"sitemap"` 41 } 42 type URLset struct { 43 //XMLName xml.Name `xml:"urlset"` 44 URL []struct { 45 Loc string `xml:"loc"` 46 LastMod string `xml:"lastmod"` 47 } `xml:"url"` 48 } 49 50 func usage() { 51 52 fmt.Fprintf(os.Stderr, `Usage: %s [OPTIONS] [URL] ... 53 Find pages by scanning sitemap files, starting at the url(s) given. 54 -to and/or -from can be use to give an (inclusive) range. 55 <url> lastmod entries are rejected if they are outside that range. 56 <sitemap> lastmod entries are checked against the range only if -s flag is used. 57 58 Options: 59 `, os.Args[0]) 60 61 flag.PrintDefaults() 62 } 63 64 var stats struct { 65 fetchCnt int 66 fetchErrs int 67 parseErrs int // Number of pages which failed to parse as XML 68 fetchRejected int 69 artsAccepted int 70 artsRejected int 71 } 72 73 //u := "https://www.thesun.co.uk/sitemap.xml?yyyy=2016&mm=06&dd=20" 74 func main() { 75 // use a politetripper to throttle the request frequency 76 client := &http.Client{ 77 Transport: util.NewPoliteTripper(), 78 } 79 80 flag.Usage = usage 81 flag.StringVar(&opts.fromDate, "from", "", "ignore links with LastMod before YYYY-MM-DD date") 82 flag.StringVar(&opts.toDate, "to", "", "ignore links with LastMod after YYYY-MM-DD date") 83 flag.BoolVar(&opts.filterSitemap, "s", false, "apply date filter to <sitemap> lastmod too?") 84 flag.BoolVar(&opts.nonrecursive, "n", false, "non-recursive (don't follow <sitemap> links)") 85 flag.IntVar(&opts.maxErrs, "e", 10, "maximum errors before bailing out (XML parsing errors don't count)") 86 flag.BoolVar(&opts.verbose, "v", false, "verbose") 87 flag.Parse() 88 89 var err error 90 if opts.fromDate != "" { 91 opts.from, err = time.Parse("2006-01-02", opts.fromDate) 92 if err != nil { 93 fmt.Fprintf(os.Stderr, "ERROR: bad 'from' date (%s)\n", err) 94 os.Exit(1) 95 } 96 } 97 if opts.toDate != "" { 98 opts.to, err = time.Parse("2006-01-02", opts.toDate) 99 if err != nil { 100 fmt.Fprintf(os.Stderr, "ERROR: bad 'to' date (%s)\n", err) 101 os.Exit(1) 102 } 103 opts.to.AddDate(0, 0, 1) 104 } 105 106 if flag.NArg() == 0 { 107 fmt.Fprintf(os.Stderr, "ERROR: no files or urls specified\n") 108 os.Exit(1) 109 } 110 // now run upon each supplied file or url 111 for _, u := range flag.Args() { 112 113 err = doit(client, u) 114 if err != nil { 115 fmt.Fprintf(os.Stderr, "ERROR: %s\n", err) 116 os.Exit(1) 117 } 118 } 119 120 if opts.verbose { 121 fmt.Fprintf(os.Stderr, "fetched %d files (%d errors, %d skipped, %d badxml), yielded %d links (%d rejected)\n", 122 stats.fetchCnt, stats.fetchErrs, stats.fetchRejected, stats.parseErrs, stats.artsAccepted, stats.artsRejected) 123 } 124 } 125 126 // try a couple of likely formats for LastMod timestamps 127 func parseLastMod(lastMod string) (time.Time, error) { 128 var t time.Time 129 var err error 130 131 fmts := []string{time.RFC3339, 132 "2006-01-02T15:04:05Z0700", // eg 2021-04-30T18:10:59Z 133 "2006-01-02T15:04Z0700", // eg 2021-04-30T18:10Z 134 "2006-01-02", 135 } 136 for _, fmt := range fmts { 137 t, err = time.Parse(fmt, lastMod) 138 if err == nil { 139 return t, nil 140 } 141 } 142 return t, err 143 } 144 145 func handleFetchErr(u string, err error) error { 146 stats.fetchErrs++ 147 fmt.Fprintf(os.Stderr, "ERROR fetching %s - %s\n", u, err) 148 if stats.fetchErrs < opts.maxErrs { 149 return nil // keep going. 150 } 151 return fmt.Errorf("Too many errors.") 152 } 153 154 // fetch and process a single sitemap xml (file or url) 155 func doit(client *http.Client, u string) error { 156 if opts.verbose { 157 fmt.Fprintf(os.Stderr, "fetching %s\n", u) 158 } 159 160 foo, err := url.Parse(u) 161 if err != nil { 162 return handleFetchErr(u, err) 163 } 164 165 var in io.ReadCloser 166 if foo.Scheme == "" { 167 in, err = os.Open(u) 168 if err != nil { 169 return handleFetchErr(u, err) 170 } 171 } else { 172 req, err := http.NewRequest("GET", u, nil) 173 if err != nil { 174 return handleFetchErr(u, err) 175 } 176 req.Header.Set("Accept", "*/*") 177 req.Header.Set("User-Agent", "steno/0.1") 178 179 resp, err := client.Do(req) 180 if err != nil { 181 return handleFetchErr(u, err) 182 } 183 184 if resp.StatusCode < 200 || resp.StatusCode >= 300 { 185 return handleFetchErr(u, fmt.Errorf("http error %d", resp.StatusCode)) 186 } 187 188 // handle gzipped files 189 // (net/http handles compressed Content-Encoding, but this is different. 190 // some sites have sitemap.xml.gz files, which are delivered to us 191 // verbatim, ie encoded. 192 // (Might also be worth checking for .gz extension in URL? Meh. Deal 193 // with it if we see a case in the wild not covered by Content-Type). 194 if resp.Header.Get("Content-Type") == "application/x-gzip" { 195 dec, err := gzip.NewReader(resp.Body) 196 if err != nil { 197 return fmt.Errorf("gunzip failed: %s", err) 198 } 199 in = ioutil.NopCloser(dec) 200 } else { 201 in = resp.Body 202 } 203 } 204 defer in.Close() 205 206 stats.fetchCnt++ 207 208 result, err := parse(in) 209 if err != nil { 210 stats.parseErrs++ 211 fmt.Fprintf(os.Stderr, "skipping %s - failed to parse (%s)", u, err) 212 return nil // keep going! 213 } 214 215 // dump out article links 216 for _, art := range result.URLset.URL { 217 accept := true 218 if (!opts.from.IsZero() || !opts.to.IsZero()) && art.LastMod != "" { 219 var t time.Time 220 t, err = parseLastMod(art.LastMod) 221 if err == nil { 222 //fmt.Fprintf(os.Stderr, "Parsed '%s' -> %v (from: %v to: %v)\n", art.LastMod, t, opts.from, opts.to) 223 if !opts.from.IsZero() && t.Before(opts.from) { 224 //fmt.Fprintf(os.Stderr, "Reject '%s' (too early)\n", art.LastMod) 225 accept = false // too early 226 } 227 if !opts.to.IsZero() && (t.Equal(opts.to) || t.After(opts.to)) { 228 accept = false // too late 229 //fmt.Fprintf(os.Stderr, "Reject '%s' (too late)\n", art.LastMod) 230 } 231 } else { 232 fmt.Fprintf(os.Stderr, "WARN: bad LastMod (%s) in %s (rejecting)\n", art.LastMod, u) 233 accept = false 234 } 235 236 } 237 238 if accept { 239 stats.artsAccepted++ 240 fmt.Println(art.Loc) 241 } else { 242 stats.artsRejected++ 243 } 244 245 } 246 247 // go through any referenced sitemap files 248 for _, foo := range result.SitemapIndex.Sitemap { 249 if opts.nonrecursive { 250 //fmt.Println(foo.Loc) 251 } else { 252 accept := true 253 if opts.filterSitemap && (!opts.from.IsZero() || !opts.to.IsZero()) && foo.LastMod != "" { 254 var t time.Time 255 t, err = parseLastMod(foo.LastMod) 256 if err == nil { 257 if !opts.from.IsZero() && t.Before(opts.from) { 258 accept = false // too early 259 } 260 if !opts.to.IsZero() && (t.Equal(opts.to) || t.After(opts.to)) { 261 accept = false // too late 262 } 263 } else { 264 fmt.Fprintf(os.Stderr, "WARN: bad LastMod in <sitemap> (%s) in %s (rejecting)\n", foo.LastMod, u) 265 accept = false 266 } 267 268 } 269 270 if accept { 271 err := doit(client, foo.Loc) 272 if err != nil { 273 return err 274 } 275 } else { 276 if opts.verbose { 277 fmt.Fprintf(os.Stderr, "skipping <sitemap> %s (lastmod=%s)\n", foo.Loc, foo.LastMod) 278 } 279 stats.fetchRejected++ 280 } 281 } 282 } 283 return nil 284 } 285 286 func parse(in io.Reader) (*sitemapfile, error) { 287 dec := xml.NewDecoder(in) 288 result := sitemapfile{} 289 290 err := dec.Decode(&result) 291 if err != nil { 292 return nil, fmt.Errorf("decode failed: %s", err) 293 } 294 295 return &result, nil 296 }