github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/sitemap/sitemap.go (about) 1 package main 2 3 /* 4 TODO: 5 - specify date range of interest 6 - skip index files which look like they're outside the date range (often 7 they'll have a date in the filename) 8 - handle gzip files 9 - read robots.txt to get sitemap files 10 - add support for googlenews extra fields 11 - factor out for use as a discovery mechanism in scrapeomat 12 */ 13 14 import ( 15 // "encoding/csv" 16 "encoding/xml" 17 "flag" 18 "fmt" 19 "github.com/bcampbell/arts/util" 20 "net/http" 21 "os" 22 "strings" 23 ) 24 25 type URL struct { 26 Loc string `xml:"loc"` 27 ChangeFreq string `xml:"changefreq"` 28 Priority string `xml:"priority"` 29 LastMod string `xml:"lastmod"` 30 } 31 32 type URLSet struct { 33 URLs []URL `xml:"url"` 34 } 35 36 type Sitemap struct { 37 Loc string `xml:"loc"` 38 LastMod string `xml:"lastmod"` 39 } 40 type SitemapIndex struct { 41 Sitemaps []Sitemap `xml:"sitemap"` 42 } 43 44 type Result struct { 45 XMLName xml.Name 46 Sitemaps []Sitemap `xml:"sitemap"` 47 URLs []URL `xml:"url"` 48 /*URLSet []URLSet `xml:"urlset"` 49 itemapIndex []SitemapIndex `xml:"sitemapindex"` 50 */ 51 } 52 53 func main() { 54 flag.Parse() 55 for _, sitemapURL := range flag.Args() { 56 urls, err := doit(sitemapURL, "") 57 if err != nil { 58 fmt.Fprintln(os.Stderr, err) 59 os.Exit(1) 60 } 61 for _, u := range urls { 62 fmt.Println(u.LastMod, u.Loc) 63 } 64 } 65 } 66 67 func doit(sitemapURL string, indent string) ([]URL, error) { 68 result := make([]URL, 0) 69 politeClient := &http.Client{ 70 Transport: util.NewPoliteTripper(), 71 } 72 73 resp, err := politeClient.Get(sitemapURL) 74 if err != nil { 75 return nil, err 76 } 77 defer resp.Body.Close() 78 if resp.StatusCode != 200 { 79 return nil, fmt.Errorf("%s%s: http error %d", indent, sitemapURL, resp.StatusCode) 80 } 81 82 dat := Result{} 83 dec := xml.NewDecoder(resp.Body) 84 err = dec.Decode(&dat) 85 if err != nil { 86 return nil, err 87 } 88 resp.Body.Close() 89 90 result = append(result, dat.URLs...) 91 for _, sitemap := range dat.Sitemaps { 92 93 if !strings.Contains(sitemap.LastMod, "2014") { 94 fmt.Fprintf(os.Stderr, "%sSKIP %s (%s)\n", indent, sitemap.Loc, sitemap.LastMod) 95 continue 96 } 97 foo, err := doit(sitemap.Loc, indent+" ") 98 if err != nil { 99 return nil, err 100 } 101 fmt.Fprintf(os.Stderr, "%s%s (%s) => %d urls\n", indent, sitemap.Loc, sitemap.LastMod, len(foo)) 102 result = append(result, foo...) 103 } 104 return result, nil 105 }