github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/linkgrabber/main.go

package main

import (
	"flag"
	"fmt"
	"github.com/andybalholm/cascadia"
	"github.com/bcampbell/arts/util"
	"golang.org/x/net/html"
	"net/http"
	"net/url"
	"os"
)

var opts struct {
	linkSel   string
	followSel string
	verbose   bool
}

func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage:\n")
		fmt.Fprintf(os.Stderr, "%s [OPTIONS] URL(s)...\n", os.Args[0])
		fmt.Fprintf(os.Stderr, `
Scans the pages at the given URLs and dumps all the links out to stdout.

Input URLs can be absolute or relative - relative links will be
considered relative to the previous URL in the list.
eg:
http://pseudopolisherald.com/ /politics /local /hubwards
is just fine.

`)
		flag.PrintDefaults()
	}

	flag.StringVar(&opts.linkSel, "l", "a", "css selector to find links to output")
	flag.StringVar(&opts.followSel, "f", "", "css selector of links to follow")
	flag.BoolVar(&opts.verbose, "v", false, "output extra info (on stderr)")
	flag.Parse()

	if flag.NArg() < 1 {
		fmt.Fprintf(os.Stderr, "ERROR: missing URL(s)\n")
		flag.Usage()
		os.Exit(1)
	}

	err := doit(flag.Args())
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
		os.Exit(1)
	}

	os.Exit(0)
}

// expandURLs produces a list of absolute URLs from a list of (perhaps) partial
// URLs. It uses the previous URL in the list as the context for the next.
func expandURLs(origURLs []string) ([]string, error) {
	prev := &url.URL{}
	cooked := make([]string, len(origURLs))
	for i, origURL := range origURLs {
		parsed, err := prev.Parse(origURL)
		if err != nil {
			return nil, fmt.Errorf("bad URL '%s'", origURL)
		}

		if !parsed.IsAbs() {
			return nil, fmt.Errorf("URL not absolute (and can't be guessed from previous) '%s'", origURL)
		}
		prev = parsed
		cooked[i] = parsed.String()
	}
	return cooked, nil
}

// doit compiles the CSS selectors, then scrapes each queued URL in turn,
// printing the links it finds to stdout.
func doit(urls []string) error {
	linkSel, err := cascadia.Compile(opts.linkSel)
	if err != nil {
		return fmt.Errorf("bad link selector: %s", err)
	}

	var followSel cascadia.Selector
	if opts.followSel != "" {
		followSel, err = cascadia.Compile(opts.followSel)
		if err != nil {
			return fmt.Errorf("bad follow selector: %s", err)
		}
	}

	urls, err = expandURLs(urls)
	if err != nil {
		return err
	}

	client := &http.Client{
		Transport: util.NewPoliteTripper(),
	}

	queued := map[string]struct{}{}
	visited := map[string]struct{}{}

	for _, u := range urls {
		queued[u] = struct{}{}
	}

	errCnt := 0
	// while we have urls queued to scrape...
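	// Each pass moves URLs from 'queued' into 'visited' as they are fetched,
	// and any newly discovered follow-links are added back into 'queued', so
	// the outer loop keeps going until the queue drains (or too many fetches fail).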
	for len(queued) > 0 {
		for u := range queued {
			found, follow, err := doPage(client, u, linkSel, followSel)

			// shift url into visited set
			visited[u] = struct{}{}
			delete(queued, u)
			if err != nil {
				fmt.Fprintf(os.Stderr, "FAILED: %s (%s)\n", u, err)
				errCnt++
				if errCnt > 10 {
					return fmt.Errorf("too many errors")
				}
				continue
			}
			if opts.verbose {
				fmt.Fprintf(os.Stderr, "%s (%d,%d)\n", u, len(found), len(follow))
			}

			// output any found links
			for _, l := range found {
				fmt.Println(l)
			}

			// queue up any links we want to follow
			for _, l := range follow {
				_, got := visited[l]
				if !got {
					queued[l] = struct{}{}
				}
			}
		}
	}

	return nil
}

// doPage fetches and parses a single page, returning the links matched by
// linkSel (for output) and by followSel (to scrape next), both resolved
// against pageURL.
func doPage(client *http.Client, pageURL string, linkSel cascadia.Selector, followSel cascadia.Selector) ([]string, []string, error) {
	root, err := fetchAndParse(client, pageURL)
	if err != nil {
		return []string{}, []string{}, err
	}
	found, err := grabLinks(root, linkSel, pageURL)
	if err != nil {
		return []string{}, []string{}, err
	}

	follow := []string{}
	if followSel != nil {
		follow, err = grabLinks(root, followSel, pageURL)
		if err != nil {
			return []string{}, []string{}, err
		}
	}

	return found, follow, nil
}

// fetchAndParse performs an HTTP GET on u and parses the response body as HTML.
func fetchAndParse(client *http.Client, u string) (*html.Node, error) {
	req, err := http.NewRequest("GET", u, nil)
	if err != nil {
		return nil, err
	}
	// NOTE: FT.com always returns 403 if no Accept header is present.
	// Seems like a reasonable thing to send anyway...
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")

	// TODO: verbose flag!!!
	// fmt.Fprintf(os.Stderr, "fetch %s\n", u)

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		err = fmt.Errorf("HTTP code %d (%s)", resp.StatusCode, u)
		return nil, err
	}

	return html.Parse(resp.Body)
}

// GetAttr retrieves the value of an attribute on a node.
// Returns empty string if the attribute doesn't exist.
func GetAttr(n *html.Node, attr string) string {
	for _, a := range n.Attr {
		if a.Key == attr {
			return a.Val
		}
	}
	return ""
}

// grabLinks returns the absolute URLs of all links under root matched by
// linkSel, resolved against baseURL. Bad hrefs are reported to stderr and skipped.
func grabLinks(root *html.Node, linkSel cascadia.Selector, baseURL string) ([]string, error) {
	u, err := url.Parse(baseURL)
	if err != nil {
		return nil, err
	}

	out := []string{}
	for _, a := range linkSel.MatchAll(root) {
		link, err := getAbsHref(a, u)
		if err != nil {
			fmt.Fprintf(os.Stderr, "%s BAD link: '%s'\n", baseURL, err)
			continue
		}
		out = append(out, link)
	}
	return out, nil
}

// getAbsHref resolves an anchor's href attribute against baseURL.
func getAbsHref(anchor *html.Node, baseURL *url.URL) (string, error) {
	h := GetAttr(anchor, "href")
	absURL, err := baseURL.Parse(h)
	if err != nil {
		return "", fmt.Errorf("bad href (%s): %s", h, err)
	}
	return absURL.String(), nil
}