github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/linkgrabber/main.go

package main

import (
	"flag"
	"fmt"
	"net/http"
	"net/url"
	"os"

	"github.com/andybalholm/cascadia"
	"github.com/bcampbell/arts/util"
	"golang.org/x/net/html"
)

var opts struct {
	linkSel   string
	followSel string
	verbose   bool
}

func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage:\n")
		fmt.Fprintf(os.Stderr, "%s [OPTIONS] URL(s)...\n", os.Args[0])
		fmt.Fprintf(os.Stderr, `
Scans the pages at the given URLs and dumps all the links out to stdout.

Input URLs can be absolute or relative - relative URLs are resolved
against the previous URL in the list.
e.g.:
   http://pseudopolisherald.com/ /politics /local /hubwards
is just fine.

`)
		flag.PrintDefaults()
	}

	flag.StringVar(&opts.linkSel, "l", "a", "css selector to find links to output")
	flag.StringVar(&opts.followSel, "f", "", "css selector of links to follow")
	flag.BoolVar(&opts.verbose, "v", false, "output extra info (on stderr)")
	flag.Parse()

	if flag.NArg() < 1 {
		fmt.Fprintf(os.Stderr, "ERROR: missing URL(s)\n")
		flag.Usage()
		os.Exit(1)
	}

	err := doit(flag.Args())
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
		os.Exit(1)
	}

	os.Exit(0)
}

// expandURLs produces a list of absolute URLs from a list of (perhaps) partial
// URLs. It uses the previous URL in the list as the context for the next.
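// For example (a hypothetical call, illustrating the resolution rule):
//
//	expandURLs([]string{"http://pseudopolisherald.com/", "/politics"})
//
// returns {"http://pseudopolisherald.com/", "http://pseudopolisherald.com/politics"}.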
func expandURLs(origURLs []string) ([]string, error) {
	prev := &url.URL{}
	cooked := make([]string, len(origURLs))
	for i, origURL := range origURLs {
		parsed, err := prev.Parse(origURL)
		if err != nil {
			return nil, fmt.Errorf("bad URL '%s': %s", origURL, err)
		}

		if !parsed.IsAbs() {
			return nil, fmt.Errorf("URL not absolute (and can't be guessed from previous) '%s'", origURL)
		}
		prev = parsed
		cooked[i] = parsed.String()
	}
	return cooked, nil
}

func doit(urls []string) error {
	linkSel, err := cascadia.Compile(opts.linkSel)
	if err != nil {
		return fmt.Errorf("bad link selector: %s", err)
	}

	var followSel cascadia.Selector
	if opts.followSel != "" {
		followSel, err = cascadia.Compile(opts.followSel)
		if err != nil {
			return fmt.Errorf("bad follow selector: %s", err)
		}
	}

	urls, err = expandURLs(urls)
	if err != nil {
		return err
	}

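	// use a "polite" transport so repeated fetches are throttled rather
	// than hammering the target host (see bcampbell/arts/util).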
	client := &http.Client{
		Transport: util.NewPoliteTripper(),
	}

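	// queued and visited are string sets - the empty struct{} values
	// occupy no storage.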
	queued := map[string]struct{}{}
	visited := map[string]struct{}{}

	for _, u := range urls {
		queued[u] = struct{}{}
	}

	errCnt := 0
	// while we have urls queued to scrape...
	// (deleting from and adding to queued while ranging over it is fine:
	// entries added mid-range may be skipped this pass, but the outer
	// loop re-scans until the queue is empty)
	for len(queued) > 0 {
		for u := range queued {
			found, follow, err := doPage(client, u, linkSel, followSel)

			// shift url into visited set
			visited[u] = struct{}{}
			delete(queued, u)
			if err != nil {
				fmt.Fprintf(os.Stderr, "FAILED: %s (%s)\n", u, err)
				errCnt++
				if errCnt > 10 {
					return fmt.Errorf("too many errors")
				}
				continue
			}
			if opts.verbose {
				fmt.Fprintf(os.Stderr, "%s (%d,%d)\n", u, len(found), len(follow))
			}

			// output any found links
			for _, l := range found {
				fmt.Println(l)
			}

			// queue up any links we want to follow (skipping pages already visited)
			for _, l := range follow {
				_, got := visited[l]
				if !got {
					queued[l] = struct{}{}
				}
			}
		}
	}

	return nil
}

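// doPage fetches and parses the page at pageURL, returning the links matched
// by linkSel and, if followSel is non-nil, the links matched by followSel.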
func doPage(client *http.Client, pageURL string, linkSel cascadia.Selector, followSel cascadia.Selector) ([]string, []string, error) {

	root, err := fetchAndParse(client, pageURL)
	if err != nil {
		return nil, nil, err
	}
	found, err := grabLinks(root, linkSel, pageURL)
	if err != nil {
		return nil, nil, err
	}

	follow := []string{}
	if followSel != nil {
		follow, err = grabLinks(root, followSel, pageURL)
		if err != nil {
			return nil, nil, err
		}
	}

	return found, follow, nil
}

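// fetchAndParse GETs u and parses the response body as HTML.
// Any response outside the 2xx range is treated as an error.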
func fetchAndParse(client *http.Client, u string) (*html.Node, error) {
	req, err := http.NewRequest("GET", u, nil)
	if err != nil {
		return nil, err
	}
	// NOTE: FT.com always returns 403 if no Accept header is present.
	// Seems like a reasonable thing to send anyway...
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")

	if opts.verbose {
		fmt.Fprintf(os.Stderr, "fetch %s\n", u)
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		err = fmt.Errorf("HTTP code %d (%s)", resp.StatusCode, u)
		return nil, err
	}

	return html.Parse(resp.Body)
}

// GetAttr retrieves the value of an attribute on a node.
// Returns empty string if the attribute doesn't exist.
func GetAttr(n *html.Node, attr string) string {
	for _, a := range n.Attr {
		if a.Key == attr {
			return a.Val
		}
	}
	return ""
}

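// grabLinks returns the href of each node matched by linkSel, resolved to an
// absolute URL against baseURL. Unparseable hrefs are reported on stderr and
// skipped rather than aborting the whole page.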
func grabLinks(root *html.Node, linkSel cascadia.Selector, baseURL string) ([]string, error) {
	u, err := url.Parse(baseURL)
	if err != nil {
		return nil, err
	}

	out := []string{}
	for _, a := range linkSel.MatchAll(root) {
		link, err := getAbsHref(a, u)
		if err != nil {
			fmt.Fprintf(os.Stderr, "%s BAD link: '%s'\n", baseURL, err)
			continue
		}
		out = append(out, link)
	}
	return out, nil
}

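// getAbsHref resolves an anchor's href attribute against baseURL.
// Note that an anchor with no href yields baseURL itself, since
// resolving an empty reference returns the base.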
func getAbsHref(anchor *html.Node, baseURL *url.URL) (string, error) {
	h := GetAttr(anchor, "href")
	absURL, err := baseURL.Parse(h)
	if err != nil {
		return "", fmt.Errorf("bad href (%s): %s", h, err)
	}
	return absURL.String(), nil
}