github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/discover/discover.go

package discover

// TODO:
//   should be able to guess article link format statistically
//   handle/allow subdomains (eg: www1.politicalbetting.com)
//   filter unwanted navlinks (eg "mirror.co.uk/all-about/fred bloggs")
//   HTTP error handling
//   multiple url formats (eg spectator has multiple cms's)
//   logging

import (
	"errors"
	"fmt"
	"github.com/PuerkitoBio/purell"
	"github.com/andybalholm/cascadia"
	"golang.org/x/net/html"
	"net/http"
	"net/url"
	//	"os"
	"regexp"
	"strings"
)

// Logger is the minimal logging interface required by Discoverer.
type Logger interface {
	Printf(format string, v ...interface{})
}

// NullLogger is a Logger which discards everything written to it.
type NullLogger struct{}

func (l NullLogger) Printf(format string, v ...interface{}) {
}

// DiscovererDef holds the site-specific configuration used to build a Discoverer.
type DiscovererDef struct {
	Name string
	URL  string
	// article urls to include - regexes
	ArtPat []string
	// article urls to exclude - regexes
	XArtPat []string

	// article url forms to include (eg "/YYYY/MM/SLUG.html")
	ArtForm []string
	// article url forms to exclude
	XArtForm []string

	// CSS selector to identify navigation links
	NavSel string
	// regexp patterns of pages to skip during link discovery
	XNavPat []string

	// css selector for elements to cull during article discovery
	CruftSel string

	// BaseErrorThreshold is the base number of HTTP errors to accept
	// before bailing out. Default is 5 (0 is treated as unset, so the
	// default applies).
	// Error threshold formula: base + 10% of successful fetch count
	// (eg with the default base of 5, after 100 successful fetches up to
	// 15 errors are tolerated).
	BaseErrorThreshold int

	// HostPat is a regex matching accepted domains.
	// If empty, links on any domain other than the start URL's are rejected.
	HostPat string

	// If NoStripQuery is set then article URLs won't have the query part zapped
	NoStripQuery bool

	// UserAgent string to use in HTTP requests
	UserAgent string
}
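
// An illustrative configuration sketch (the site, patterns and selectors
// here are hypothetical, purely to show how the fields fit together).
// ArtForm entries are expanded to regexes by patToRegexp (defined elsewhere
// in this package), so "/YYYY/MM/SLUG.html" is assumed to match paths like
// "/2022/08/some-story.html":
//
//	cfg := DiscovererDef{
//		Name:    "examplenews",
//		URL:     "http://www.example.com/",
//		ArtPat:  []string{`^/news/\d+/`},
//		ArtForm: []string{"/YYYY/MM/SLUG.html"},
//		NavSel:  "nav a, .pagination a",
//		XNavPat: []string{`^/tag/`},
//	}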

// DiscoverStats holds counts collected during a Run.
type DiscoverStats struct {
	ErrorCount int
	FetchCount int
}

// Discoverer holds the compiled configuration and state for crawling a site.
type Discoverer struct {
	Name               string
	StartURL           url.URL
	ArtPats            []*regexp.Regexp
	XArtPats           []*regexp.Regexp
	NavLinkSel         cascadia.Selector
	XNavPats           []*regexp.Regexp
	CruftSel           cascadia.Selector
	BaseErrorThreshold int
	StripFragments     bool
	StripQuery         bool
	HostPat            *regexp.Regexp
	UserAgent          string

	ErrorLog Logger
	InfoLog  Logger
	Stats    DiscoverStats
}

// buildRegExps compiles a slice of strings into a slice of regexps
func buildRegExps(pats []string) ([]*regexp.Regexp, error) {
	out := make([]*regexp.Regexp, len(pats))
	for idx, pat := range pats {
		re, err := regexp.Compile(pat)
		if err != nil {
			return nil, err
		}
		out[idx] = re
	}
	return out, nil
}

// NewDiscoverer compiles a DiscovererDef into a ready-to-use Discoverer.
func NewDiscoverer(cfg DiscovererDef) (*Discoverer, error) {
	disc := &Discoverer{}
	u, err := url.Parse(cfg.URL)
	if err != nil {
		return nil, err
	}
	disc.Name = cfg.Name
	disc.StartURL = *u
	// parse the regexp include/exclude rules
	disc.ArtPats, err = buildRegExps(cfg.ArtPat)
	if err != nil {
		return nil, err
	}
	disc.XArtPats, err = buildRegExps(cfg.XArtPat)
	if err != nil {
		return nil, err
	}
	// parse the simplified include/exclude forms
	for _, f := range cfg.ArtForm {
		re, err := patToRegexp(f)
		if err != nil {
			return nil, err
		}
		disc.ArtPats = append(disc.ArtPats, re)
	}
	for _, f := range cfg.XArtForm {
		re, err := patToRegexp(f)
		if err != nil {
			return nil, err
		}
		disc.XArtPats = append(disc.XArtPats, re)
	}

	if cfg.NavSel == "" {
		disc.NavLinkSel = nil
	} else {
		sel, err := cascadia.Compile(cfg.NavSel)
		if err != nil {
			return nil, err
		}
		disc.NavLinkSel = sel
	}

	disc.XNavPats, err = buildRegExps(cfg.XNavPat)
	if err != nil {
		return nil, err
	}

	if cfg.CruftSel == "" {
		disc.CruftSel = nil
	} else {
		sel, err := cascadia.Compile(cfg.CruftSel)
		if err != nil {
			return nil, err
		}
		disc.CruftSel = sel
	}

	disc.BaseErrorThreshold = cfg.BaseErrorThreshold
	// treat base threshold of 0 as unset, and use a default
	if disc.BaseErrorThreshold == 0 {
		disc.BaseErrorThreshold = 5
	}

	if cfg.HostPat != "" {
		re, err := regexp.Compile(cfg.HostPat)
		if err != nil {
			return nil, err
		}
		disc.HostPat = re
	}

	disc.UserAgent = cfg.UserAgent

	// defaults
	disc.StripFragments = true
	disc.StripQuery = !cfg.NoStripQuery
	disc.ErrorLog = NullLogger{}
	disc.InfoLog = NullLogger{}
	return disc, nil
}
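
// A minimal usage sketch, assuming the cfg from the example above
// (imports of log, os, net/http and time assumed):
//
//	disc, err := NewDiscoverer(cfg)
//	if err != nil {
//		log.Fatal(err)
//	}
//	disc.InfoLog = log.New(os.Stderr, "", 0) // any Logger implementation will do
//	arts, err := disc.Run(&http.Client{Timeout: 30 * time.Second}, nil)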

// ErrQuit is returned by Run when a quit signal is received.
var ErrQuit = errors.New("quit requested")

// Run crawls the site from StartURL, following navigation links and
// collecting article links until the queue is exhausted.
func (disc *Discoverer) Run(client *http.Client, quit <-chan struct{}) (LinkSet, error) {
	// reset stats
	disc.Stats = DiscoverStats{}

	queued := make(LinkSet) // nav pages to scan for article links
	seen := make(LinkSet)   // nav pages we've scanned
	arts := make(LinkSet)   // article links we've found so far

	queued.Add(disc.StartURL)

	for len(queued) > 0 {

		if quit != nil {
			select {
			case <-quit:
				return nil, ErrQuit
			default:
			}
		}
		pageURL := queued.Pop()
		seen.Add(pageURL)

		root, err := disc.fetchAndParse(client, &pageURL)
		if err != nil {
			disc.ErrorLog.Printf("%s\n", err.Error())
			disc.Stats.ErrorCount++
			if disc.Stats.ErrorCount > disc.BaseErrorThreshold+(disc.Stats.FetchCount/10) {
				return nil, errors.New("error threshold exceeded")
			}
			continue
		}
		disc.Stats.FetchCount++

		// debugging hack - dump out the html we fetched into files
		/*
			dumpFilename := fmt.Sprintf("dump%03d.html", disc.Stats.FetchCount)
			dump, err := os.Create(dumpFilename)
			if err != nil {
				fmt.Fprintf(os.Stderr, "dump err: %s\n", err)
			} else {
				err = html.Render(dump, root)
				if err != nil {
					fmt.Fprintf(os.Stderr, "dump render err: %s\n", err)
				} else {
					fmt.Printf("%s => %s\n", pageURL.String(), dumpFilename)
				}
				dump.Close()
			}
		*/
		// end debugging hack

		// remove cruft from page before discovery
		if disc.CruftSel != nil {
			for _, cruft := range disc.CruftSel.MatchAll(root) {
				if cruft.Parent != nil { // check to handle nested cruft...
					cruft.Parent.RemoveChild(cruft)
				}
			}
		}

		navLinks, err := disc.findNavLinks(&pageURL, root)
		if err != nil {
			return nil, err
		}
		for navLink := range navLinks {
			if _, got := seen[navLink]; !got {
				queued.Add(navLink)
			}
		}

		foo, err := disc.findArticles(&pageURL, root)
		if err != nil {
			return nil, err
		}
		arts.Merge(foo)

		disc.InfoLog.Printf("Visited %s, found %d articles\n", pageURL.String(), len(foo))
	}

	return arts, nil
}
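
// Sketch of cancelling a long crawl via the quit channel (illustrative;
// ties in with the ErrQuit handling above):
//
//	quit := make(chan struct{})
//	go func() {
//		time.Sleep(10 * time.Minute) // or wait for a signal
//		close(quit)
//	}()
//	arts, err := disc.Run(http.DefaultClient, quit)
//	if err == ErrQuit {
//		// Run returns (nil, ErrQuit) - partial results are discarded
//	}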

func (disc *Discoverer) fetchAndParse(c *http.Client, pageURL *url.URL) (*html.Node, error) {
	req, err := http.NewRequest("GET", pageURL.String(), nil)
	if err != nil {
		return nil, err
	}
	// NOTE: FT.com always returns 403 if no Accept header is present.
	// Seems like a reasonable thing to send anyway...
	// req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	req.Header.Set("Accept", "*/*")
	if disc.UserAgent != "" {
		req.Header.Set("User-Agent", disc.UserAgent)
	}

	resp, err := c.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return nil, fmt.Errorf("HTTP code %d (%s)", resp.StatusCode, pageURL.String())
	}

	root, err := html.Parse(resp.Body)
	if err != nil {
		return nil, err
	}

	return root, nil
}

var aSel cascadia.Selector = cascadia.MustCompile("a")

// findArticles collects all links on the page which look like articles.
func (disc *Discoverer) findArticles(baseURL *url.URL, root *html.Node) (LinkSet, error) {
	arts := make(LinkSet)
	for _, a := range aSel.MatchAll(root) {
		rawURL := GetAttr(a, "href")
		u, err := disc.CookArticleURL(baseURL, rawURL)
		if err != nil {
			continue
		}
		arts[*u] = true
	}
	return arts, nil
}

// CookArticleURL converts a raw link into an absolute, sanitised article
// URL, or returns an error if the link isn't an acceptable article.
func (disc *Discoverer) CookArticleURL(baseURL *url.URL, artLink string) (*url.URL, error) {
	// parse, extending to absolute
	u, err := baseURL.Parse(artLink)
	if err != nil {
		return nil, err
	}
	// apply our sanitising rules for this site
	if disc.StripFragments {
		u.Fragment = ""
	}
	if disc.StripQuery {
		u.RawQuery = ""
	}

	// normalise url (strip trailing /, etc)
	normalised := purell.NormalizeURL(u, purell.FlagsUsuallySafeGreedy)
	// need it back as a url.URL
	u, err = url.Parse(normalised)
	if err != nil {
		return nil, err
	}

	// on a host we accept?
	if !disc.isHostGood(u.Host) {
		return nil, fmt.Errorf("bad host (%s)", u.Host)
	}

	// matches one of our url forms?
	foo := u.RequestURI()
	accept := false
	for _, pat := range disc.ArtPats {
		if pat.MatchString(foo) {
			accept = true
			break
		}
	}
	if !accept {
		return nil, fmt.Errorf("non-article")
	}

	for _, pat := range disc.XArtPats {
		if pat.MatchString(foo) {
			//disc.InfoLog.Printf("reject %s (%s)\n", artLink, pat)
			return nil, fmt.Errorf("match %s", pat)
		}
	}

	return u, nil
}
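
// A worked example of what CookArticleURL does (assuming StripQuery and
// StripFragments are on, and a matching ArtPat of `^/\d{4}/\d{2}/`; the
// URLs are hypothetical):
//
//	base, _ := url.Parse("http://www.example.com/news/")
//	u, err := disc.CookArticleURL(base, "/2022/08/story.html?utm_source=x#comments")
//	// u is now "http://www.example.com/2022/08/story.html" - the query and
//	// fragment are stripped, and purell applies its "usually safe greedy"
//	// normalisations (lowercased host, trailing slash removed, etc)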

// findNavLinks collects links on the page which match the nav-link
// selector and aren't excluded by the XNav patterns.
func (disc *Discoverer) findNavLinks(pageURL *url.URL, root *html.Node) (LinkSet, error) {
	navLinks := make(LinkSet)
	if disc.NavLinkSel == nil {
		return navLinks, nil
	}
	for _, a := range disc.NavLinkSel.MatchAll(root) {
		link, err := pageURL.Parse(GetAttr(a, "href"))
		if err != nil {
			continue
		}

		if !disc.isHostGood(link.Host) {
			continue
		}

		// skip excluded nav links
		skip := false
		for _, pat := range disc.XNavPats {
			if pat.MatchString(link.RequestURI()) {
				skip = true
				break
			}
		}
		if skip {
			continue
		}

		link.Fragment = ""

		navLinks[*link] = true
	}
	return navLinks, nil
}

// is the host domain one we'll accept?
func (disc *Discoverer) isHostGood(host string) bool {
	if disc.HostPat != nil {
		return disc.HostPat.MatchString(host)
	}
	return host == disc.StartURL.Host
}
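
// For example, a HostPat handling the subdomain case from the TODO at the
// top of this file (hypothetical pattern, shown for illustration):
//
//	cfg.HostPat = `^(www\d*\.)?politicalbetting\.com$`
//	// accepts "politicalbetting.com", "www.politicalbetting.com" and
//	// "www1.politicalbetting.com"; with HostPat unset, only the exact
//	// start host is accepted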

// GetAttr retrieves the value of an attribute on a node.
// Returns an empty string if the attribute doesn't exist.
func GetAttr(n *html.Node, attr string) string {
	for _, a := range n.Attr {
		if a.Key == attr {
			return a.Val
		}
	}
	return ""
}

// GetTextContent recursively fetches the text for a node
func GetTextContent(n *html.Node) string {
	if n.Type == html.TextNode {
		return n.Data
	}
	txt := ""
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		txt += GetTextContent(child)
	}

	return txt
}

// CompressSpace reduces all whitespace sequences (spaces, tabs, newlines etc) in a string to a single space.
// Leading/trailing space is trimmed.
// Has the effect of converting multiline strings to one line.
func CompressSpace(s string) string {
	multispacePat := regexp.MustCompile(`\s+`)
	return strings.TrimSpace(multispacePat.ReplaceAllLiteralString(s, " "))
}
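
// For example:
//
//	CompressSpace("  Article\n\t headline  ") == "Article headline"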