github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/discover/discover.go (about) 1 package discover 2 3 // 4 // 5 // TODO: 6 // should be able to guess article link format statistically 7 // handle/allow subdomains (eg: www1.politicalbetting.com) 8 // filter unwanted navlinks (eg "mirror.co.uk/all-about/fred bloggs") 9 // HTTP error handling 10 // multiple url formats (eg spectator has multiple cms's) 11 // logging 12 13 import ( 14 "errors" 15 "fmt" 16 "github.com/PuerkitoBio/purell" 17 "github.com/andybalholm/cascadia" 18 "golang.org/x/net/html" 19 "net/http" 20 "net/url" 21 // "os" 22 "regexp" 23 "strings" 24 ) 25 26 type Logger interface { 27 Printf(format string, v ...interface{}) 28 } 29 30 type NullLogger struct{} 31 32 func (l NullLogger) Printf(format string, v ...interface{}) { 33 } 34 35 type DiscovererDef struct { 36 Name string 37 URL string 38 // article urls to include - regexes 39 ArtPat []string 40 // article urls to exclude - regexes 41 XArtPat []string 42 43 // article url forms to include (eg "/YYYY/MM/SLUG.html") 44 ArtForm []string 45 // article url forms to exclude 46 XArtForm []string 47 48 // CSS selector to identify navigation links 49 NavSel string 50 // regexp patterns of pages to skip during link discovery 51 XNavPat []string 52 53 // css selector for elements to cull during article discovery 54 CruftSel string 55 56 // BaseErrorThreshold is starting number of http errors to accept before 57 // bailing out. default is 5 (and 0 is considered as unset, so default is applied) 58 // error threshold formula: base + 10% of successful request count 59 BaseErrorThreshold int 60 61 // Hostpat is a regex matching accepted domains 62 // if empty, reject everything on a different domain 63 HostPat string 64 65 // If NoStripQuery is set then article URLs won't have the query part zapped 66 NoStripQuery bool 67 68 // UserAgent string to use in HTTP requests 69 UserAgent string 70 } 71 72 type DiscoverStats struct { 73 ErrorCount int 74 FetchCount int 75 } 76 77 type Discoverer struct { 78 Name string 79 StartURL url.URL 80 ArtPats []*regexp.Regexp 81 XArtPats []*regexp.Regexp 82 NavLinkSel cascadia.Selector 83 XNavPats []*regexp.Regexp 84 CruftSel cascadia.Selector 85 BaseErrorThreshold int 86 StripFragments bool 87 StripQuery bool 88 HostPat *regexp.Regexp 89 UserAgent string 90 91 ErrorLog Logger 92 InfoLog Logger 93 Stats DiscoverStats 94 } 95 96 // compile a slice of strings into a slice of regexps 97 func buildRegExps(pats []string) ([]*regexp.Regexp, error) { 98 out := make([]*regexp.Regexp, len(pats)) 99 for idx, pat := range pats { 100 re, err := regexp.Compile(pat) 101 if err != nil { 102 return nil, err 103 } 104 out[idx] = re 105 } 106 return out, nil 107 } 108 109 func NewDiscoverer(cfg DiscovererDef) (*Discoverer, error) { 110 disc := &Discoverer{} 111 u, err := url.Parse(cfg.URL) 112 if err != nil { 113 return nil, err 114 } 115 disc.Name = cfg.Name 116 disc.StartURL = *u 117 // parse the regexp include/exclude rules 118 disc.ArtPats, err = buildRegExps(cfg.ArtPat) 119 if err != nil { 120 return nil, err 121 } 122 disc.XArtPats, err = buildRegExps(cfg.XArtPat) 123 if err != nil { 124 return nil, err 125 } 126 // parse the simplified include/exclude forms 127 for _, f := range cfg.ArtForm { 128 re, err := patToRegexp(f) 129 if err != nil { 130 return nil, err 131 } 132 disc.ArtPats = append(disc.ArtPats, re) 133 } 134 for _, f := range cfg.XArtForm { 135 re, err := patToRegexp(f) 136 if err != nil { 137 return nil, err 138 } 139 disc.XArtPats = append(disc.XArtPats, re) 140 } 141 142 if cfg.NavSel == "" { 143 disc.NavLinkSel = nil 144 } else { 145 sel, err := cascadia.Compile(cfg.NavSel) 146 if err != nil { 147 return nil, err 148 } 149 disc.NavLinkSel = sel 150 } 151 152 disc.XNavPats, err = buildRegExps(cfg.XNavPat) 153 if err != nil { 154 return nil, err 155 } 156 157 if cfg.CruftSel == "" { 158 disc.CruftSel = nil 159 } else { 160 sel, err := cascadia.Compile(cfg.CruftSel) 161 if err != nil { 162 return nil, err 163 } 164 disc.CruftSel = sel 165 } 166 167 disc.BaseErrorThreshold = cfg.BaseErrorThreshold 168 // treat base threshold of 0 as unset, and use a default 169 if disc.BaseErrorThreshold == 0 { 170 disc.BaseErrorThreshold = 5 171 } 172 173 if cfg.HostPat != "" { 174 re, err := regexp.Compile(cfg.HostPat) 175 if err != nil { 176 return nil, err 177 } 178 disc.HostPat = re 179 } 180 181 disc.UserAgent = cfg.UserAgent 182 183 // defaults 184 disc.StripFragments = true 185 disc.StripQuery = !cfg.NoStripQuery 186 disc.ErrorLog = NullLogger{} 187 disc.InfoLog = NullLogger{} 188 return disc, nil 189 } 190 191 var ErrQuit = errors.New("quit requested") 192 193 func (disc *Discoverer) Run(client *http.Client, quit <-chan struct{}) (LinkSet, error) { 194 // reset stats 195 disc.Stats = DiscoverStats{} 196 197 queued := make(LinkSet) // nav pages to scan for article links 198 seen := make(LinkSet) // nav pages we've scanned 199 arts := make(LinkSet) // article links we've found so far 200 201 queued.Add(disc.StartURL) 202 203 for len(queued) > 0 { 204 205 if quit != nil { 206 select { 207 case <-quit: 208 return nil, ErrQuit 209 default: 210 } 211 } 212 pageURL := queued.Pop() 213 seen.Add(pageURL) 214 // 215 216 root, err := disc.fetchAndParse(client, &pageURL) 217 if err != nil { 218 disc.ErrorLog.Printf("%s\n", err.Error()) 219 disc.Stats.ErrorCount++ 220 if disc.Stats.ErrorCount > disc.BaseErrorThreshold+(disc.Stats.FetchCount/10) { 221 return nil, errors.New("Error threshold exceeded") 222 } else { 223 continue 224 } 225 } 226 disc.Stats.FetchCount++ 227 228 // debugging hack - dump out html we into files 229 /* 230 dumpFilename := fmt.Sprintf("dump%03d.html", disc.Stats.FetchCount) 231 dump, err := os.Create(dumpFilename) 232 if err != nil { 233 fmt.Fprintf(os.Stderr, "dump err: %s\n", err) 234 } else { 235 err = html.Render(dump, root) 236 if err != nil { 237 fmt.Fprintf(os.Stderr, "dump render err: %s\n", err) 238 } else { 239 fmt.Printf("%s => %s\n", pageURL.String(), dumpFilename) 240 } 241 dump.Close() 242 } 243 */ 244 // end debugging hack 245 246 // remove cruft from page before discovery 247 if disc.CruftSel != nil { 248 for _, cruft := range disc.CruftSel.MatchAll(root) { 249 if cruft.Parent != nil { // check to handle nested cruft... 250 cruft.Parent.RemoveChild(cruft) 251 } 252 } 253 } 254 255 navLinks, err := disc.findNavLinks(&pageURL, root) 256 if err != nil { 257 return nil, err 258 } 259 for navLink, _ := range navLinks { 260 if _, got := seen[navLink]; !got { 261 queued.Add(navLink) 262 } 263 } 264 265 foo, err := disc.findArticles(&pageURL, root) 266 if err != nil { 267 return nil, err 268 } 269 arts.Merge(foo) 270 271 disc.InfoLog.Printf("Visited %s, found %d articles\n", pageURL.String(), len(foo)) 272 } 273 274 return arts, nil 275 } 276 277 func (disc *Discoverer) fetchAndParse(c *http.Client, pageURL *url.URL) (*html.Node, error) { 278 req, err := http.NewRequest("GET", pageURL.String(), nil) 279 if err != nil { 280 return nil, err 281 } 282 // NOTE: FT.com always returns 403 if no Accept header is present. 283 // Seems like a reasonable thing to send anyway... 284 // req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") 285 req.Header.Set("Accept", "*/*") 286 if disc.UserAgent != "" { 287 req.Header.Set("User-Agent", disc.UserAgent) 288 } 289 290 resp, err := c.Do(req) 291 if err != nil { 292 return nil, err 293 } 294 defer resp.Body.Close() 295 296 if resp.StatusCode < 200 || resp.StatusCode >= 300 { 297 err = errors.New(fmt.Sprintf("HTTP code %d (%s)", resp.StatusCode, pageURL.String())) 298 299 return nil, err 300 301 } 302 303 root, err := html.Parse(resp.Body) 304 if err != nil { 305 return nil, err 306 } 307 308 return root, nil 309 } 310 311 var aSel cascadia.Selector = cascadia.MustCompile("a") 312 313 func (disc *Discoverer) findArticles(baseURL *url.URL, root *html.Node) (LinkSet, error) { 314 arts := make(LinkSet) 315 for _, a := range aSel.MatchAll(root) { 316 317 rawURL := GetAttr(a, "href") 318 u, err := disc.CookArticleURL(baseURL, rawURL) 319 if err != nil { 320 continue 321 } 322 arts[*u] = true 323 } 324 return arts, nil 325 } 326 327 func (disc *Discoverer) CookArticleURL(baseURL *url.URL, artLink string) (*url.URL, error) { 328 // parse, extending to absolute 329 u, err := baseURL.Parse(artLink) 330 if err != nil { 331 return nil, err 332 } 333 // apply our sanitising rules for this site 334 if disc.StripFragments { 335 u.Fragment = "" 336 } 337 if disc.StripQuery { 338 u.RawQuery = "" 339 } 340 341 // normalise url (strip trailing /, etc) 342 normalised := purell.NormalizeURL(u, purell.FlagsUsuallySafeGreedy) 343 // need it back as a url.URL 344 u, err = url.Parse(normalised) 345 if err != nil { 346 return nil, err 347 } 348 349 // on a host we accept? 350 if !disc.isHostGood(u.Host) { 351 return nil, fmt.Errorf("bad host (%s)", u.Host) 352 } 353 354 // matches one of our url forms? 355 foo := u.RequestURI() 356 accept := false 357 for _, pat := range disc.ArtPats { 358 if pat.MatchString(foo) { 359 accept = true 360 break 361 } 362 } 363 if !accept { 364 return nil, fmt.Errorf("non-article") 365 } 366 367 for _, pat := range disc.XArtPats { 368 if pat.MatchString(foo) { 369 //disc.InfoLog.Printf("reject %s (%s)\n", artLink, pat) 370 return nil, fmt.Errorf("match %s", pat) 371 } 372 } 373 374 return u, nil 375 } 376 377 func (disc *Discoverer) findNavLinks(pageURL *url.URL, root *html.Node) (LinkSet, error) { 378 navLinks := make(LinkSet) 379 if disc.NavLinkSel == nil { 380 return navLinks, nil 381 } 382 for _, a := range disc.NavLinkSel.MatchAll(root) { 383 link, err := pageURL.Parse(GetAttr(a, "href")) 384 if err != nil { 385 continue 386 } 387 388 if !disc.isHostGood(link.Host) { 389 continue 390 } 391 392 // skip excluded nav links 393 skip := false 394 for _, pat := range disc.XNavPats { 395 if pat.MatchString(link.RequestURI()) { 396 skip = true 397 break 398 } 399 } 400 if skip { 401 continue 402 } 403 404 link.Fragment = "" 405 406 navLinks[*link] = true 407 } 408 return navLinks, nil 409 } 410 411 // is host domain one we'll accept? 412 func (disc *Discoverer) isHostGood(host string) bool { 413 if disc.HostPat != nil { 414 return disc.HostPat.MatchString(host) 415 } 416 return host == disc.StartURL.Host 417 } 418 419 // GetAttr retrieved the value of an attribute on a node. 420 // Returns empty string if attribute doesn't exist. 421 func GetAttr(n *html.Node, attr string) string { 422 for _, a := range n.Attr { 423 if a.Key == attr { 424 return a.Val 425 } 426 } 427 return "" 428 } 429 430 // GetTextContent recursively fetches the text for a node 431 func GetTextContent(n *html.Node) string { 432 if n.Type == html.TextNode { 433 return n.Data 434 } 435 txt := "" 436 for child := n.FirstChild; child != nil; child = child.NextSibling { 437 txt += GetTextContent(child) 438 } 439 440 return txt 441 } 442 443 // CompressSpace reduces all whitespace sequences (space, tabs, newlines etc) in a string to a single space. 444 // Leading/trailing space is trimmed. 445 // Has the effect of converting multiline strings to one line. 446 func CompressSpace(s string) string { 447 multispacePat := regexp.MustCompile(`[\s]+`) 448 s = strings.TrimSpace(multispacePat.ReplaceAllLiteralString(s, " ")) 449 return s 450 }