github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/backfill/search.go

package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"strconv"

	"github.com/andybalholm/cascadia"
	"github.com/bcampbell/arts/util"
)

// Searcher steps through paginated search results, collecting article links.
// Pagination works in one of two mutually exclusive modes: incrementing a
// page-number query param (PageParam), or following a next-page link found
// via a CSS selector (NextPageSel).
type Searcher struct {
	SearchURL string
	Params    url.Values
	// which param holds pagenum (if set, used to iterate through search results)
	PageParam string
	// css selector to find next page
	NextPageSel cascadia.Selector
	// css selector to find article links
	ResultLinkSel cascadia.Selector
	//NoMoreResultsSel cascadia.Selector
	// Number of pages to step through (0=no limit)
	NPages int
}

// Run performs the search, writing one absolute article URL per line to out.
// It stops when the page limit is hit, a results page comes back empty, or
// no method of finding the next page is configured.
func (s *Searcher) Run(out io.Writer) error {

	if s.PageParam != "" && s.NextPageSel != nil {
		return fmt.Errorf("PageParam and NextPageSel are mutually exclusive")
	}

	client := &http.Client{Transport: util.NewPoliteTripper()}
	//found := []string{}

	page, err := url.Parse(s.SearchURL)
	if err != nil {
		return err
	}
	s.Params = page.Query()
	// TODO: better merging
	/*
		if len(s.Params) > 0 {
			page.RawQuery = s.Params.Encode()
		}
	*/

	pageCount := 0
	for {
		root, err := fetchAndParse(client, page.String())
		if err != nil {
			return fmt.Errorf("%s failed: %w", page.String(), err)
		}

		cnt := 0
		for _, a := range s.ResultLinkSel.MatchAll(root) {
			// embiggen relative URLs (resolve them against the current page)
			href := GetAttr(a, "href")
			absURL, err := page.Parse(href)
			if err != nil {
				fmt.Fprintf(os.Stderr, "skip bad url %s\n", href)
				continue
			}
			cnt++
			//found = append(found, absURL.String())
			fmt.Fprintln(out, absURL.String())
		}

		// finish if "no more results" indicator seen...
		/*
			if s.NoMoreResultsSel != nil {
				if len(s.NoMoreResultsSel(root)) > 0 {
					break
				}
			}
		*/

		// finish if page limit hit...
		pageCount++
		if s.NPages > 0 && pageCount >= s.NPages {
			break
		}

		// finish if no more results on returned page
		if cnt == 0 {
			break
		}

		// determine next page
		if s.NextPageSel != nil {
			// use pagination link to fetch URL of next page
			nexts := s.NextPageSel.MatchAll(root)
			if len(nexts) < 1 {
				return fmt.Errorf("no next-page link found on %s", page.String())
			}

			href := GetAttr(nexts[0], "href")
			nextURL, err := page.Parse(href)
			if err != nil {
				return fmt.Errorf("bad next url %s", href)
			}
			page = nextURL
		} else if s.PageParam != "" {
			// build new URL by incrementing pagenum
			pageNum := s.Params.Get(s.PageParam)
			if pageNum == "" {
				pageNum = "1"
			}
			n, err := strconv.Atoi(pageNum)
			if err != nil {
				return fmt.Errorf("bad page number '%s'", pageNum)
			}
			s.Params.Set(s.PageParam, strconv.Itoa(n+1))

			page.RawQuery = s.Params.Encode()
		} else {
			// no next-page method - just stop now
			break
		}
	}
	return nil
}
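
// Example usage: a minimal sketch. The search URL and CSS selectors below
// are hypothetical placeholders, chosen only to show how the two pagination
// modes are configured. cascadia.MustCompile panics on an invalid selector,
// so code handling untrusted selector strings may prefer cascadia.Compile.
//
//	// pagenum mode: bump the "page" query param on each iteration
//	s := &Searcher{
//		SearchURL:     "https://example.com/search?q=budget&page=1",
//		PageParam:     "page",
//		ResultLinkSel: cascadia.MustCompile("div.results a.headline"),
//		NPages:        10, // stop after ten pages
//	}
//	if err := s.Run(os.Stdout); err != nil {
//		log.Fatal(err)
//	}
//
//	// next-link mode: follow a pagination link instead (leave PageParam empty)
//	s = &Searcher{
//		SearchURL:     "https://example.com/search?q=budget",
//		NextPageSel:   cascadia.MustCompile("a[rel=next]"),
//		ResultLinkSel: cascadia.MustCompile("div.results a.headline"),
//	}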