github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/backfill/search.go

package main

import (
	"fmt"
	"github.com/andybalholm/cascadia"
	"github.com/bcampbell/arts/util"
	"io"
	"net/http"
	"net/url"
	"os"
	"strconv"
)

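// Searcher steps through a site's search-result pages, collecting article
// links. Pagination works either by following a "next page" link
// (NextPageSel) or by incrementing a page-number query parameter (PageParam).
//
// A minimal usage sketch (the URL, selector and parameter values here are
// hypothetical examples, not taken from any real site config):
//
//	s := &Searcher{
//		SearchURL:     "https://example.com/search?q=flooding",
//		PageParam:     "page",
//		ResultLinkSel: cascadia.MustCompile("h3.result a"),
//		NPages:        5,
//	}
//	err := s.Run(os.Stdout)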
type Searcher struct {
	SearchURL string
	Params    url.Values
	// which param holds pagenum (if set, used to iterate through search results)
	PageParam string
	// css selector to find next page
	NextPageSel cascadia.Selector
	// css selector to find article links
	ResultLinkSel cascadia.Selector
	//NoMoreResultsSel cascadia.Selector
	// Number of pages to step through (0=no limit)
	NPages int
}

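// Run walks the search results page by page, writing each article URL it
// finds to out. It stops when the page limit (NPages) is reached, when a
// page yields no result links, or when there is no way to find a next page.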
func (s *Searcher) Run(out io.Writer) error {

	if s.PageParam != "" && s.NextPageSel != nil {
		return fmt.Errorf("PageParam and NextPageSel are mutually exclusive")
	}

	// HTTP client using the "polite" transport from bcampbell/arts/util
	client := &http.Client{Transport: util.NewPoliteTripper()}
	//found := []string{}

	page, err := url.Parse(s.SearchURL)
	if err != nil {
		return err
	}
	// start from the query params embedded in the search URL
	// (note: this overwrites any Params set on the Searcher)
	s.Params = page.Query()
	// TODO: better merging
	/*
		if len(s.Params) > 0 {
			page.RawQuery = s.Params.Encode()
		}
	*/

	pageCount := 0
	for {
		root, err := fetchAndParse(client, page.String())
		if err != nil {
			return fmt.Errorf("%s failed: %s", page.String(), err)
		}

		cnt := 0
		for _, a := range s.ResultLinkSel.MatchAll(root) {
			// resolve relative URLs against the current page
			href := GetAttr(a, "href")
			absURL, err := page.Parse(href)
			if err != nil {
				fmt.Fprintf(os.Stderr, "skip bad url %s\n", href)
				continue
			}
			cnt++
			//			found = append(found, absURL.String())
			fmt.Fprintln(out, absURL.String())
		}

		// finish if "no more results" indicator seen...
		/*
			if s.NoMoreResultsSel != nil {
				if len(s.NoMoreResultsSel(root)) > 0 {
					break
				}
			}
		*/

		// finish if page limit hit...
		pageCount++
		if s.NPages > 0 && pageCount >= s.NPages {
			break
		}

		// finish if no more results on returned page
		if cnt == 0 {
			break
		}

		// determine next page
		if s.NextPageSel != nil {
			// use pagination link to fetch URL of next page
			nexts := s.NextPageSel.MatchAll(root)
			if len(nexts) < 1 {
				return fmt.Errorf("no next-page link found on %s", page.String())
			}

			href := GetAttr(nexts[0], "href")
			nextURL, err := page.Parse(href)
			if err != nil {
				return fmt.Errorf("bad next url %s", href)
			}
			page = nextURL
		} else if s.PageParam != "" {

			// build new URL by incrementing pagenum
			pageNum := s.Params.Get(s.PageParam)
			if pageNum == "" {
				pageNum = "1"
			}
			n, err := strconv.Atoi(pageNum)
			if err != nil {
				return fmt.Errorf("bad page number: '%s'", pageNum)
			}
			s.Params.Set(s.PageParam, strconv.Itoa(n+1))

			page.RawQuery = s.Params.Encode()
		} else {
			// no next-page method - just stop now
			break
		}
	}
	return nil
}