github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/slurp/slurp.go

package slurp

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strconv"
	"time"
)

// Slurper is a client for talking to a slurp server.
type Slurper struct {
	Client *http.Client
	// eg "http://localhost:12345/ukarticles"
	Location string
}

// NewSlurper returns a Slurper which talks to the slurp server at the given location.
func NewSlurper(location string) *Slurper {
	return &Slurper{Location: location}
}

// Msg is a single message - it can hold an article, an error message, or a
// "next" marker indicating there are more results to fetch.
type Msg struct {
	Article *Article `json:"article,omitempty"`
	Error   string   `json:"error,omitempty"`
	Next    struct {
		SinceID int `json:"since_id,omitempty"`
	} `json:"next,omitempty"`
}
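
// As a rough illustration (not from the original source), messages on the wire
// look something like one of the following, matching the struct tags above:
//
//	{"article": { ... article fields ... }}
//	{"error": "something went wrong"}
//	{"next": {"since_id": 12345}}
//
// The last form is the to-be-continued marker handled by Slurp and ArtStream.Next.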

// Filter describes which articles to fetch from the server.
type Filter struct {
	// date ranges are [from,to)
	PubFrom time.Time
	PubTo   time.Time
	//	AddedFrom time.Time
	//	AddedTo   time.Time
	PubCodes []string
	SinceID  int
	Count    int
}

// params encodes the filter as URL query parameters for the slurp API.
func (filt *Filter) params() url.Values {
	params := url.Values{}

	if !filt.PubFrom.IsZero() {
		params.Set("pubfrom", filt.PubFrom.Format(time.RFC3339))
	}
	if !filt.PubTo.IsZero() {
		params.Set("pubto", filt.PubTo.Format(time.RFC3339))
	}
	for _, pubCode := range filt.PubCodes {
		params.Add("pub", pubCode)
	}

	if filt.SinceID > 0 {
		params.Set("since_id", strconv.Itoa(filt.SinceID))
	}
	if filt.Count > 0 {
		params.Set("count", strconv.Itoa(filt.Count))
	}
	return params
}
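
// For illustration only (the values here are hypothetical, not from the
// original source): a filter such as
//
//	Filter{
//		PubFrom:  time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC),
//		PubTo:    time.Date(2022, 2, 1, 0, 0, 0, 0, time.UTC),
//		PubCodes: []string{"dailynews", "weeklynews"},
//		Count:    500,
//	}
//
// should encode (via params().Encode()) to a query string along the lines of:
//
//	count=500&pub=dailynews&pub=weeklynews&pubfrom=2022-01-01T00%3A00%3A00Z&pubto=2022-02-01T00%3A00%3A00Z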

// Slurp downloads a set of articles from the server and returns a channel
// which streams out messages, along with a cancel channel.
// Errors are returned via Msg - in the case of network errors, Slurp may
// synthesise fake Msgs containing the error message.
// It will repeatedly issue requests until all the results have been returned.
// The filter Count param is not the total - it is the maximum number of
// articles to return per request.
//
// Deprecated: prefer Slurp2, which returns an ArtStream instead.
func (s *Slurper) Slurp(filt *Filter) (chan Msg, chan struct{}) {

	params := filt.params()

	out := make(chan Msg)
	cancel := make(chan struct{}, 1) // buffered to prevent deadlock
	go func() {
		defer close(out)
		defer close(cancel)

		client := s.Client
		if client == nil {
			client = &http.Client{}
		}

		for {
			u := s.Location + "/api/slurp?" + params.Encode()
			// fmt.Printf("request: %s\n", u)
			resp, err := client.Get(u)
			if err != nil {
				out <- Msg{Error: fmt.Sprintf("HTTP Get failed: %s", err)}
				return
			}

			if resp.StatusCode < 200 || resp.StatusCode >= 300 {
				resp.Body.Close()
				out <- Msg{Error: fmt.Sprintf("HTTP Error: %s", resp.Status)}
				return
			}
			nextSinceID := 0
			dec := json.NewDecoder(resp.Body)
			for {
				// check for cancellation request
				select {
				case <-cancel:
					resp.Body.Close()
					out <- Msg{Error: "Cancelled"}
					return
				default:
				}

				var msg Msg
				if err := dec.Decode(&msg); err == io.EOF {
					break
				} else if err != nil {
					resp.Body.Close()
					out <- Msg{Error: fmt.Sprintf("Decode error: %s", err)}
					return
				}

				// is it a to-be-continued message?
				if msg.Next.SinceID > 0 {
					nextSinceID = msg.Next.SinceID
				} else {
					out <- msg
				}
			}
			// finished with this response - close the body before (possibly) fetching the next batch
			resp.Body.Close()

			if nextSinceID == 0 {
				break
			}
			// update the query params with the new since_id
			params.Set("since_id", strconv.Itoa(nextSinceID))
		}
	}()

	return out, cancel
}
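
// A minimal usage sketch for the deprecated channel-based API above (this
// helper is illustrative, not part of the original file; the location and
// filter values are placeholders). Articles and errors both arrive as Msgs on
// the returned channel, which is closed when the slurp finishes.
func exampleSlurpUsage() {
	s := NewSlurper("http://localhost:12345/ukarticles")
	filt := &Filter{Count: 100} // ask for up to 100 articles per request

	msgs, _ := s.Slurp(filt) // second value is the cancel channel, unused here

	got := 0
	for msg := range msgs {
		if msg.Error != "" {
			fmt.Println("slurp error:", msg.Error)
			continue
		}
		if msg.Article != nil {
			got++
		}
	}
	fmt.Println("received", got, "articles")
}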

// Slurp2 requests a set of articles matching filt and returns an ArtStream
// from which they can be read. Any setup error is held in the stream and
// returned by Next. The caller should Close the stream when finished with it.
func (s *Slurper) Slurp2(filt *Filter) *ArtStream {
	client := s.Client
	if client == nil {
		client = &http.Client{}
	}

	params := filt.params()

	u := s.Location + "/api/slurp?" + params.Encode()

	out := &ArtStream{}
	//	fmt.Printf("request: %s\n", u)
	resp, err := client.Get(u)
	if err != nil {
		out.err = fmt.Errorf("HTTP Get failed: %s", err)
		return out
	}
	out.response = resp

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		out.err = fmt.Errorf("HTTP Error code %s", resp.Status)
		return out
	}

	out.dec = json.NewDecoder(out.response.Body)
	return out
}

// ArtStream is a stream of articles read back from a slurp server.
type ArtStream struct {
	response *http.Response
	dec      *json.Decoder
	err      error

	// if there are more articles to grab, this will be set to a non-zero
	// value when the stream ends
	NextSinceID int
}

// Close releases the underlying HTTP response. It is safe to call more than once.
func (as *ArtStream) Close() {
	if as.response != nil {
		as.response.Body.Close()
		as.response = nil
	}
}

// Next returns the next article in the stream.
// It returns io.EOF at the end of the stream.
func (as *ArtStream) Next() (*Article, error) {
	if as.err != nil {
		return nil, as.err
	}
	for {
		// grab the next message off the wire
		var msg Msg
		err := as.dec.Decode(&msg)
		if err == io.EOF {
			as.err = err
			return nil, err
		} else if err != nil {
			as.err = fmt.Errorf("Decode error: %s", err)
			return nil, as.err
		}

		// is it a to-be-continued message?
		if msg.Next.SinceID > 0 {
			as.NextSinceID = msg.Next.SinceID
			// that'll probably be the end of the stream, but keep looping until we hit EOF anyway
		} else {
			return msg.Article, nil
		}
	}
}
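
// A minimal sketch (not part of the original file) of driving Slurp2 and
// ArtStream: read articles until io.EOF, then, if the server signalled more
// results via NextSinceID, resume with an updated SinceID in the filter.
func exampleSlurp2Usage(s *Slurper, filt *Filter, handle func(*Article)) error {
	for {
		stream := s.Slurp2(filt)
		nextSinceID := 0
		for {
			art, err := stream.Next()
			if err == io.EOF {
				nextSinceID = stream.NextSinceID
				break
			}
			if err != nil {
				stream.Close()
				return err
			}
			handle(art)
		}
		stream.Close()

		if nextSinceID == 0 {
			return nil // no more batches to fetch
		}
		filt.SinceID = nextSinceID // continue from where the last batch left off
	}
}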

// FetchCount returns the number of articles on the server matching the filter.
func (s *Slurper) FetchCount(filt *Filter) (int, error) {
	client := s.Client
	if client == nil {
		client = &http.Client{}
	}

	params := filt.params()

	u := s.Location + "/api/count?" + params.Encode()

	//	fmt.Printf("request: %s\n", u)
	resp, err := client.Get(u)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return 0, fmt.Errorf("HTTP Error code %s", resp.Status)
	}

	var cnt struct {
		ArticleCount int `json:"article_count"`
	}
	err = json.NewDecoder(resp.Body).Decode(&cnt)
	if err != nil {
		return 0, err
	}

	return cnt.ArticleCount, nil
}
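
// Illustrative only (not part of the original file; the location and
// publication code are placeholders): FetchCount can be used to size a job
// before streaming the articles themselves.
func exampleFetchCountUsage() {
	s := NewSlurper("http://localhost:12345/ukarticles")
	filt := &Filter{PubCodes: []string{"examplepub"}}

	total, err := s.FetchCount(filt)
	if err != nil {
		fmt.Println("count failed:", err)
		return
	}
	fmt.Printf("server has %d matching articles\n", total)
}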