github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/slurp/slurp.go (about) 1 package slurp 2 3 import ( 4 "encoding/json" 5 "fmt" 6 "io" 7 "net/http" 8 "net/url" 9 "strconv" 10 "time" 11 ) 12 13 // Slurper is a client for talking to a slurp server 14 type Slurper struct { 15 Client *http.Client 16 // eg "http://localhost:12345/ukarticles 17 Location string 18 } 19 20 func NewSlurper(location string) *Slurper { 21 return &Slurper{Location: location} 22 } 23 24 // Msg is a single message - can hold an article or error message 25 type Msg struct { 26 Article *Article `json:"article,omitempty"` 27 Error string `json:"error,omitempty"` 28 Next struct { 29 SinceID int `json:"since_id,omitempty"` 30 } `json:"next,omitempty"` 31 } 32 33 type Filter struct { 34 // date ranges are [from,to) 35 PubFrom time.Time 36 PubTo time.Time 37 // AddedFrom time.Time 38 // AddedTo time.Time 39 PubCodes []string 40 SinceID int 41 Count int 42 } 43 44 func (filt *Filter) params() url.Values { 45 params := url.Values{} 46 47 if !filt.PubFrom.IsZero() { 48 params.Set("pubfrom", filt.PubFrom.Format(time.RFC3339)) 49 } 50 if !filt.PubTo.IsZero() { 51 params.Set("pubto", filt.PubTo.Format(time.RFC3339)) 52 } 53 for _, pubCode := range filt.PubCodes { 54 params.Add("pub", pubCode) 55 } 56 57 if filt.SinceID > 0 { 58 params.Set("since_id", strconv.Itoa(filt.SinceID)) 59 } 60 if filt.Count > 0 { 61 params.Set("count", strconv.Itoa(filt.Count)) 62 } 63 return params 64 } 65 66 // !!! DEPRECATED !!! 67 // Slurp downloads a set of articles from the server 68 // returns a channel which streams out messages. 69 // errors are returned via Msg. In the case of network errors, 70 // Slurp may synthesise fake Msgs containing the error message. 71 // Will repeatedly request until all results returned. 72 // filter count param is not the total - it is the max articles to 73 // return per request. 74 // !!! DEPRECATED !!! 75 func (s *Slurper) Slurp(filt *Filter) (chan Msg, chan struct{}) { 76 77 params := filt.params() 78 79 out := make(chan Msg) 80 cancel := make(chan struct{}, 1) // buffered to prevent deadlock 81 go func() { 82 defer close(out) 83 defer close(cancel) 84 85 client := s.Client 86 if client == nil { 87 client = &http.Client{} 88 } 89 90 for { 91 u := s.Location + "/api/slurp?" + params.Encode() 92 // fmt.Printf("request: %s\n", u) 93 resp, err := client.Get(u) 94 if err != nil { 95 out <- Msg{Error: fmt.Sprintf("HTTP Get failed: %s", err)} 96 return 97 } 98 defer resp.Body.Close() 99 100 if resp.StatusCode < 200 || resp.StatusCode >= 300 { 101 out <- Msg{Error: fmt.Sprintf("HTTP Error: %s", resp.Status)} 102 return 103 } 104 nextSinceID := 0 105 dec := json.NewDecoder(resp.Body) 106 for { 107 // check for cancelation request 108 select { 109 case <-cancel: 110 out <- Msg{Error: "Cancelled"} 111 return 112 default: 113 } 114 115 var msg Msg 116 if err := dec.Decode(&msg); err == io.EOF { 117 break 118 } else if err != nil { 119 out <- Msg{Error: fmt.Sprintf("Decode error: %s", err)} 120 return 121 } 122 123 // is it a to-be-continued message? 124 if msg.Next.SinceID > 0 { 125 nextSinceID = msg.Next.SinceID 126 } else { 127 out <- msg 128 } 129 } 130 131 if nextSinceID == 0 { 132 break 133 } 134 // update the query params with the new since_id 135 params.Set("since_id", strconv.Itoa(nextSinceID)) 136 } 137 }() 138 139 return out, cancel 140 } 141 142 func (s *Slurper) Slurp2(filt *Filter) *ArtStream { 143 client := s.Client 144 if client == nil { 145 client = &http.Client{} 146 } 147 148 params := filt.params() 149 150 u := s.Location + "/api/slurp?" + params.Encode() 151 152 out := &ArtStream{} 153 // fmt.Printf("request: %s\n", u) 154 resp, err := client.Get(u) 155 if err != nil { 156 out.err = fmt.Errorf("HTTP Get failed: %s", err) 157 return out 158 } 159 out.response = resp 160 161 if resp.StatusCode < 200 || resp.StatusCode >= 300 { 162 out.err = fmt.Errorf("HTTP Error code %s", resp.Status) 163 return out 164 } 165 166 out.dec = json.NewDecoder(out.response.Body) 167 return out 168 } 169 170 type ArtStream struct { 171 response *http.Response 172 dec *json.Decoder 173 err error 174 175 // if there are more articles to grab, this will be set to non-zero when the stream ends 176 NextSinceID int 177 } 178 179 func (as *ArtStream) Close() { 180 if as.response != nil { 181 as.response.Body.Close() 182 as.response = nil 183 } 184 } 185 186 // returns io.EOF at end of stream 187 func (as *ArtStream) Next() (*Article, error) { 188 if as.err != nil { 189 return nil, as.err 190 } 191 for { 192 // grab the next message off the wire 193 var msg Msg 194 err := as.dec.Decode(&msg) 195 if err == io.EOF { 196 as.err = err 197 return nil, err 198 } else if err != nil { 199 as.err = fmt.Errorf("Decode error: %s", err) 200 return nil, as.err 201 } 202 203 // is it a to-be-continued message? 204 if msg.Next.SinceID > 0 { 205 as.NextSinceID = msg.Next.SinceID 206 // probably that'll be the end of the stream, but loop until we hit the EOF anyway 207 } else { 208 return msg.Article, nil 209 } 210 } 211 } 212 213 // FetchCount returns the number of articles on the server matching the filter. 214 func (s *Slurper) FetchCount(filt *Filter) (int, error) { 215 client := s.Client 216 if client == nil { 217 client = &http.Client{} 218 } 219 220 params := filt.params() 221 222 u := s.Location + "/api/count?" + params.Encode() 223 224 // fmt.Printf("request: %s\n", u) 225 resp, err := client.Get(u) 226 if err != nil { 227 return 0, err 228 } 229 defer resp.Body.Close() 230 if resp.StatusCode < 200 || resp.StatusCode >= 300 { 231 return 0, fmt.Errorf("HTTP Error code %s", resp.Status) 232 } 233 234 var cnt struct { 235 ArticleCount int `json:"article_count"` 236 } 237 err = json.NewDecoder(resp.Body).Decode(&cnt) 238 if err != nil { 239 return 0, err 240 } 241 242 return cnt.ArticleCount, nil 243 }