github.com/MontFerret/ferret@v0.18.0/pkg/drivers/http/driver.go (about) 1 package http 2 3 import ( 4 "bytes" 5 "context" 6 "io" 7 "net/http" 8 "net/url" 9 10 "github.com/gobwas/glob" 11 12 "github.com/MontFerret/ferret/pkg/runtime/logging" 13 "github.com/MontFerret/ferret/pkg/runtime/values" 14 15 "golang.org/x/net/html/charset" 16 17 "github.com/PuerkitoBio/goquery" 18 "github.com/pkg/errors" 19 "github.com/sethgrid/pester" 20 21 "github.com/MontFerret/ferret/pkg/drivers" 22 "github.com/MontFerret/ferret/pkg/drivers/common" 23 ) 24 25 const DriverName = "http" 26 27 type Driver struct { 28 client *pester.Client 29 options *Options 30 } 31 32 func NewDriver(opts ...Option) *Driver { 33 drv := new(Driver) 34 drv.options = NewOptions(opts) 35 36 drv.client = newHTTPClient(drv.options) 37 38 return drv 39 } 40 41 func newHTTPClient(options *Options) (httpClient *pester.Client) { 42 httpClient = pester.New() 43 44 httpClient.Concurrency = options.Concurrency 45 httpClient.MaxRetries = options.MaxRetries 46 httpClient.Backoff = options.Backoff 47 httpClient.Timeout = options.Timeout 48 49 if options.HTTPTransport != nil { 50 httpClient.Transport = options.HTTPTransport 51 } 52 53 if options.Proxy == "" { 54 return 55 } 56 57 if err := addProxy(httpClient, options.Proxy); err != nil { 58 return 59 } 60 61 return 62 } 63 64 func addProxy(httpClient *pester.Client, proxyStr string) error { 65 if proxyStr == "" { 66 return nil 67 } 68 69 proxyURL, err := url.Parse(proxyStr) 70 if err != nil { 71 return err 72 } 73 74 proxy := http.ProxyURL(proxyURL) 75 76 if httpClient.Transport != nil { 77 httpClient.Transport.(*http.Transport).Proxy = proxy 78 79 return nil 80 } 81 82 httpClient.Transport = &http.Transport{Proxy: proxy} 83 84 return nil 85 } 86 87 func (drv *Driver) Name() string { 88 return drv.options.Name 89 } 90 91 func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTMLPage, error) { 92 req, err := http.NewRequest(http.MethodGet, params.URL, nil) 93 if err != nil { 94 return nil, err 95 } 96 97 params = drivers.SetDefaultParams(drv.options.Options, params) 98 99 drv.makeRequest(ctx, req, params) 100 101 resp, err := drv.client.Do(req) 102 if err != nil { 103 return nil, errors.Wrapf(err, "failed to retrieve a document %s", params.URL) 104 } 105 defer resp.Body.Close() 106 107 var queryFilters []drivers.StatusCodeFilter 108 109 if params.Ignore != nil { 110 queryFilters = params.Ignore.StatusCodes 111 } 112 113 if !drv.responseCodeAllowed(resp, queryFilters) { 114 return nil, errors.New(resp.Status) 115 } 116 117 body := io.Reader(resp.Body) 118 if params.Charset != "" { 119 body, err = drv.convertToUTF8(body, params.Charset) 120 if err != nil { 121 return nil, errors.Wrapf(err, "failed convert to UTF-8 a document %s", params.URL) 122 } 123 } 124 125 doc, err := goquery.NewDocumentFromReader(body) 126 if err != nil { 127 return nil, errors.Wrapf(err, "failed to parse a document %s", params.URL) 128 } 129 130 cookies, err := toDriverCookies(resp.Cookies()) 131 if err != nil { 132 return nil, err 133 } 134 135 r := drivers.HTTPResponse{ 136 StatusCode: resp.StatusCode, 137 Status: resp.Status, 138 Headers: drivers.NewHTTPHeadersWith(resp.Header), 139 } 140 141 return NewHTMLPage(doc, params.URL, r, cookies) 142 } 143 144 func (drv *Driver) Parse(_ context.Context, params drivers.ParseParams) (drivers.HTMLPage, error) { 145 buf := bytes.NewBuffer(params.Content) 146 147 doc, err := goquery.NewDocumentFromReader(buf) 148 149 if err != nil { 150 return nil, errors.Wrap(err, "failed to parse a document") 151 } 152 153 return NewHTMLPage(doc, "#blank", drivers.HTTPResponse{}, nil) 154 } 155 156 func (drv *Driver) Close() error { 157 drv.client = nil 158 159 return nil 160 } 161 162 func (drv *Driver) responseCodeAllowed(resp *http.Response, additional []drivers.StatusCodeFilter) bool { 163 var allowed bool 164 reqURL := resp.Request.URL.String() 165 166 // OK is by default 167 if resp.StatusCode >= 200 && resp.StatusCode <= 299 { 168 return true 169 } 170 171 // Try to use those that are passed within a query 172 for _, filter := range additional { 173 allowed = filter.Code == resp.StatusCode 174 175 // check url 176 if allowed && filter.URL != "" { 177 allowed = glob.MustCompile(filter.URL).Match(reqURL) 178 } 179 180 if allowed { 181 break 182 } 183 } 184 185 // if still not allowed, try the default ones 186 if !allowed { 187 for _, filter := range drv.options.HTTPCodesFilter { 188 allowed = filter.Code == resp.StatusCode 189 190 if allowed && filter.URL != nil { 191 allowed = filter.URL.Match(reqURL) 192 } 193 194 if allowed { 195 break 196 } 197 } 198 } 199 200 return allowed 201 } 202 203 func (drv *Driver) convertToUTF8(reader io.Reader, srcCharset string) (data io.Reader, err error) { 204 data, err = charset.NewReader(reader, srcCharset) 205 if err != nil { 206 return nil, err 207 } 208 209 return 210 } 211 212 func (drv *Driver) makeRequest(ctx context.Context, req *http.Request, params drivers.Params) { 213 logger := logging.FromContext(ctx) 214 215 req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8") 216 req.Header.Set("Accept-Language", "en-US,en;q=0.9,ru;q=0.8") 217 req.Header.Set("Cache-Control", "no-cache") 218 req.Header.Set("Pragma", "no-cache") 219 220 if params.Headers != nil { 221 params.Headers.ForEach(func(value []string, key string) bool { 222 v := params.Headers.Get(key) 223 224 req.Header.Set(key, v) 225 226 logger. 227 Debug(). 228 Timestamp(). 229 Str("header", key). 230 Msg("set header") 231 232 return true 233 }) 234 } 235 236 if params.Cookies != nil { 237 params.Cookies.ForEach(func(value drivers.HTTPCookie, key values.String) bool { 238 v, exist := params.Cookies.Get(key) 239 if !exist { 240 return false 241 } 242 243 req.AddCookie(fromDriverCookie(v)) 244 245 logger. 246 Debug(). 247 Timestamp(). 248 Str("cookie", key.String()). 249 Msg("set cookie") 250 251 return true 252 }) 253 } 254 255 ua := common.GetUserAgent(params.UserAgent) 256 logger. 257 Debug(). 258 Timestamp(). 259 Str("user-agent", ua). 260 Msg("using User-Agent") 261 262 if ua != "" { 263 req.Header.Set("User-Agent", ua) 264 } 265 266 req = req.WithContext(ctx) 267 }