github.com/MontFerret/ferret@v0.18.0/pkg/drivers/http/driver.go (about)

     1  package http
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"io"
     7  	"net/http"
     8  	"net/url"
     9  
    10  	"github.com/gobwas/glob"
    11  
    12  	"github.com/MontFerret/ferret/pkg/runtime/logging"
    13  	"github.com/MontFerret/ferret/pkg/runtime/values"
    14  
    15  	"golang.org/x/net/html/charset"
    16  
    17  	"github.com/PuerkitoBio/goquery"
    18  	"github.com/pkg/errors"
    19  	"github.com/sethgrid/pester"
    20  
    21  	"github.com/MontFerret/ferret/pkg/drivers"
    22  	"github.com/MontFerret/ferret/pkg/drivers/common"
    23  )
    24  
// DriverName is the name of the HTTP driver.
//
// NOTE(review): Name() reports options.Name rather than this constant;
// presumably NewOptions uses DriverName as the default name — confirm.
const DriverName = "http"
    26  
// Driver is the plain-HTTP driver. It fetches documents with a pester HTTP
// client and parses them with goquery (see Open and Parse).
type Driver struct {
	// client performs the outgoing requests. Close sets it to nil.
	client  *pester.Client
	// options holds the configuration the driver was created with
	// (name, retry/timeout settings, proxy, status code filters, ...).
	options *Options
}
    31  
    32  func NewDriver(opts ...Option) *Driver {
    33  	drv := new(Driver)
    34  	drv.options = NewOptions(opts)
    35  
    36  	drv.client = newHTTPClient(drv.options)
    37  
    38  	return drv
    39  }
    40  
    41  func newHTTPClient(options *Options) (httpClient *pester.Client) {
    42  	httpClient = pester.New()
    43  
    44  	httpClient.Concurrency = options.Concurrency
    45  	httpClient.MaxRetries = options.MaxRetries
    46  	httpClient.Backoff = options.Backoff
    47  	httpClient.Timeout = options.Timeout
    48  
    49  	if options.HTTPTransport != nil {
    50  		httpClient.Transport = options.HTTPTransport
    51  	}
    52  
    53  	if options.Proxy == "" {
    54  		return
    55  	}
    56  
    57  	if err := addProxy(httpClient, options.Proxy); err != nil {
    58  		return
    59  	}
    60  
    61  	return
    62  }
    63  
    64  func addProxy(httpClient *pester.Client, proxyStr string) error {
    65  	if proxyStr == "" {
    66  		return nil
    67  	}
    68  
    69  	proxyURL, err := url.Parse(proxyStr)
    70  	if err != nil {
    71  		return err
    72  	}
    73  
    74  	proxy := http.ProxyURL(proxyURL)
    75  
    76  	if httpClient.Transport != nil {
    77  		httpClient.Transport.(*http.Transport).Proxy = proxy
    78  
    79  		return nil
    80  	}
    81  
    82  	httpClient.Transport = &http.Transport{Proxy: proxy}
    83  
    84  	return nil
    85  }
    86  
    87  func (drv *Driver) Name() string {
    88  	return drv.options.Name
    89  }
    90  
    91  func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTMLPage, error) {
    92  	req, err := http.NewRequest(http.MethodGet, params.URL, nil)
    93  	if err != nil {
    94  		return nil, err
    95  	}
    96  
    97  	params = drivers.SetDefaultParams(drv.options.Options, params)
    98  
    99  	drv.makeRequest(ctx, req, params)
   100  
   101  	resp, err := drv.client.Do(req)
   102  	if err != nil {
   103  		return nil, errors.Wrapf(err, "failed to retrieve a document %s", params.URL)
   104  	}
   105  	defer resp.Body.Close()
   106  
   107  	var queryFilters []drivers.StatusCodeFilter
   108  
   109  	if params.Ignore != nil {
   110  		queryFilters = params.Ignore.StatusCodes
   111  	}
   112  
   113  	if !drv.responseCodeAllowed(resp, queryFilters) {
   114  		return nil, errors.New(resp.Status)
   115  	}
   116  
   117  	body := io.Reader(resp.Body)
   118  	if params.Charset != "" {
   119  		body, err = drv.convertToUTF8(body, params.Charset)
   120  		if err != nil {
   121  			return nil, errors.Wrapf(err, "failed convert to UTF-8 a document %s", params.URL)
   122  		}
   123  	}
   124  
   125  	doc, err := goquery.NewDocumentFromReader(body)
   126  	if err != nil {
   127  		return nil, errors.Wrapf(err, "failed to parse a document %s", params.URL)
   128  	}
   129  
   130  	cookies, err := toDriverCookies(resp.Cookies())
   131  	if err != nil {
   132  		return nil, err
   133  	}
   134  
   135  	r := drivers.HTTPResponse{
   136  		StatusCode: resp.StatusCode,
   137  		Status:     resp.Status,
   138  		Headers:    drivers.NewHTTPHeadersWith(resp.Header),
   139  	}
   140  
   141  	return NewHTMLPage(doc, params.URL, r, cookies)
   142  }
   143  
   144  func (drv *Driver) Parse(_ context.Context, params drivers.ParseParams) (drivers.HTMLPage, error) {
   145  	buf := bytes.NewBuffer(params.Content)
   146  
   147  	doc, err := goquery.NewDocumentFromReader(buf)
   148  
   149  	if err != nil {
   150  		return nil, errors.Wrap(err, "failed to parse a document")
   151  	}
   152  
   153  	return NewHTMLPage(doc, "#blank", drivers.HTTPResponse{}, nil)
   154  }
   155  
   156  func (drv *Driver) Close() error {
   157  	drv.client = nil
   158  
   159  	return nil
   160  }
   161  
   162  func (drv *Driver) responseCodeAllowed(resp *http.Response, additional []drivers.StatusCodeFilter) bool {
   163  	var allowed bool
   164  	reqURL := resp.Request.URL.String()
   165  
   166  	// OK is by default
   167  	if resp.StatusCode >= 200 && resp.StatusCode <= 299 {
   168  		return true
   169  	}
   170  
   171  	// Try to use those that are passed within a query
   172  	for _, filter := range additional {
   173  		allowed = filter.Code == resp.StatusCode
   174  
   175  		// check url
   176  		if allowed && filter.URL != "" {
   177  			allowed = glob.MustCompile(filter.URL).Match(reqURL)
   178  		}
   179  
   180  		if allowed {
   181  			break
   182  		}
   183  	}
   184  
   185  	// if still not allowed, try the default ones
   186  	if !allowed {
   187  		for _, filter := range drv.options.HTTPCodesFilter {
   188  			allowed = filter.Code == resp.StatusCode
   189  
   190  			if allowed && filter.URL != nil {
   191  				allowed = filter.URL.Match(reqURL)
   192  			}
   193  
   194  			if allowed {
   195  				break
   196  			}
   197  		}
   198  	}
   199  
   200  	return allowed
   201  }
   202  
   203  func (drv *Driver) convertToUTF8(reader io.Reader, srcCharset string) (data io.Reader, err error) {
   204  	data, err = charset.NewReader(reader, srcCharset)
   205  	if err != nil {
   206  		return nil, err
   207  	}
   208  
   209  	return
   210  }
   211  
   212  func (drv *Driver) makeRequest(ctx context.Context, req *http.Request, params drivers.Params) {
   213  	logger := logging.FromContext(ctx)
   214  
   215  	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
   216  	req.Header.Set("Accept-Language", "en-US,en;q=0.9,ru;q=0.8")
   217  	req.Header.Set("Cache-Control", "no-cache")
   218  	req.Header.Set("Pragma", "no-cache")
   219  
   220  	if params.Headers != nil {
   221  		params.Headers.ForEach(func(value []string, key string) bool {
   222  			v := params.Headers.Get(key)
   223  
   224  			req.Header.Set(key, v)
   225  
   226  			logger.
   227  				Debug().
   228  				Timestamp().
   229  				Str("header", key).
   230  				Msg("set header")
   231  
   232  			return true
   233  		})
   234  	}
   235  
   236  	if params.Cookies != nil {
   237  		params.Cookies.ForEach(func(value drivers.HTTPCookie, key values.String) bool {
   238  			v, exist := params.Cookies.Get(key)
   239  			if !exist {
   240  				return false
   241  			}
   242  
   243  			req.AddCookie(fromDriverCookie(v))
   244  
   245  			logger.
   246  				Debug().
   247  				Timestamp().
   248  				Str("cookie", key.String()).
   249  				Msg("set cookie")
   250  
   251  			return true
   252  		})
   253  	}
   254  
   255  	ua := common.GetUserAgent(params.UserAgent)
   256  	logger.
   257  		Debug().
   258  		Timestamp().
   259  		Str("user-agent", ua).
   260  		Msg("using User-Agent")
   261  
   262  	if ua != "" {
   263  		req.Header.Set("User-Agent", ua)
   264  	}
   265  
   266  	req = req.WithContext(ctx)
   267  }