github.com/10XDev/rclone@v1.52.3-0.20200626220027-16af9ab76b2a/backend/http/http.go (about)

     1  // Package http provides a filesystem interface using net/http
     2  //
     3  // It treats HTML pages served from the endpoint as directory
     4  // listings, and includes any links found as files.
     5  package http
     6  
     7  import (
     8  	"context"
     9  	"io"
    10  	"mime"
    11  	"net/http"
    12  	"net/url"
    13  	"path"
    14  	"strconv"
    15  	"strings"
    16  	"sync"
    17  	"time"
    18  
    19  	"github.com/pkg/errors"
    20  	"github.com/rclone/rclone/fs"
    21  	"github.com/rclone/rclone/fs/config/configmap"
    22  	"github.com/rclone/rclone/fs/config/configstruct"
    23  	"github.com/rclone/rclone/fs/fshttp"
    24  	"github.com/rclone/rclone/fs/hash"
    25  	"github.com/rclone/rclone/lib/rest"
    26  	"golang.org/x/net/html"
    27  )
    28  
    29  var (
    30  	errorReadOnly = errors.New("http remotes are read only")
    31  	timeUnset     = time.Unix(0, 0)
    32  )
    33  
    34  func init() {
    35  	fsi := &fs.RegInfo{
    36  		Name:        "http",
    37  		Description: "http Connection",
    38  		NewFs:       NewFs,
    39  		Options: []fs.Option{{
    40  			Name:     "url",
    41  			Help:     "URL of http host to connect to",
    42  			Required: true,
    43  			Examples: []fs.OptionExample{{
    44  				Value: "https://example.com",
    45  				Help:  "Connect to example.com",
    46  			}, {
    47  				Value: "https://user:pass@example.com",
    48  				Help:  "Connect to example.com using a username and password",
    49  			}},
    50  		}, {
    51  			Name: "headers",
    52  			Help: `Set HTTP headers for all transactions
    53  
    54  Use this to set additional HTTP headers for all transactions
    55  
    56  The input format is comma separated list of key,value pairs.  Standard
    57  [CSV encoding](https://godoc.org/encoding/csv) may be used.
    58  
    59  For example to set a Cookie use 'Cookie,name=value', or '"Cookie","name=value"'.
    60  
    61  You can set multiple headers, eg '"Cookie","name=value","Authorization","xxx"'.
    62  `,
    63  			Default:  fs.CommaSepList{},
    64  			Advanced: true,
    65  		}, {
    66  			Name: "no_slash",
    67  			Help: `Set this if the site doesn't end directories with /
    68  
    69  Use this if your target website does not use / on the end of
    70  directories.
    71  
    72  A / on the end of a path is how rclone normally tells the difference
    73  between files and directories.  If this flag is set, then rclone will
    74  treat all files with Content-Type: text/html as directories and read
    75  URLs from them rather than downloading them.
    76  
    77  Note that this may cause rclone to confuse genuine HTML files with
    78  directories.`,
    79  			Default:  false,
    80  			Advanced: true,
    81  		}, {
    82  			Name: "no_head",
    83  			Help: `Don't use HEAD requests to find file sizes in dir listing
    84  
    85  If your site is being very slow to load then you can try this option.
    86  Normally rclone does a HEAD request for each potential file in a
    87  directory listing to:
    88  
    89  - find its size
    90  - check it really exists
    91  - check to see if it is a directory
    92  
    93  If you set this option, rclone will not do the HEAD request.  This will mean
    94  
    95  - directory listings are much quicker
    96  - rclone won't have the times or sizes of any files
    97  - some files that don't exist may be in the listing
    98  `,
    99  			Default:  false,
   100  			Advanced: true,
   101  		}},
   102  	}
   103  	fs.Register(fsi)
   104  }
   105  
   106  // Options defines the configuration for this backend
   107  type Options struct {
   108  	Endpoint string          `config:"url"`
   109  	NoSlash  bool            `config:"no_slash"`
   110  	NoHead   bool            `config:"no_head"`
   111  	Headers  fs.CommaSepList `config:"headers"`
   112  }
   113  
   114  // Fs stores the interface to the remote HTTP files
   115  type Fs struct {
   116  	name        string
   117  	root        string
   118  	features    *fs.Features // optional features
   119  	opt         Options      // options for this backend
   120  	endpoint    *url.URL
   121  	endpointURL string // endpoint as a string
   122  	httpClient  *http.Client
   123  }
   124  
   125  // Object is a remote object that has been stat'd (so it exists, but is not necessarily open for reading)
   126  type Object struct {
   127  	fs          *Fs
   128  	remote      string
   129  	size        int64
   130  	modTime     time.Time
   131  	contentType string
   132  }
   133  
   134  // statusError returns an error if the res contained an error
   135  func statusError(res *http.Response, err error) error {
   136  	if err != nil {
   137  		return err
   138  	}
   139  	if res.StatusCode < 200 || res.StatusCode > 299 {
   140  		_ = res.Body.Close()
   141  		return errors.Errorf("HTTP Error %d: %s", res.StatusCode, res.Status)
   142  	}
   143  	return nil
   144  }
   145  
   146  // NewFs creates a new Fs object from the name and root. It connects to
   147  // the host specified in the config file.
   148  func NewFs(name, root string, m configmap.Mapper) (fs.Fs, error) {
   149  	ctx := context.TODO()
   150  	// Parse config into Options struct
   151  	opt := new(Options)
   152  	err := configstruct.Set(m, opt)
   153  	if err != nil {
   154  		return nil, err
   155  	}
   156  
   157  	if len(opt.Headers)%2 != 0 {
   158  		return nil, errors.New("odd number of headers supplied")
   159  	}
   160  
   161  	if !strings.HasSuffix(opt.Endpoint, "/") {
   162  		opt.Endpoint += "/"
   163  	}
   164  
   165  	// Parse the endpoint and stick the root onto it
   166  	base, err := url.Parse(opt.Endpoint)
   167  	if err != nil {
   168  		return nil, err
   169  	}
   170  	u, err := rest.URLJoin(base, rest.URLPathEscape(root))
   171  	if err != nil {
   172  		return nil, err
   173  	}
   174  
   175  	client := fshttp.NewClient(fs.Config)
   176  
   177  	var isFile = false
   178  	if !strings.HasSuffix(u.String(), "/") {
   179  		// Make a client which doesn't follow redirects so the server
   180  		// doesn't redirect http://host/dir to http://host/dir/
   181  		noRedir := *client
   182  		noRedir.CheckRedirect = func(req *http.Request, via []*http.Request) error {
   183  			return http.ErrUseLastResponse
   184  		}
   185  		// check to see if points to a file
   186  		req, err := http.NewRequest("HEAD", u.String(), nil)
   187  		if err == nil {
   188  			req = req.WithContext(ctx) // go1.13 can use NewRequestWithContext
   189  			addHeaders(req, opt)
   190  			res, err := noRedir.Do(req)
   191  			err = statusError(res, err)
   192  			if err == nil {
   193  				isFile = true
   194  			}
   195  		}
   196  	}
   197  
   198  	newRoot := u.String()
   199  	if isFile {
   200  		// Point to the parent if this is a file
   201  		newRoot, _ = path.Split(u.String())
   202  	} else {
   203  		if !strings.HasSuffix(newRoot, "/") {
   204  			newRoot += "/"
   205  		}
   206  	}
   207  
   208  	u, err = url.Parse(newRoot)
   209  	if err != nil {
   210  		return nil, err
   211  	}
   212  
   213  	f := &Fs{
   214  		name:        name,
   215  		root:        root,
   216  		opt:         *opt,
   217  		httpClient:  client,
   218  		endpoint:    u,
   219  		endpointURL: u.String(),
   220  	}
   221  	f.features = (&fs.Features{
   222  		CanHaveEmptyDirectories: true,
   223  	}).Fill(f)
   224  	if isFile {
   225  		return f, fs.ErrorIsFile
   226  	}
   227  	if !strings.HasSuffix(f.endpointURL, "/") {
   228  		return nil, errors.New("internal error: url doesn't end with /")
   229  	}
   230  	return f, nil
   231  }
   232  
// Name returns the configured name of the file system, as it was passed to
// NewFs.
func (f *Fs) Name() string {
	return f.name
}
   237  
// Root returns the root for the filesystem, as it was passed to NewFs.
func (f *Fs) Root() string {
	return f.root
}
   242  
// String returns the URL of the directory the filesystem is rooted at, as
// computed by NewFs.
func (f *Fs) String() string {
	return f.endpointURL
}
   247  
// Features returns the optional features of this Fs, as set up by NewFs.
func (f *Fs) Features() *fs.Features {
	return f.features
}
   252  
// Precision is the modtime precision of the remote http file system. There
// is no way of knowing the real value so a fixed estimate of 1s is returned.
func (f *Fs) Precision() time.Duration {
	return time.Second
}
   257  
   258  // NewObject creates a new remote http file object
   259  func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) {
   260  	o := &Object{
   261  		fs:     f,
   262  		remote: remote,
   263  	}
   264  	err := o.stat(ctx)
   265  	if err != nil {
   266  		return nil, err
   267  	}
   268  	return o, nil
   269  }
   270  
// url joins the remote onto the base URL: it returns the endpoint URL of
// the Fs followed by remote passed through rest.URLPathEscape.
func (f *Fs) url(remote string) string {
	return f.endpointURL + rest.URLPathEscape(remote)
}
   275  
   276  // parse s into an int64, on failure return def
   277  func parseInt64(s string, def int64) int64 {
   278  	n, e := strconv.ParseInt(s, 10, 64)
   279  	if e != nil {
   280  		return def
   281  	}
   282  	return n
   283  }
   284  
   285  // Errors returned by parseName
   286  var (
   287  	errURLJoinFailed     = errors.New("URLJoin failed")
   288  	errFoundQuestionMark = errors.New("found ? in URL")
   289  	errHostMismatch      = errors.New("host mismatch")
   290  	errSchemeMismatch    = errors.New("scheme mismatch")
   291  	errNotUnderRoot      = errors.New("not under root")
   292  	errNameIsEmpty       = errors.New("name is empty")
   293  	errNameContainsSlash = errors.New("name contains /")
   294  )
   295  
   296  // parseName turns a name as found in the page into a remote path or returns an error
   297  func parseName(base *url.URL, name string) (string, error) {
   298  	// make URL absolute
   299  	u, err := rest.URLJoin(base, name)
   300  	if err != nil {
   301  		return "", errURLJoinFailed
   302  	}
   303  	// check it doesn't have URL parameters
   304  	uStr := u.String()
   305  	if strings.Index(uStr, "?") >= 0 {
   306  		return "", errFoundQuestionMark
   307  	}
   308  	// check that this is going back to the same host and scheme
   309  	if base.Host != u.Host {
   310  		return "", errHostMismatch
   311  	}
   312  	if base.Scheme != u.Scheme {
   313  		return "", errSchemeMismatch
   314  	}
   315  	// check has path prefix
   316  	if !strings.HasPrefix(u.Path, base.Path) {
   317  		return "", errNotUnderRoot
   318  	}
   319  	// calculate the name relative to the base
   320  	name = u.Path[len(base.Path):]
   321  	// mustn't be empty
   322  	if name == "" {
   323  		return "", errNameIsEmpty
   324  	}
   325  	// mustn't contain a / - we are looking for a single level directory
   326  	slash := strings.Index(name, "/")
   327  	if slash >= 0 && slash != len(name)-1 {
   328  		return "", errNameContainsSlash
   329  	}
   330  	return name, nil
   331  }
   332  
   333  // Parse turns HTML for a directory into names
   334  // base should be the base URL to resolve any relative names from
   335  func parse(base *url.URL, in io.Reader) (names []string, err error) {
   336  	doc, err := html.Parse(in)
   337  	if err != nil {
   338  		return nil, err
   339  	}
   340  	var (
   341  		walk func(*html.Node)
   342  		seen = make(map[string]struct{})
   343  	)
   344  	walk = func(n *html.Node) {
   345  		if n.Type == html.ElementNode && n.Data == "a" {
   346  			for _, a := range n.Attr {
   347  				if a.Key == "href" {
   348  					name, err := parseName(base, a.Val)
   349  					if err == nil {
   350  						if _, found := seen[name]; !found {
   351  							names = append(names, name)
   352  							seen[name] = struct{}{}
   353  						}
   354  					}
   355  					break
   356  				}
   357  			}
   358  		}
   359  		for c := n.FirstChild; c != nil; c = c.NextSibling {
   360  			walk(c)
   361  		}
   362  	}
   363  	walk(doc)
   364  	return names, nil
   365  }
   366  
   367  // Adds the configured headers to the request if any
   368  func addHeaders(req *http.Request, opt *Options) {
   369  	for i := 0; i < len(opt.Headers); i += 2 {
   370  		key := opt.Headers[i]
   371  		value := opt.Headers[i+1]
   372  		req.Header.Add(key, value)
   373  	}
   374  }
   375  
// addHeaders adds the headers configured for this Fs, if any, to req.
func (f *Fs) addHeaders(req *http.Request) {
	addHeaders(req, &f.opt)
}
   380  
   381  // Read the directory passed in
   382  func (f *Fs) readDir(ctx context.Context, dir string) (names []string, err error) {
   383  	URL := f.url(dir)
   384  	u, err := url.Parse(URL)
   385  	if err != nil {
   386  		return nil, errors.Wrap(err, "failed to readDir")
   387  	}
   388  	if !strings.HasSuffix(URL, "/") {
   389  		return nil, errors.Errorf("internal error: readDir URL %q didn't end in /", URL)
   390  	}
   391  	// Do the request
   392  	req, err := http.NewRequest("GET", URL, nil)
   393  	if err != nil {
   394  		return nil, errors.Wrap(err, "readDir failed")
   395  	}
   396  	req = req.WithContext(ctx) // go1.13 can use NewRequestWithContext
   397  	f.addHeaders(req)
   398  	res, err := f.httpClient.Do(req)
   399  	if err == nil {
   400  		defer fs.CheckClose(res.Body, &err)
   401  		if res.StatusCode == http.StatusNotFound {
   402  			return nil, fs.ErrorDirNotFound
   403  		}
   404  	}
   405  	err = statusError(res, err)
   406  	if err != nil {
   407  		return nil, errors.Wrap(err, "failed to readDir")
   408  	}
   409  
   410  	contentType := strings.SplitN(res.Header.Get("Content-Type"), ";", 2)[0]
   411  	switch contentType {
   412  	case "text/html":
   413  		names, err = parse(u, res.Body)
   414  		if err != nil {
   415  			return nil, errors.Wrap(err, "readDir")
   416  		}
   417  	default:
   418  		return nil, errors.Errorf("Can't parse content type %q", contentType)
   419  	}
   420  	return names, nil
   421  }
   422  
   423  // List the objects and directories in dir into entries.  The
   424  // entries can be returned in any order but should be for a
   425  // complete directory.
   426  //
   427  // dir should be "" to list the root, and should not have
   428  // trailing slashes.
   429  //
   430  // This should return ErrDirNotFound if the directory isn't
   431  // found.
   432  func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
   433  	if !strings.HasSuffix(dir, "/") && dir != "" {
   434  		dir += "/"
   435  	}
   436  	names, err := f.readDir(ctx, dir)
   437  	if err != nil {
   438  		return nil, errors.Wrapf(err, "error listing %q", dir)
   439  	}
   440  	var (
   441  		entriesMu sync.Mutex // to protect entries
   442  		wg        sync.WaitGroup
   443  		in        = make(chan string, fs.Config.Checkers)
   444  	)
   445  	add := func(entry fs.DirEntry) {
   446  		entriesMu.Lock()
   447  		entries = append(entries, entry)
   448  		entriesMu.Unlock()
   449  	}
   450  	for i := 0; i < fs.Config.Checkers; i++ {
   451  		wg.Add(1)
   452  		go func() {
   453  			defer wg.Done()
   454  			for remote := range in {
   455  				file := &Object{
   456  					fs:     f,
   457  					remote: remote,
   458  				}
   459  				switch err := file.stat(ctx); err {
   460  				case nil:
   461  					add(file)
   462  				case fs.ErrorNotAFile:
   463  					// ...found a directory not a file
   464  					add(fs.NewDir(remote, timeUnset))
   465  				default:
   466  					fs.Debugf(remote, "skipping because of error: %v", err)
   467  				}
   468  			}
   469  		}()
   470  	}
   471  	for _, name := range names {
   472  		isDir := name[len(name)-1] == '/'
   473  		name = strings.TrimRight(name, "/")
   474  		remote := path.Join(dir, name)
   475  		if isDir {
   476  			add(fs.NewDir(remote, timeUnset))
   477  		} else {
   478  			in <- remote
   479  		}
   480  	}
   481  	close(in)
   482  	wg.Wait()
   483  	return entries, nil
   484  }
   485  
// Put is not supported as http remotes are read only.
//
// It does not read from in and always returns a nil object and
// errorReadOnly.
func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
	return nil, errorReadOnly
}
   494  
// PutStream is not supported as http remotes are read only. It always
// returns a nil object and errorReadOnly.
func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
	return nil, errorReadOnly
}
   499  
// Fs returns the filesystem this remote http file object is located within
func (o *Object) Fs() fs.Info {
	return o.fs
}
   504  
// String returns the remote path of the object (not its full URL), or
// "<nil>" if called on a nil Object
func (o *Object) String() string {
	if o == nil {
		return "<nil>"
	}
	return o.remote
}
   512  
// Remote returns the name of the remote HTTP file, relative to the fs root
func (o *Object) Remote() string {
	return o.remote
}
   517  
// Hash is not supported by this backend: it always returns "" and
// hash.ErrUnsupported, whichever hash type is asked for
func (o *Object) Hash(ctx context.Context, r hash.Type) (string, error) {
	return "", hash.ErrUnsupported
}
   522  
// Size returns the size in bytes of the remote http file as recorded by
// stat, which stores -1 if the size is not known
func (o *Object) Size() int64 {
	return o.size
}
   527  
// ModTime returns the modification time of the remote http file as recorded
// by stat, which stores timeUnset if the time is not known
func (o *Object) ModTime(ctx context.Context) time.Time {
	return o.modTime
}
   532  
// url returns the native url of the object, built by the url method of its
// Fs from the remote path
func (o *Object) url() string {
	return o.fs.url(o.remote)
}
   537  
   538  // stat updates the info field in the Object
   539  func (o *Object) stat(ctx context.Context) error {
   540  	if o.fs.opt.NoHead {
   541  		o.size = -1
   542  		o.modTime = timeUnset
   543  		o.contentType = fs.MimeType(ctx, o)
   544  		return nil
   545  	}
   546  	url := o.url()
   547  	req, err := http.NewRequest("HEAD", url, nil)
   548  	if err != nil {
   549  		return errors.Wrap(err, "stat failed")
   550  	}
   551  	req = req.WithContext(ctx) // go1.13 can use NewRequestWithContext
   552  	o.fs.addHeaders(req)
   553  	res, err := o.fs.httpClient.Do(req)
   554  	if err == nil && res.StatusCode == http.StatusNotFound {
   555  		return fs.ErrorObjectNotFound
   556  	}
   557  	err = statusError(res, err)
   558  	if err != nil {
   559  		return errors.Wrap(err, "failed to stat")
   560  	}
   561  	t, err := http.ParseTime(res.Header.Get("Last-Modified"))
   562  	if err != nil {
   563  		t = timeUnset
   564  	}
   565  	o.size = parseInt64(res.Header.Get("Content-Length"), -1)
   566  	o.modTime = t
   567  	o.contentType = res.Header.Get("Content-Type")
   568  	// If NoSlash is set then check ContentType to see if it is a directory
   569  	if o.fs.opt.NoSlash {
   570  		mediaType, _, err := mime.ParseMediaType(o.contentType)
   571  		if err != nil {
   572  			return errors.Wrapf(err, "failed to parse Content-Type: %q", o.contentType)
   573  		}
   574  		if mediaType == "text/html" {
   575  			return fs.ErrorNotAFile
   576  		}
   577  	}
   578  	return nil
   579  }
   580  
// SetModTime is not supported as http remotes are read only.
//
// It changes nothing and always returns errorReadOnly.
func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error {
	return errorReadOnly
}
   587  
// Storable returns whether the remote http file is a regular file (not a directory, symbolic link, block device, character device, named pipe, etc).
// This backend always returns true.
func (o *Object) Storable() bool {
	return true
}
   592  
   593  // Open a remote http file object for reading. Seek is supported
   594  func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) {
   595  	url := o.url()
   596  	req, err := http.NewRequest("GET", url, nil)
   597  	if err != nil {
   598  		return nil, errors.Wrap(err, "Open failed")
   599  	}
   600  	req = req.WithContext(ctx) // go1.13 can use NewRequestWithContext
   601  
   602  	// Add optional headers
   603  	for k, v := range fs.OpenOptionHeaders(options) {
   604  		req.Header.Add(k, v)
   605  	}
   606  	o.fs.addHeaders(req)
   607  
   608  	// Do the request
   609  	res, err := o.fs.httpClient.Do(req)
   610  	err = statusError(res, err)
   611  	if err != nil {
   612  		return nil, errors.Wrap(err, "Open failed")
   613  	}
   614  	return res.Body, nil
   615  }
   616  
// Hashes returns hash.Set(hash.None) to indicate remote hashing is unavailable
func (f *Fs) Hashes() hash.Set {
	return hash.Set(hash.None)
}
   621  
// Mkdir is not supported as http remotes are read only. It always returns
// errorReadOnly.
func (f *Fs) Mkdir(ctx context.Context, dir string) error {
	return errorReadOnly
}
   626  
// Remove is not supported as http remotes are read only. It always returns
// errorReadOnly.
func (o *Object) Remove(ctx context.Context) error {
	return errorReadOnly
}
   631  
// Rmdir is not supported as http remotes are read only. It always returns
// errorReadOnly.
func (f *Fs) Rmdir(ctx context.Context, dir string) error {
	return errorReadOnly
}
   636  
// Update is not supported as http remotes are read only. It does not read
// from in and always returns errorReadOnly.
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error {
	return errorReadOnly
}
   641  
// MimeType returns the content type of the Object as recorded by stat, or
// "" if none has been recorded
func (o *Object) MimeType(ctx context.Context) string {
	return o.contentType
}
   646  
   647  // Check the interfaces are satisfied
   648  var (
   649  	_ fs.Fs          = &Fs{}
   650  	_ fs.PutStreamer = &Fs{}
   651  	_ fs.Object      = &Object{}
   652  	_ fs.MimeTyper   = &Object{}
   653  )