github.com/ncw/rclone@v1.48.1-0.20190724201158-a35aa1360e3e/backend/http/http.go (about)

// Package http provides a filesystem interface using net/http
     2  //
     3  // It treats HTML pages served from the endpoint as directory
     4  // listings, and includes any links found as files.
     5  package http
     6  
     7  import (
     8  	"context"
     9  	"io"
    10  	"mime"
    11  	"net/http"
    12  	"net/url"
    13  	"path"
    14  	"strconv"
    15  	"strings"
    16  	"time"
    17  
    18  	"github.com/ncw/rclone/fs"
    19  	"github.com/ncw/rclone/fs/config/configmap"
    20  	"github.com/ncw/rclone/fs/config/configstruct"
    21  	"github.com/ncw/rclone/fs/fshttp"
    22  	"github.com/ncw/rclone/fs/hash"
    23  	"github.com/ncw/rclone/lib/rest"
    24  	"github.com/pkg/errors"
    25  	"golang.org/x/net/html"
    26  )
    27  
// Package-level values shared by the backend.
var (
	// errorReadOnly is returned by every operation that would modify the remote
	errorReadOnly = errors.New("http remotes are read only")
	// timeUnset (the Unix epoch) is used as the modification time when none is known
	timeUnset     = time.Unix(0, 0)
)
    32  
    33  func init() {
    34  	fsi := &fs.RegInfo{
    35  		Name:        "http",
    36  		Description: "http Connection",
    37  		NewFs:       NewFs,
    38  		Options: []fs.Option{{
    39  			Name:     "url",
    40  			Help:     "URL of http host to connect to",
    41  			Required: true,
    42  			Examples: []fs.OptionExample{{
    43  				Value: "https://example.com",
    44  				Help:  "Connect to example.com",
    45  			}, {
    46  				Value: "https://user:pass@example.com",
    47  				Help:  "Connect to example.com using a username and password",
    48  			}},
    49  		}, {
    50  			Name: "no_slash",
    51  			Help: `Set this if the site doesn't end directories with /
    52  
    53  Use this if your target website does not use / on the end of
    54  directories.
    55  
    56  A / on the end of a path is how rclone normally tells the difference
    57  between files and directories.  If this flag is set, then rclone will
    58  treat all files with Content-Type: text/html as directories and read
    59  URLs from them rather than downloading them.
    60  
    61  Note that this may cause rclone to confuse genuine HTML files with
    62  directories.`,
    63  			Default:  false,
    64  			Advanced: true,
    65  		}},
    66  	}
    67  	fs.Register(fsi)
    68  }
    69  
// Options defines the configuration for this backend
type Options struct {
	// Endpoint is read from the "url" config key; NewFs appends a trailing
	// / to it if it is missing
	Endpoint string `config:"url"`
	// NoSlash is read from the "no_slash" config key; when set, stat reports
	// objects whose Content-Type is text/html as directories
	NoSlash  bool   `config:"no_slash"`
}
    75  
// Fs stores the interface to the remote HTTP files
type Fs struct {
	// name and root are stored exactly as passed to NewFs
	name        string
	root        string
	features    *fs.Features // optional features
	opt         Options      // options for this backend
	// endpoint is the directory URL all remotes are resolved against;
	// NewFs guarantees it ends in /
	endpoint    *url.URL
	endpointURL string // endpoint as a string
	// httpClient is used for every request made by the Fs and its Objects
	httpClient  *http.Client
}
    86  
// Object is a remote object that has been stat'd (so it exists, but is not necessarily open for reading)
type Object struct {
	fs          *Fs       // the Fs this object belongs to
	remote      string    // path relative to the Fs root
	size        int64     // from the Content-Length header, -1 if it could not be parsed
	modTime     time.Time // from the Last-Modified header, timeUnset if it could not be parsed
	contentType string    // raw Content-Type header value
}
    95  
    96  // statusError returns an error if the res contained an error
    97  func statusError(res *http.Response, err error) error {
    98  	if err != nil {
    99  		return err
   100  	}
   101  	if res.StatusCode < 200 || res.StatusCode > 299 {
   102  		_ = res.Body.Close()
   103  		return errors.Errorf("HTTP Error %d: %s", res.StatusCode, res.Status)
   104  	}
   105  	return nil
   106  }
   107  
   108  // NewFs creates a new Fs object from the name and root. It connects to
   109  // the host specified in the config file.
   110  func NewFs(name, root string, m configmap.Mapper) (fs.Fs, error) {
   111  	// Parse config into Options struct
   112  	opt := new(Options)
   113  	err := configstruct.Set(m, opt)
   114  	if err != nil {
   115  		return nil, err
   116  	}
   117  
   118  	if !strings.HasSuffix(opt.Endpoint, "/") {
   119  		opt.Endpoint += "/"
   120  	}
   121  
   122  	// Parse the endpoint and stick the root onto it
   123  	base, err := url.Parse(opt.Endpoint)
   124  	if err != nil {
   125  		return nil, err
   126  	}
   127  	u, err := rest.URLJoin(base, rest.URLPathEscape(root))
   128  	if err != nil {
   129  		return nil, err
   130  	}
   131  
   132  	client := fshttp.NewClient(fs.Config)
   133  
   134  	var isFile = false
   135  	if !strings.HasSuffix(u.String(), "/") {
   136  		// Make a client which doesn't follow redirects so the server
   137  		// doesn't redirect http://host/dir to http://host/dir/
   138  		noRedir := *client
   139  		noRedir.CheckRedirect = func(req *http.Request, via []*http.Request) error {
   140  			return http.ErrUseLastResponse
   141  		}
   142  		// check to see if points to a file
   143  		res, err := noRedir.Head(u.String())
   144  		err = statusError(res, err)
   145  		if err == nil {
   146  			isFile = true
   147  		}
   148  	}
   149  
   150  	newRoot := u.String()
   151  	if isFile {
   152  		// Point to the parent if this is a file
   153  		newRoot, _ = path.Split(u.String())
   154  	} else {
   155  		if !strings.HasSuffix(newRoot, "/") {
   156  			newRoot += "/"
   157  		}
   158  	}
   159  
   160  	u, err = url.Parse(newRoot)
   161  	if err != nil {
   162  		return nil, err
   163  	}
   164  
   165  	f := &Fs{
   166  		name:        name,
   167  		root:        root,
   168  		opt:         *opt,
   169  		httpClient:  client,
   170  		endpoint:    u,
   171  		endpointURL: u.String(),
   172  	}
   173  	f.features = (&fs.Features{
   174  		CanHaveEmptyDirectories: true,
   175  	}).Fill(f)
   176  	if isFile {
   177  		return f, fs.ErrorIsFile
   178  	}
   179  	if !strings.HasSuffix(f.endpointURL, "/") {
   180  		return nil, errors.New("internal error: url doesn't end with /")
   181  	}
   182  	return f, nil
   183  }
   184  
// Name returns the configured name of the file system, exactly as it
// was passed to NewFs
func (f *Fs) Name() string {
	return f.name
}
   189  
// Root returns the root for the filesystem, exactly as it was passed
// to NewFs
func (f *Fs) Root() string {
	return f.root
}
   194  
// String returns the URL for the filesystem, i.e. the endpoint URL
// computed by NewFs
func (f *Fs) String() string {
	return f.endpointURL
}
   199  
// Features returns the optional features of this Fs, as filled in by
// NewFs
func (f *Fs) Features() *fs.Features {
	return f.features
}
   204  
// Precision is the remote http file system's modtime precision, which
// we have no way of knowing. We estimate at 1s
func (f *Fs) Precision() time.Duration {
	return time.Second
}
   209  
   210  // NewObject creates a new remote http file object
   211  func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) {
   212  	o := &Object{
   213  		fs:     f,
   214  		remote: remote,
   215  	}
   216  	err := o.stat()
   217  	if err != nil {
   218  		return nil, err
   219  	}
   220  	return o, nil
   221  }
   222  
// url joins the remote onto the base URL, path-escaping the remote
// first, and returns the result as a string
func (f *Fs) url(remote string) string {
	return f.endpointURL + rest.URLPathEscape(remote)
}
   227  
   228  // parse s into an int64, on failure return def
   229  func parseInt64(s string, def int64) int64 {
   230  	n, e := strconv.ParseInt(s, 10, 64)
   231  	if e != nil {
   232  		return def
   233  	}
   234  	return n
   235  }
   236  
// Errors returned by parseName, one per reason a link can be rejected
var (
	errURLJoinFailed     = errors.New("URLJoin failed")  // link could not be resolved against the base
	errFoundQuestionMark = errors.New("found ? in URL")  // resolved URL contains a ?
	errHostMismatch      = errors.New("host mismatch")   // link points at a different host
	errSchemeMismatch    = errors.New("scheme mismatch") // link uses a different scheme
	errNotUnderRoot      = errors.New("not under root")  // link path is not prefixed by the base path
	errNameIsEmpty       = errors.New("name is empty")   // link resolves to the base itself
	errNameContainsSlash = errors.New("name contains /") // link has a / other than as its last character
)
   247  
   248  // parseName turns a name as found in the page into a remote path or returns an error
   249  func parseName(base *url.URL, name string) (string, error) {
   250  	// make URL absolute
   251  	u, err := rest.URLJoin(base, name)
   252  	if err != nil {
   253  		return "", errURLJoinFailed
   254  	}
   255  	// check it doesn't have URL parameters
   256  	uStr := u.String()
   257  	if strings.Index(uStr, "?") >= 0 {
   258  		return "", errFoundQuestionMark
   259  	}
   260  	// check that this is going back to the same host and scheme
   261  	if base.Host != u.Host {
   262  		return "", errHostMismatch
   263  	}
   264  	if base.Scheme != u.Scheme {
   265  		return "", errSchemeMismatch
   266  	}
   267  	// check has path prefix
   268  	if !strings.HasPrefix(u.Path, base.Path) {
   269  		return "", errNotUnderRoot
   270  	}
   271  	// calculate the name relative to the base
   272  	name = u.Path[len(base.Path):]
   273  	// mustn't be empty
   274  	if name == "" {
   275  		return "", errNameIsEmpty
   276  	}
   277  	// mustn't contain a / - we are looking for a single level directory
   278  	slash := strings.Index(name, "/")
   279  	if slash >= 0 && slash != len(name)-1 {
   280  		return "", errNameContainsSlash
   281  	}
   282  	return name, nil
   283  }
   284  
   285  // Parse turns HTML for a directory into names
   286  // base should be the base URL to resolve any relative names from
   287  func parse(base *url.URL, in io.Reader) (names []string, err error) {
   288  	doc, err := html.Parse(in)
   289  	if err != nil {
   290  		return nil, err
   291  	}
   292  	var (
   293  		walk func(*html.Node)
   294  		seen = make(map[string]struct{})
   295  	)
   296  	walk = func(n *html.Node) {
   297  		if n.Type == html.ElementNode && n.Data == "a" {
   298  			for _, a := range n.Attr {
   299  				if a.Key == "href" {
   300  					name, err := parseName(base, a.Val)
   301  					if err == nil {
   302  						if _, found := seen[name]; !found {
   303  							names = append(names, name)
   304  							seen[name] = struct{}{}
   305  						}
   306  					}
   307  					break
   308  				}
   309  			}
   310  		}
   311  		for c := n.FirstChild; c != nil; c = c.NextSibling {
   312  			walk(c)
   313  		}
   314  	}
   315  	walk(doc)
   316  	return names, nil
   317  }
   318  
   319  // Read the directory passed in
   320  func (f *Fs) readDir(dir string) (names []string, err error) {
   321  	URL := f.url(dir)
   322  	u, err := url.Parse(URL)
   323  	if err != nil {
   324  		return nil, errors.Wrap(err, "failed to readDir")
   325  	}
   326  	if !strings.HasSuffix(URL, "/") {
   327  		return nil, errors.Errorf("internal error: readDir URL %q didn't end in /", URL)
   328  	}
   329  	res, err := f.httpClient.Get(URL)
   330  	if err == nil {
   331  		defer fs.CheckClose(res.Body, &err)
   332  		if res.StatusCode == http.StatusNotFound {
   333  			return nil, fs.ErrorDirNotFound
   334  		}
   335  	}
   336  	err = statusError(res, err)
   337  	if err != nil {
   338  		return nil, errors.Wrap(err, "failed to readDir")
   339  	}
   340  
   341  	contentType := strings.SplitN(res.Header.Get("Content-Type"), ";", 2)[0]
   342  	switch contentType {
   343  	case "text/html":
   344  		names, err = parse(u, res.Body)
   345  		if err != nil {
   346  			return nil, errors.Wrap(err, "readDir")
   347  		}
   348  	default:
   349  		return nil, errors.Errorf("Can't parse content type %q", contentType)
   350  	}
   351  	return names, nil
   352  }
   353  
   354  // List the objects and directories in dir into entries.  The
   355  // entries can be returned in any order but should be for a
   356  // complete directory.
   357  //
   358  // dir should be "" to list the root, and should not have
   359  // trailing slashes.
   360  //
   361  // This should return ErrDirNotFound if the directory isn't
   362  // found.
   363  func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
   364  	if !strings.HasSuffix(dir, "/") && dir != "" {
   365  		dir += "/"
   366  	}
   367  	names, err := f.readDir(dir)
   368  	if err != nil {
   369  		return nil, errors.Wrapf(err, "error listing %q", dir)
   370  	}
   371  	for _, name := range names {
   372  		isDir := name[len(name)-1] == '/'
   373  		name = strings.TrimRight(name, "/")
   374  		remote := path.Join(dir, name)
   375  		if isDir {
   376  			dir := fs.NewDir(remote, timeUnset)
   377  			entries = append(entries, dir)
   378  		} else {
   379  			file := &Object{
   380  				fs:     f,
   381  				remote: remote,
   382  			}
   383  			switch err = file.stat(); err {
   384  			case nil:
   385  				entries = append(entries, file)
   386  			case fs.ErrorNotAFile:
   387  				// ...found a directory not a file
   388  				dir := fs.NewDir(remote, timeUnset)
   389  				entries = append(entries, dir)
   390  			default:
   391  				fs.Debugf(remote, "skipping because of error: %v", err)
   392  			}
   393  		}
   394  	}
   395  	return entries, nil
   396  }
   397  
// Put is part of the fs.Fs interface but http remotes are read only,
// so it never uploads anything and always returns nil and
// errorReadOnly
func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
	return nil, errorReadOnly
}
   406  
// PutStream is part of the fs.PutStreamer interface but http remotes
// are read only, so it always returns nil and errorReadOnly
func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
	return nil, errorReadOnly
}
   411  
// Fs is the filesystem this remote http file object is located within
func (o *Object) Fs() fs.Info {
	return o.fs
}
   416  
// String returns the remote path of the object (relative to the fs
// root), or "<nil>" if called on a nil *Object
func (o *Object) String() string {
	if o == nil {
		return "<nil>"
	}
	return o.remote
}
   424  
// Remote returns the name of the remote HTTP file, relative to the fs root
func (o *Object) Remote() string {
	return o.remote
}
   429  
// Hash always returns "" and hash.ErrUnsupported, whatever hash type
// is asked for, since this backend does not provide hashes
func (o *Object) Hash(ctx context.Context, r hash.Type) (string, error) {
	return "", hash.ErrUnsupported
}
   434  
// Size returns the size in bytes of the remote http file as recorded
// by stat from the Content-Length header (-1 if that could not be
// parsed)
func (o *Object) Size() int64 {
	return o.size
}
   439  
// ModTime returns the modification time of the remote http file as
// recorded by stat from the Last-Modified header (timeUnset if that
// could not be parsed)
func (o *Object) ModTime(ctx context.Context) time.Time {
	return o.modTime
}
   444  
// url returns the native url of the object, built by the Fs from the
// object's remote
func (o *Object) url() string {
	return o.fs.url(o.remote)
}
   449  
   450  // stat updates the info field in the Object
   451  func (o *Object) stat() error {
   452  	url := o.url()
   453  	res, err := o.fs.httpClient.Head(url)
   454  	if err == nil && res.StatusCode == http.StatusNotFound {
   455  		return fs.ErrorObjectNotFound
   456  	}
   457  	err = statusError(res, err)
   458  	if err != nil {
   459  		return errors.Wrap(err, "failed to stat")
   460  	}
   461  	t, err := http.ParseTime(res.Header.Get("Last-Modified"))
   462  	if err != nil {
   463  		t = timeUnset
   464  	}
   465  	o.size = parseInt64(res.Header.Get("Content-Length"), -1)
   466  	o.modTime = t
   467  	o.contentType = res.Header.Get("Content-Type")
   468  	// If NoSlash is set then check ContentType to see if it is a directory
   469  	if o.fs.opt.NoSlash {
   470  		mediaType, _, err := mime.ParseMediaType(o.contentType)
   471  		if err != nil {
   472  			return errors.Wrapf(err, "failed to parse Content-Type: %q", o.contentType)
   473  		}
   474  		if mediaType == "text/html" {
   475  			return fs.ErrorNotAFile
   476  		}
   477  	}
   478  	return nil
   479  }
   480  
// SetModTime is part of the fs.Object interface but http remotes are
// read only, so it changes nothing and always returns errorReadOnly
func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error {
	return errorReadOnly
}
   487  
// Storable returns whether the remote http file is a regular file (not a directory, symbolic link, block device, character device, named pipe, etc)
//
// This is always true for this backend
func (o *Object) Storable() bool {
	return true
}
   492  
   493  // Open a remote http file object for reading. Seek is supported
   494  func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) {
   495  	url := o.url()
   496  	req, err := http.NewRequest("GET", url, nil)
   497  	if err != nil {
   498  		return nil, errors.Wrap(err, "Open failed")
   499  	}
   500  
   501  	// Add optional headers
   502  	for k, v := range fs.OpenOptionHeaders(options) {
   503  		req.Header.Add(k, v)
   504  	}
   505  
   506  	// Do the request
   507  	res, err := o.fs.httpClient.Do(req)
   508  	err = statusError(res, err)
   509  	if err != nil {
   510  		return nil, errors.Wrap(err, "Open failed")
   511  	}
   512  	return res.Body, nil
   513  }
   514  
// Hashes returns a hash.Set built from hash.None to indicate remote
// hashing is unavailable
func (f *Fs) Hashes() hash.Set {
	return hash.Set(hash.None)
}
   519  
// Mkdir is part of the fs.Fs interface but http remotes are read
// only, so it creates nothing and always returns errorReadOnly
func (f *Fs) Mkdir(ctx context.Context, dir string) error {
	return errorReadOnly
}
   524  
// Remove is part of the fs.Object interface but http remotes are read
// only, so it removes nothing and always returns errorReadOnly
func (o *Object) Remove(ctx context.Context) error {
	return errorReadOnly
}
   529  
// Rmdir is part of the fs.Fs interface but http remotes are read
// only, so it removes nothing and always returns errorReadOnly
func (f *Fs) Rmdir(ctx context.Context, dir string) error {
	return errorReadOnly
}
   534  
// Update is part of the fs.Object interface but http remotes are read
// only, so it reads nothing from in and always returns errorReadOnly
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error {
	return errorReadOnly
}
   539  
// MimeType of an Object if known, "" otherwise
//
// This is the Content-Type header exactly as stat stored it, so it
// includes any parameters the server sent
func (o *Object) MimeType(ctx context.Context) string {
	return o.contentType
}
   544  
// Check the interfaces are satisfied
//
// These assignments fail to compile if *Fs or *Object stop
// implementing the named interfaces
var (
	_ fs.Fs          = &Fs{}
	_ fs.PutStreamer = &Fs{}
	_ fs.Object      = &Object{}
	_ fs.MimeTyper   = &Object{}
)