github.com/10XDev/rclone@v1.52.3-0.20200626220027-16af9ab76b2a/backend/http/http.go (about) 1 // Package http provides a filesystem interface using golang.org/net/http 2 // 3 // It treats HTML pages served from the endpoint as directory 4 // listings, and includes any links found as files. 5 package http 6 7 import ( 8 "context" 9 "io" 10 "mime" 11 "net/http" 12 "net/url" 13 "path" 14 "strconv" 15 "strings" 16 "sync" 17 "time" 18 19 "github.com/pkg/errors" 20 "github.com/rclone/rclone/fs" 21 "github.com/rclone/rclone/fs/config/configmap" 22 "github.com/rclone/rclone/fs/config/configstruct" 23 "github.com/rclone/rclone/fs/fshttp" 24 "github.com/rclone/rclone/fs/hash" 25 "github.com/rclone/rclone/lib/rest" 26 "golang.org/x/net/html" 27 ) 28 29 var ( 30 errorReadOnly = errors.New("http remotes are read only") 31 timeUnset = time.Unix(0, 0) 32 ) 33 34 func init() { 35 fsi := &fs.RegInfo{ 36 Name: "http", 37 Description: "http Connection", 38 NewFs: NewFs, 39 Options: []fs.Option{{ 40 Name: "url", 41 Help: "URL of http host to connect to", 42 Required: true, 43 Examples: []fs.OptionExample{{ 44 Value: "https://example.com", 45 Help: "Connect to example.com", 46 }, { 47 Value: "https://user:pass@example.com", 48 Help: "Connect to example.com using a username and password", 49 }}, 50 }, { 51 Name: "headers", 52 Help: `Set HTTP headers for all transactions 53 54 Use this to set additional HTTP headers for all transactions 55 56 The input format is comma separated list of key,value pairs. Standard 57 [CSV encoding](https://godoc.org/encoding/csv) may be used. 58 59 For example to set a Cookie use 'Cookie,name=value', or '"Cookie","name=value"'. 60 61 You can set multiple headers, eg '"Cookie","name=value","Authorization","xxx"'. 62 `, 63 Default: fs.CommaSepList{}, 64 Advanced: true, 65 }, { 66 Name: "no_slash", 67 Help: `Set this if the site doesn't end directories with / 68 69 Use this if your target website does not use / on the end of 70 directories. 71 72 A / on the end of a path is how rclone normally tells the difference 73 between files and directories. If this flag is set, then rclone will 74 treat all files with Content-Type: text/html as directories and read 75 URLs from them rather than downloading them. 76 77 Note that this may cause rclone to confuse genuine HTML files with 78 directories.`, 79 Default: false, 80 Advanced: true, 81 }, { 82 Name: "no_head", 83 Help: `Don't use HEAD requests to find file sizes in dir listing 84 85 If your site is being very slow to load then you can try this option. 86 Normally rclone does a HEAD request for each potential file in a 87 directory listing to: 88 89 - find its size 90 - check it really exists 91 - check to see if it is a directory 92 93 If you set this option, rclone will not do the HEAD request. This will mean 94 95 - directory listings are much quicker 96 - rclone won't have the times or sizes of any files 97 - some files that don't exist may be in the listing 98 `, 99 Default: false, 100 Advanced: true, 101 }}, 102 } 103 fs.Register(fsi) 104 } 105 106 // Options defines the configuration for this backend 107 type Options struct { 108 Endpoint string `config:"url"` 109 NoSlash bool `config:"no_slash"` 110 NoHead bool `config:"no_head"` 111 Headers fs.CommaSepList `config:"headers"` 112 } 113 114 // Fs stores the interface to the remote HTTP files 115 type Fs struct { 116 name string 117 root string 118 features *fs.Features // optional features 119 opt Options // options for this backend 120 endpoint *url.URL 121 endpointURL string // endpoint as a string 122 httpClient *http.Client 123 } 124 125 // Object is a remote object that has been stat'd (so it exists, but is not necessarily open for reading) 126 type Object struct { 127 fs *Fs 128 remote string 129 size int64 130 modTime time.Time 131 contentType string 132 } 133 134 // statusError returns an error if the res contained an error 135 func statusError(res *http.Response, err error) error { 136 if err != nil { 137 return err 138 } 139 if res.StatusCode < 200 || res.StatusCode > 299 { 140 _ = res.Body.Close() 141 return errors.Errorf("HTTP Error %d: %s", res.StatusCode, res.Status) 142 } 143 return nil 144 } 145 146 // NewFs creates a new Fs object from the name and root. It connects to 147 // the host specified in the config file. 148 func NewFs(name, root string, m configmap.Mapper) (fs.Fs, error) { 149 ctx := context.TODO() 150 // Parse config into Options struct 151 opt := new(Options) 152 err := configstruct.Set(m, opt) 153 if err != nil { 154 return nil, err 155 } 156 157 if len(opt.Headers)%2 != 0 { 158 return nil, errors.New("odd number of headers supplied") 159 } 160 161 if !strings.HasSuffix(opt.Endpoint, "/") { 162 opt.Endpoint += "/" 163 } 164 165 // Parse the endpoint and stick the root onto it 166 base, err := url.Parse(opt.Endpoint) 167 if err != nil { 168 return nil, err 169 } 170 u, err := rest.URLJoin(base, rest.URLPathEscape(root)) 171 if err != nil { 172 return nil, err 173 } 174 175 client := fshttp.NewClient(fs.Config) 176 177 var isFile = false 178 if !strings.HasSuffix(u.String(), "/") { 179 // Make a client which doesn't follow redirects so the server 180 // doesn't redirect http://host/dir to http://host/dir/ 181 noRedir := *client 182 noRedir.CheckRedirect = func(req *http.Request, via []*http.Request) error { 183 return http.ErrUseLastResponse 184 } 185 // check to see if points to a file 186 req, err := http.NewRequest("HEAD", u.String(), nil) 187 if err == nil { 188 req = req.WithContext(ctx) // go1.13 can use NewRequestWithContext 189 addHeaders(req, opt) 190 res, err := noRedir.Do(req) 191 err = statusError(res, err) 192 if err == nil { 193 isFile = true 194 } 195 } 196 } 197 198 newRoot := u.String() 199 if isFile { 200 // Point to the parent if this is a file 201 newRoot, _ = path.Split(u.String()) 202 } else { 203 if !strings.HasSuffix(newRoot, "/") { 204 newRoot += "/" 205 } 206 } 207 208 u, err = url.Parse(newRoot) 209 if err != nil { 210 return nil, err 211 } 212 213 f := &Fs{ 214 name: name, 215 root: root, 216 opt: *opt, 217 httpClient: client, 218 endpoint: u, 219 endpointURL: u.String(), 220 } 221 f.features = (&fs.Features{ 222 CanHaveEmptyDirectories: true, 223 }).Fill(f) 224 if isFile { 225 return f, fs.ErrorIsFile 226 } 227 if !strings.HasSuffix(f.endpointURL, "/") { 228 return nil, errors.New("internal error: url doesn't end with /") 229 } 230 return f, nil 231 } 232 233 // Name returns the configured name of the file system 234 func (f *Fs) Name() string { 235 return f.name 236 } 237 238 // Root returns the root for the filesystem 239 func (f *Fs) Root() string { 240 return f.root 241 } 242 243 // String returns the URL for the filesystem 244 func (f *Fs) String() string { 245 return f.endpointURL 246 } 247 248 // Features returns the optional features of this Fs 249 func (f *Fs) Features() *fs.Features { 250 return f.features 251 } 252 253 // Precision is the remote http file system's modtime precision, which we have no way of knowing. We estimate at 1s 254 func (f *Fs) Precision() time.Duration { 255 return time.Second 256 } 257 258 // NewObject creates a new remote http file object 259 func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) { 260 o := &Object{ 261 fs: f, 262 remote: remote, 263 } 264 err := o.stat(ctx) 265 if err != nil { 266 return nil, err 267 } 268 return o, nil 269 } 270 271 // Join's the remote onto the base URL 272 func (f *Fs) url(remote string) string { 273 return f.endpointURL + rest.URLPathEscape(remote) 274 } 275 276 // parse s into an int64, on failure return def 277 func parseInt64(s string, def int64) int64 { 278 n, e := strconv.ParseInt(s, 10, 64) 279 if e != nil { 280 return def 281 } 282 return n 283 } 284 285 // Errors returned by parseName 286 var ( 287 errURLJoinFailed = errors.New("URLJoin failed") 288 errFoundQuestionMark = errors.New("found ? in URL") 289 errHostMismatch = errors.New("host mismatch") 290 errSchemeMismatch = errors.New("scheme mismatch") 291 errNotUnderRoot = errors.New("not under root") 292 errNameIsEmpty = errors.New("name is empty") 293 errNameContainsSlash = errors.New("name contains /") 294 ) 295 296 // parseName turns a name as found in the page into a remote path or returns an error 297 func parseName(base *url.URL, name string) (string, error) { 298 // make URL absolute 299 u, err := rest.URLJoin(base, name) 300 if err != nil { 301 return "", errURLJoinFailed 302 } 303 // check it doesn't have URL parameters 304 uStr := u.String() 305 if strings.Index(uStr, "?") >= 0 { 306 return "", errFoundQuestionMark 307 } 308 // check that this is going back to the same host and scheme 309 if base.Host != u.Host { 310 return "", errHostMismatch 311 } 312 if base.Scheme != u.Scheme { 313 return "", errSchemeMismatch 314 } 315 // check has path prefix 316 if !strings.HasPrefix(u.Path, base.Path) { 317 return "", errNotUnderRoot 318 } 319 // calculate the name relative to the base 320 name = u.Path[len(base.Path):] 321 // mustn't be empty 322 if name == "" { 323 return "", errNameIsEmpty 324 } 325 // mustn't contain a / - we are looking for a single level directory 326 slash := strings.Index(name, "/") 327 if slash >= 0 && slash != len(name)-1 { 328 return "", errNameContainsSlash 329 } 330 return name, nil 331 } 332 333 // Parse turns HTML for a directory into names 334 // base should be the base URL to resolve any relative names from 335 func parse(base *url.URL, in io.Reader) (names []string, err error) { 336 doc, err := html.Parse(in) 337 if err != nil { 338 return nil, err 339 } 340 var ( 341 walk func(*html.Node) 342 seen = make(map[string]struct{}) 343 ) 344 walk = func(n *html.Node) { 345 if n.Type == html.ElementNode && n.Data == "a" { 346 for _, a := range n.Attr { 347 if a.Key == "href" { 348 name, err := parseName(base, a.Val) 349 if err == nil { 350 if _, found := seen[name]; !found { 351 names = append(names, name) 352 seen[name] = struct{}{} 353 } 354 } 355 break 356 } 357 } 358 } 359 for c := n.FirstChild; c != nil; c = c.NextSibling { 360 walk(c) 361 } 362 } 363 walk(doc) 364 return names, nil 365 } 366 367 // Adds the configured headers to the request if any 368 func addHeaders(req *http.Request, opt *Options) { 369 for i := 0; i < len(opt.Headers); i += 2 { 370 key := opt.Headers[i] 371 value := opt.Headers[i+1] 372 req.Header.Add(key, value) 373 } 374 } 375 376 // Adds the configured headers to the request if any 377 func (f *Fs) addHeaders(req *http.Request) { 378 addHeaders(req, &f.opt) 379 } 380 381 // Read the directory passed in 382 func (f *Fs) readDir(ctx context.Context, dir string) (names []string, err error) { 383 URL := f.url(dir) 384 u, err := url.Parse(URL) 385 if err != nil { 386 return nil, errors.Wrap(err, "failed to readDir") 387 } 388 if !strings.HasSuffix(URL, "/") { 389 return nil, errors.Errorf("internal error: readDir URL %q didn't end in /", URL) 390 } 391 // Do the request 392 req, err := http.NewRequest("GET", URL, nil) 393 if err != nil { 394 return nil, errors.Wrap(err, "readDir failed") 395 } 396 req = req.WithContext(ctx) // go1.13 can use NewRequestWithContext 397 f.addHeaders(req) 398 res, err := f.httpClient.Do(req) 399 if err == nil { 400 defer fs.CheckClose(res.Body, &err) 401 if res.StatusCode == http.StatusNotFound { 402 return nil, fs.ErrorDirNotFound 403 } 404 } 405 err = statusError(res, err) 406 if err != nil { 407 return nil, errors.Wrap(err, "failed to readDir") 408 } 409 410 contentType := strings.SplitN(res.Header.Get("Content-Type"), ";", 2)[0] 411 switch contentType { 412 case "text/html": 413 names, err = parse(u, res.Body) 414 if err != nil { 415 return nil, errors.Wrap(err, "readDir") 416 } 417 default: 418 return nil, errors.Errorf("Can't parse content type %q", contentType) 419 } 420 return names, nil 421 } 422 423 // List the objects and directories in dir into entries. The 424 // entries can be returned in any order but should be for a 425 // complete directory. 426 // 427 // dir should be "" to list the root, and should not have 428 // trailing slashes. 429 // 430 // This should return ErrDirNotFound if the directory isn't 431 // found. 432 func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) { 433 if !strings.HasSuffix(dir, "/") && dir != "" { 434 dir += "/" 435 } 436 names, err := f.readDir(ctx, dir) 437 if err != nil { 438 return nil, errors.Wrapf(err, "error listing %q", dir) 439 } 440 var ( 441 entriesMu sync.Mutex // to protect entries 442 wg sync.WaitGroup 443 in = make(chan string, fs.Config.Checkers) 444 ) 445 add := func(entry fs.DirEntry) { 446 entriesMu.Lock() 447 entries = append(entries, entry) 448 entriesMu.Unlock() 449 } 450 for i := 0; i < fs.Config.Checkers; i++ { 451 wg.Add(1) 452 go func() { 453 defer wg.Done() 454 for remote := range in { 455 file := &Object{ 456 fs: f, 457 remote: remote, 458 } 459 switch err := file.stat(ctx); err { 460 case nil: 461 add(file) 462 case fs.ErrorNotAFile: 463 // ...found a directory not a file 464 add(fs.NewDir(remote, timeUnset)) 465 default: 466 fs.Debugf(remote, "skipping because of error: %v", err) 467 } 468 } 469 }() 470 } 471 for _, name := range names { 472 isDir := name[len(name)-1] == '/' 473 name = strings.TrimRight(name, "/") 474 remote := path.Join(dir, name) 475 if isDir { 476 add(fs.NewDir(remote, timeUnset)) 477 } else { 478 in <- remote 479 } 480 } 481 close(in) 482 wg.Wait() 483 return entries, nil 484 } 485 486 // Put in to the remote path with the modTime given of the given size 487 // 488 // May create the object even if it returns an error - if so 489 // will return the object and the error, otherwise will return 490 // nil and the error 491 func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) { 492 return nil, errorReadOnly 493 } 494 495 // PutStream uploads to the remote path with the modTime given of indeterminate size 496 func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) { 497 return nil, errorReadOnly 498 } 499 500 // Fs is the filesystem this remote http file object is located within 501 func (o *Object) Fs() fs.Info { 502 return o.fs 503 } 504 505 // String returns the URL to the remote HTTP file 506 func (o *Object) String() string { 507 if o == nil { 508 return "<nil>" 509 } 510 return o.remote 511 } 512 513 // Remote the name of the remote HTTP file, relative to the fs root 514 func (o *Object) Remote() string { 515 return o.remote 516 } 517 518 // Hash returns "" since HTTP (in Go or OpenSSH) doesn't support remote calculation of hashes 519 func (o *Object) Hash(ctx context.Context, r hash.Type) (string, error) { 520 return "", hash.ErrUnsupported 521 } 522 523 // Size returns the size in bytes of the remote http file 524 func (o *Object) Size() int64 { 525 return o.size 526 } 527 528 // ModTime returns the modification time of the remote http file 529 func (o *Object) ModTime(ctx context.Context) time.Time { 530 return o.modTime 531 } 532 533 // url returns the native url of the object 534 func (o *Object) url() string { 535 return o.fs.url(o.remote) 536 } 537 538 // stat updates the info field in the Object 539 func (o *Object) stat(ctx context.Context) error { 540 if o.fs.opt.NoHead { 541 o.size = -1 542 o.modTime = timeUnset 543 o.contentType = fs.MimeType(ctx, o) 544 return nil 545 } 546 url := o.url() 547 req, err := http.NewRequest("HEAD", url, nil) 548 if err != nil { 549 return errors.Wrap(err, "stat failed") 550 } 551 req = req.WithContext(ctx) // go1.13 can use NewRequestWithContext 552 o.fs.addHeaders(req) 553 res, err := o.fs.httpClient.Do(req) 554 if err == nil && res.StatusCode == http.StatusNotFound { 555 return fs.ErrorObjectNotFound 556 } 557 err = statusError(res, err) 558 if err != nil { 559 return errors.Wrap(err, "failed to stat") 560 } 561 t, err := http.ParseTime(res.Header.Get("Last-Modified")) 562 if err != nil { 563 t = timeUnset 564 } 565 o.size = parseInt64(res.Header.Get("Content-Length"), -1) 566 o.modTime = t 567 o.contentType = res.Header.Get("Content-Type") 568 // If NoSlash is set then check ContentType to see if it is a directory 569 if o.fs.opt.NoSlash { 570 mediaType, _, err := mime.ParseMediaType(o.contentType) 571 if err != nil { 572 return errors.Wrapf(err, "failed to parse Content-Type: %q", o.contentType) 573 } 574 if mediaType == "text/html" { 575 return fs.ErrorNotAFile 576 } 577 } 578 return nil 579 } 580 581 // SetModTime sets the modification and access time to the specified time 582 // 583 // it also updates the info field 584 func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error { 585 return errorReadOnly 586 } 587 588 // Storable returns whether the remote http file is a regular file (not a directory, symbolic link, block device, character device, named pipe, etc) 589 func (o *Object) Storable() bool { 590 return true 591 } 592 593 // Open a remote http file object for reading. Seek is supported 594 func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) { 595 url := o.url() 596 req, err := http.NewRequest("GET", url, nil) 597 if err != nil { 598 return nil, errors.Wrap(err, "Open failed") 599 } 600 req = req.WithContext(ctx) // go1.13 can use NewRequestWithContext 601 602 // Add optional headers 603 for k, v := range fs.OpenOptionHeaders(options) { 604 req.Header.Add(k, v) 605 } 606 o.fs.addHeaders(req) 607 608 // Do the request 609 res, err := o.fs.httpClient.Do(req) 610 err = statusError(res, err) 611 if err != nil { 612 return nil, errors.Wrap(err, "Open failed") 613 } 614 return res.Body, nil 615 } 616 617 // Hashes returns hash.HashNone to indicate remote hashing is unavailable 618 func (f *Fs) Hashes() hash.Set { 619 return hash.Set(hash.None) 620 } 621 622 // Mkdir makes the root directory of the Fs object 623 func (f *Fs) Mkdir(ctx context.Context, dir string) error { 624 return errorReadOnly 625 } 626 627 // Remove a remote http file object 628 func (o *Object) Remove(ctx context.Context) error { 629 return errorReadOnly 630 } 631 632 // Rmdir removes the root directory of the Fs object 633 func (f *Fs) Rmdir(ctx context.Context, dir string) error { 634 return errorReadOnly 635 } 636 637 // Update in to the object with the modTime given of the given size 638 func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error { 639 return errorReadOnly 640 } 641 642 // MimeType of an Object if known, "" otherwise 643 func (o *Object) MimeType(ctx context.Context) string { 644 return o.contentType 645 } 646 647 // Check the interfaces are satisfied 648 var ( 649 _ fs.Fs = &Fs{} 650 _ fs.PutStreamer = &Fs{} 651 _ fs.Object = &Object{} 652 _ fs.MimeTyper = &Object{} 653 )