github.com/ncw/rclone@v1.48.1-0.20190724201158-a35aa1360e3e/backend/http/http.go (about) 1 // Package http provides a filesystem interface using golang.org/net/http 2 // 3 // It treats HTML pages served from the endpoint as directory 4 // listings, and includes any links found as files. 5 package http 6 7 import ( 8 "context" 9 "io" 10 "mime" 11 "net/http" 12 "net/url" 13 "path" 14 "strconv" 15 "strings" 16 "time" 17 18 "github.com/ncw/rclone/fs" 19 "github.com/ncw/rclone/fs/config/configmap" 20 "github.com/ncw/rclone/fs/config/configstruct" 21 "github.com/ncw/rclone/fs/fshttp" 22 "github.com/ncw/rclone/fs/hash" 23 "github.com/ncw/rclone/lib/rest" 24 "github.com/pkg/errors" 25 "golang.org/x/net/html" 26 ) 27 28 var ( 29 errorReadOnly = errors.New("http remotes are read only") 30 timeUnset = time.Unix(0, 0) 31 ) 32 33 func init() { 34 fsi := &fs.RegInfo{ 35 Name: "http", 36 Description: "http Connection", 37 NewFs: NewFs, 38 Options: []fs.Option{{ 39 Name: "url", 40 Help: "URL of http host to connect to", 41 Required: true, 42 Examples: []fs.OptionExample{{ 43 Value: "https://example.com", 44 Help: "Connect to example.com", 45 }, { 46 Value: "https://user:pass@example.com", 47 Help: "Connect to example.com using a username and password", 48 }}, 49 }, { 50 Name: "no_slash", 51 Help: `Set this if the site doesn't end directories with / 52 53 Use this if your target website does not use / on the end of 54 directories. 55 56 A / on the end of a path is how rclone normally tells the difference 57 between files and directories. If this flag is set, then rclone will 58 treat all files with Content-Type: text/html as directories and read 59 URLs from them rather than downloading them. 60 61 Note that this may cause rclone to confuse genuine HTML files with 62 directories.`, 63 Default: false, 64 Advanced: true, 65 }}, 66 } 67 fs.Register(fsi) 68 } 69 70 // Options defines the configuration for this backend 71 type Options struct { 72 Endpoint string `config:"url"` 73 NoSlash bool `config:"no_slash"` 74 } 75 76 // Fs stores the interface to the remote HTTP files 77 type Fs struct { 78 name string 79 root string 80 features *fs.Features // optional features 81 opt Options // options for this backend 82 endpoint *url.URL 83 endpointURL string // endpoint as a string 84 httpClient *http.Client 85 } 86 87 // Object is a remote object that has been stat'd (so it exists, but is not necessarily open for reading) 88 type Object struct { 89 fs *Fs 90 remote string 91 size int64 92 modTime time.Time 93 contentType string 94 } 95 96 // statusError returns an error if the res contained an error 97 func statusError(res *http.Response, err error) error { 98 if err != nil { 99 return err 100 } 101 if res.StatusCode < 200 || res.StatusCode > 299 { 102 _ = res.Body.Close() 103 return errors.Errorf("HTTP Error %d: %s", res.StatusCode, res.Status) 104 } 105 return nil 106 } 107 108 // NewFs creates a new Fs object from the name and root. It connects to 109 // the host specified in the config file. 110 func NewFs(name, root string, m configmap.Mapper) (fs.Fs, error) { 111 // Parse config into Options struct 112 opt := new(Options) 113 err := configstruct.Set(m, opt) 114 if err != nil { 115 return nil, err 116 } 117 118 if !strings.HasSuffix(opt.Endpoint, "/") { 119 opt.Endpoint += "/" 120 } 121 122 // Parse the endpoint and stick the root onto it 123 base, err := url.Parse(opt.Endpoint) 124 if err != nil { 125 return nil, err 126 } 127 u, err := rest.URLJoin(base, rest.URLPathEscape(root)) 128 if err != nil { 129 return nil, err 130 } 131 132 client := fshttp.NewClient(fs.Config) 133 134 var isFile = false 135 if !strings.HasSuffix(u.String(), "/") { 136 // Make a client which doesn't follow redirects so the server 137 // doesn't redirect http://host/dir to http://host/dir/ 138 noRedir := *client 139 noRedir.CheckRedirect = func(req *http.Request, via []*http.Request) error { 140 return http.ErrUseLastResponse 141 } 142 // check to see if points to a file 143 res, err := noRedir.Head(u.String()) 144 err = statusError(res, err) 145 if err == nil { 146 isFile = true 147 } 148 } 149 150 newRoot := u.String() 151 if isFile { 152 // Point to the parent if this is a file 153 newRoot, _ = path.Split(u.String()) 154 } else { 155 if !strings.HasSuffix(newRoot, "/") { 156 newRoot += "/" 157 } 158 } 159 160 u, err = url.Parse(newRoot) 161 if err != nil { 162 return nil, err 163 } 164 165 f := &Fs{ 166 name: name, 167 root: root, 168 opt: *opt, 169 httpClient: client, 170 endpoint: u, 171 endpointURL: u.String(), 172 } 173 f.features = (&fs.Features{ 174 CanHaveEmptyDirectories: true, 175 }).Fill(f) 176 if isFile { 177 return f, fs.ErrorIsFile 178 } 179 if !strings.HasSuffix(f.endpointURL, "/") { 180 return nil, errors.New("internal error: url doesn't end with /") 181 } 182 return f, nil 183 } 184 185 // Name returns the configured name of the file system 186 func (f *Fs) Name() string { 187 return f.name 188 } 189 190 // Root returns the root for the filesystem 191 func (f *Fs) Root() string { 192 return f.root 193 } 194 195 // String returns the URL for the filesystem 196 func (f *Fs) String() string { 197 return f.endpointURL 198 } 199 200 // Features returns the optional features of this Fs 201 func (f *Fs) Features() *fs.Features { 202 return f.features 203 } 204 205 // Precision is the remote http file system's modtime precision, which we have no way of knowing. We estimate at 1s 206 func (f *Fs) Precision() time.Duration { 207 return time.Second 208 } 209 210 // NewObject creates a new remote http file object 211 func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) { 212 o := &Object{ 213 fs: f, 214 remote: remote, 215 } 216 err := o.stat() 217 if err != nil { 218 return nil, err 219 } 220 return o, nil 221 } 222 223 // Join's the remote onto the base URL 224 func (f *Fs) url(remote string) string { 225 return f.endpointURL + rest.URLPathEscape(remote) 226 } 227 228 // parse s into an int64, on failure return def 229 func parseInt64(s string, def int64) int64 { 230 n, e := strconv.ParseInt(s, 10, 64) 231 if e != nil { 232 return def 233 } 234 return n 235 } 236 237 // Errors returned by parseName 238 var ( 239 errURLJoinFailed = errors.New("URLJoin failed") 240 errFoundQuestionMark = errors.New("found ? in URL") 241 errHostMismatch = errors.New("host mismatch") 242 errSchemeMismatch = errors.New("scheme mismatch") 243 errNotUnderRoot = errors.New("not under root") 244 errNameIsEmpty = errors.New("name is empty") 245 errNameContainsSlash = errors.New("name contains /") 246 ) 247 248 // parseName turns a name as found in the page into a remote path or returns an error 249 func parseName(base *url.URL, name string) (string, error) { 250 // make URL absolute 251 u, err := rest.URLJoin(base, name) 252 if err != nil { 253 return "", errURLJoinFailed 254 } 255 // check it doesn't have URL parameters 256 uStr := u.String() 257 if strings.Index(uStr, "?") >= 0 { 258 return "", errFoundQuestionMark 259 } 260 // check that this is going back to the same host and scheme 261 if base.Host != u.Host { 262 return "", errHostMismatch 263 } 264 if base.Scheme != u.Scheme { 265 return "", errSchemeMismatch 266 } 267 // check has path prefix 268 if !strings.HasPrefix(u.Path, base.Path) { 269 return "", errNotUnderRoot 270 } 271 // calculate the name relative to the base 272 name = u.Path[len(base.Path):] 273 // mustn't be empty 274 if name == "" { 275 return "", errNameIsEmpty 276 } 277 // mustn't contain a / - we are looking for a single level directory 278 slash := strings.Index(name, "/") 279 if slash >= 0 && slash != len(name)-1 { 280 return "", errNameContainsSlash 281 } 282 return name, nil 283 } 284 285 // Parse turns HTML for a directory into names 286 // base should be the base URL to resolve any relative names from 287 func parse(base *url.URL, in io.Reader) (names []string, err error) { 288 doc, err := html.Parse(in) 289 if err != nil { 290 return nil, err 291 } 292 var ( 293 walk func(*html.Node) 294 seen = make(map[string]struct{}) 295 ) 296 walk = func(n *html.Node) { 297 if n.Type == html.ElementNode && n.Data == "a" { 298 for _, a := range n.Attr { 299 if a.Key == "href" { 300 name, err := parseName(base, a.Val) 301 if err == nil { 302 if _, found := seen[name]; !found { 303 names = append(names, name) 304 seen[name] = struct{}{} 305 } 306 } 307 break 308 } 309 } 310 } 311 for c := n.FirstChild; c != nil; c = c.NextSibling { 312 walk(c) 313 } 314 } 315 walk(doc) 316 return names, nil 317 } 318 319 // Read the directory passed in 320 func (f *Fs) readDir(dir string) (names []string, err error) { 321 URL := f.url(dir) 322 u, err := url.Parse(URL) 323 if err != nil { 324 return nil, errors.Wrap(err, "failed to readDir") 325 } 326 if !strings.HasSuffix(URL, "/") { 327 return nil, errors.Errorf("internal error: readDir URL %q didn't end in /", URL) 328 } 329 res, err := f.httpClient.Get(URL) 330 if err == nil { 331 defer fs.CheckClose(res.Body, &err) 332 if res.StatusCode == http.StatusNotFound { 333 return nil, fs.ErrorDirNotFound 334 } 335 } 336 err = statusError(res, err) 337 if err != nil { 338 return nil, errors.Wrap(err, "failed to readDir") 339 } 340 341 contentType := strings.SplitN(res.Header.Get("Content-Type"), ";", 2)[0] 342 switch contentType { 343 case "text/html": 344 names, err = parse(u, res.Body) 345 if err != nil { 346 return nil, errors.Wrap(err, "readDir") 347 } 348 default: 349 return nil, errors.Errorf("Can't parse content type %q", contentType) 350 } 351 return names, nil 352 } 353 354 // List the objects and directories in dir into entries. The 355 // entries can be returned in any order but should be for a 356 // complete directory. 357 // 358 // dir should be "" to list the root, and should not have 359 // trailing slashes. 360 // 361 // This should return ErrDirNotFound if the directory isn't 362 // found. 363 func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) { 364 if !strings.HasSuffix(dir, "/") && dir != "" { 365 dir += "/" 366 } 367 names, err := f.readDir(dir) 368 if err != nil { 369 return nil, errors.Wrapf(err, "error listing %q", dir) 370 } 371 for _, name := range names { 372 isDir := name[len(name)-1] == '/' 373 name = strings.TrimRight(name, "/") 374 remote := path.Join(dir, name) 375 if isDir { 376 dir := fs.NewDir(remote, timeUnset) 377 entries = append(entries, dir) 378 } else { 379 file := &Object{ 380 fs: f, 381 remote: remote, 382 } 383 switch err = file.stat(); err { 384 case nil: 385 entries = append(entries, file) 386 case fs.ErrorNotAFile: 387 // ...found a directory not a file 388 dir := fs.NewDir(remote, timeUnset) 389 entries = append(entries, dir) 390 default: 391 fs.Debugf(remote, "skipping because of error: %v", err) 392 } 393 } 394 } 395 return entries, nil 396 } 397 398 // Put in to the remote path with the modTime given of the given size 399 // 400 // May create the object even if it returns an error - if so 401 // will return the object and the error, otherwise will return 402 // nil and the error 403 func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) { 404 return nil, errorReadOnly 405 } 406 407 // PutStream uploads to the remote path with the modTime given of indeterminate size 408 func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) { 409 return nil, errorReadOnly 410 } 411 412 // Fs is the filesystem this remote http file object is located within 413 func (o *Object) Fs() fs.Info { 414 return o.fs 415 } 416 417 // String returns the URL to the remote HTTP file 418 func (o *Object) String() string { 419 if o == nil { 420 return "<nil>" 421 } 422 return o.remote 423 } 424 425 // Remote the name of the remote HTTP file, relative to the fs root 426 func (o *Object) Remote() string { 427 return o.remote 428 } 429 430 // Hash returns "" since HTTP (in Go or OpenSSH) doesn't support remote calculation of hashes 431 func (o *Object) Hash(ctx context.Context, r hash.Type) (string, error) { 432 return "", hash.ErrUnsupported 433 } 434 435 // Size returns the size in bytes of the remote http file 436 func (o *Object) Size() int64 { 437 return o.size 438 } 439 440 // ModTime returns the modification time of the remote http file 441 func (o *Object) ModTime(ctx context.Context) time.Time { 442 return o.modTime 443 } 444 445 // url returns the native url of the object 446 func (o *Object) url() string { 447 return o.fs.url(o.remote) 448 } 449 450 // stat updates the info field in the Object 451 func (o *Object) stat() error { 452 url := o.url() 453 res, err := o.fs.httpClient.Head(url) 454 if err == nil && res.StatusCode == http.StatusNotFound { 455 return fs.ErrorObjectNotFound 456 } 457 err = statusError(res, err) 458 if err != nil { 459 return errors.Wrap(err, "failed to stat") 460 } 461 t, err := http.ParseTime(res.Header.Get("Last-Modified")) 462 if err != nil { 463 t = timeUnset 464 } 465 o.size = parseInt64(res.Header.Get("Content-Length"), -1) 466 o.modTime = t 467 o.contentType = res.Header.Get("Content-Type") 468 // If NoSlash is set then check ContentType to see if it is a directory 469 if o.fs.opt.NoSlash { 470 mediaType, _, err := mime.ParseMediaType(o.contentType) 471 if err != nil { 472 return errors.Wrapf(err, "failed to parse Content-Type: %q", o.contentType) 473 } 474 if mediaType == "text/html" { 475 return fs.ErrorNotAFile 476 } 477 } 478 return nil 479 } 480 481 // SetModTime sets the modification and access time to the specified time 482 // 483 // it also updates the info field 484 func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error { 485 return errorReadOnly 486 } 487 488 // Storable returns whether the remote http file is a regular file (not a directory, symbolic link, block device, character device, named pipe, etc) 489 func (o *Object) Storable() bool { 490 return true 491 } 492 493 // Open a remote http file object for reading. Seek is supported 494 func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) { 495 url := o.url() 496 req, err := http.NewRequest("GET", url, nil) 497 if err != nil { 498 return nil, errors.Wrap(err, "Open failed") 499 } 500 501 // Add optional headers 502 for k, v := range fs.OpenOptionHeaders(options) { 503 req.Header.Add(k, v) 504 } 505 506 // Do the request 507 res, err := o.fs.httpClient.Do(req) 508 err = statusError(res, err) 509 if err != nil { 510 return nil, errors.Wrap(err, "Open failed") 511 } 512 return res.Body, nil 513 } 514 515 // Hashes returns hash.HashNone to indicate remote hashing is unavailable 516 func (f *Fs) Hashes() hash.Set { 517 return hash.Set(hash.None) 518 } 519 520 // Mkdir makes the root directory of the Fs object 521 func (f *Fs) Mkdir(ctx context.Context, dir string) error { 522 return errorReadOnly 523 } 524 525 // Remove a remote http file object 526 func (o *Object) Remove(ctx context.Context) error { 527 return errorReadOnly 528 } 529 530 // Rmdir removes the root directory of the Fs object 531 func (f *Fs) Rmdir(ctx context.Context, dir string) error { 532 return errorReadOnly 533 } 534 535 // Update in to the object with the modTime given of the given size 536 func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error { 537 return errorReadOnly 538 } 539 540 // MimeType of an Object if known, "" otherwise 541 func (o *Object) MimeType(ctx context.Context) string { 542 return o.contentType 543 } 544 545 // Check the interfaces are satisfied 546 var ( 547 _ fs.Fs = &Fs{} 548 _ fs.PutStreamer = &Fs{} 549 _ fs.Object = &Object{} 550 _ fs.MimeTyper = &Object{} 551 )