// Package internetarchive provides an interface to Internet Archive's Item
// via their native API rather than using S3-compatible endpoints.
package internetarchive

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"path"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/ncw/swift/v2"
	"github.com/rclone/rclone/fs"
	"github.com/rclone/rclone/fs/config"
	"github.com/rclone/rclone/fs/config/configmap"
	"github.com/rclone/rclone/fs/config/configstruct"
	"github.com/rclone/rclone/fs/fserrors"
	"github.com/rclone/rclone/fs/fshttp"
	"github.com/rclone/rclone/fs/hash"
	"github.com/rclone/rclone/lib/bucket"
	"github.com/rclone/rclone/lib/encoder"
	"github.com/rclone/rclone/lib/pacer"
	"github.com/rclone/rclone/lib/random"
	"github.com/rclone/rclone/lib/rest"
)

// Register with Fs
func init() {
	fs.Register(&fs.RegInfo{
		Name:        "internetarchive",
		Description: "Internet Archive",
		NewFs:       NewFs,

		MetadataInfo: &fs.MetadataInfo{
			System: map[string]fs.MetadataHelp{
				"name": {
					Help:     "Full file path, without the bucket part",
					Type:     "filename",
					Example:  "backend/internetarchive/internetarchive.go",
					ReadOnly: true,
				},
				"source": {
					Help:     "The source of the file",
					Type:     "string",
					Example:  "original",
					ReadOnly: true,
				},
				"mtime": {
					Help:     "Time of last modification, managed by Rclone",
					Type:     "RFC 3339",
					Example:  "2006-01-02T15:04:05.999999999Z",
					ReadOnly: true,
				},
				"size": {
					Help:     "File size in bytes",
					Type:     "decimal number",
					Example:  "123456",
					ReadOnly: true,
				},
				"md5": {
					Help:     "MD5 hash calculated by Internet Archive",
					Type:     "string",
					Example:  "01234567012345670123456701234567",
					ReadOnly: true,
				},
				"crc32": {
					Help:     "CRC32 calculated by Internet Archive",
					Type:     "string",
					Example:  "01234567",
					ReadOnly: true,
				},
				"sha1": {
					Help:     "SHA1 hash calculated by Internet Archive",
					Type:     "string",
					Example:  "0123456701234567012345670123456701234567",
					ReadOnly: true,
				},
				"format": {
					Help:     "Name of format identified by Internet Archive",
					Type:     "string",
					Example:  "Comma-Separated Values",
					ReadOnly: true,
				},
				"old_version": {
					Help:     "Whether the file was replaced and moved by keep-old-version flag",
					Type:     "boolean",
					Example:  "true",
					ReadOnly: true,
				},
				"viruscheck": {
					Help:     "The last time viruscheck process was run for the file (?)",
					Type:     "unixtime",
					Example:  "1654191352",
					ReadOnly: true,
				},
				"summation": {
					Help:     "Check https://forum.rclone.org/t/31922 for how it is used",
					Type:     "string",
					Example:  "md5",
					ReadOnly: true,
				},

				"rclone-ia-mtime": {
					Help:    "Time of last modification, managed by Internet Archive",
					Type:    "RFC 3339",
					Example: "2006-01-02T15:04:05.999999999Z",
				},
				"rclone-mtime": {
					Help:    "Time of last modification, managed by Rclone",
					Type:    "RFC 3339",
					Example: "2006-01-02T15:04:05.999999999Z",
				},
				"rclone-update-track": {
					Help:    "Random value used by Rclone for tracking changes inside Internet Archive",
					Type:    "string",
					Example: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
				},
			},
			Help: `Metadata fields provided by Internet Archive.
If there are multiple values for a key, only the first one is returned.
This is a limitation of Rclone, which supports only one value per key.

Owners are able to add custom keys; the metadata feature grabs all keys, custom ones included.
`,
		},

		Options: []fs.Option{{
			Name:      "access_key_id",
			Help:      "IAS3 Access Key.\n\nLeave blank for anonymous access.\nYou can find one here: https://archive.org/account/s3.php",
			Sensitive: true,
		}, {
			Name:      "secret_access_key",
			Help:      "IAS3 Secret Key (password).\n\nLeave blank for anonymous access.",
			Sensitive: true,
		}, {
			// their official client (https://github.com/jjjake/internetarchive) hardcodes the following two
			Name:     "endpoint",
			Help:     "IAS3 Endpoint.\n\nLeave blank for default value.",
			Default:  "https://s3.us.archive.org",
			Advanced: true,
		}, {
			Name:     "front_endpoint",
			Help:     "Host of InternetArchive Frontend.\n\nLeave blank for default value.",
			Default:  "https://archive.org",
			Advanced: true,
		}, {
			Name: "disable_checksum",
			Help: `Don't ask the server to test against the MD5 checksum calculated by rclone.
Normally rclone will calculate the MD5 checksum of the input before
uploading it so it can ask the server to check the object against the checksum.
This is great for data integrity checking but can cause long delays for
large files to start uploading.`,
			Default:  true,
			Advanced: true,
		}, {
			Name: "wait_archive",
			Help: `Timeout for waiting for the server's processing tasks (specifically archive and book_op) to finish.
Only enable it if you need a guarantee that write operations are reflected on the server before they return.
Set to 0 to disable waiting. No errors will be thrown in case of timeout.`,
			Default:  fs.Duration(0),
			Advanced: true,
		}, {
			Name:     config.ConfigEncoding,
			Help:     config.ConfigEncodingHelp,
			Advanced: true,
			Default: encoder.EncodeZero |
				encoder.EncodeSlash |
				encoder.EncodeLtGt |
				encoder.EncodeCrLf |
				encoder.EncodeDel |
				encoder.EncodeCtl |
				encoder.EncodeInvalidUtf8 |
				encoder.EncodeDot,
		},
	}})
}
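
// A minimal rclone.conf section for this backend might look like the
// following (a sketch; "ia" is an arbitrary remote name, the keys come from
// the Options above, and the XXXX values are placeholders):
//
//	[ia]
//	type = internetarchive
//	access_key_id = XXXXXXXXXXXXXXXX
//	secret_access_key = XXXXXXXXXXXXXXXX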

// Maximum size of an item. This is constant across all items.
const iaItemMaxSize int64 = 1099511627776

// metadata keys that are not writable
var roMetadataKey = map[string]interface{}{
	// do not add mtime here, it's a documented exception
	"name": nil, "source": nil, "size": nil, "md5": nil,
	"crc32": nil, "sha1": nil, "format": nil, "old_version": nil,
	"viruscheck": nil, "summation": nil,
}

// Options defines the configuration for this backend
type Options struct {
	AccessKeyID     string               `config:"access_key_id"`
	SecretAccessKey string               `config:"secret_access_key"`
	Endpoint        string               `config:"endpoint"`
	FrontEndpoint   string               `config:"front_endpoint"`
	DisableChecksum bool                 `config:"disable_checksum"`
	WaitArchive     fs.Duration          `config:"wait_archive"`
	Enc             encoder.MultiEncoder `config:"encoding"`
}

// Fs represents an IAS3 remote
type Fs struct {
	name     string       // name of this remote
	root     string       // the path we are working on if any
	opt      Options      // parsed config options
	features *fs.Features // optional features
	srv      *rest.Client // the connection to IAS3
	front    *rest.Client // the connection to the frontend
	pacer    *fs.Pacer    // pacer for API calls
	ctx      context.Context
}

// Object describes a file at IA
type Object struct {
	fs      *Fs       // reference to Fs
	remote  string    // the remote path
	modTime time.Time // last modified time
	size    int64     // size of the file in bytes
	md5     string    // md5 hash of the file presented by the server
	sha1    string    // sha1 hash of the file presented by the server
	crc32   string    // crc32 of the file presented by the server
	rawData json.RawMessage
}

// IAFile represents a subset of object in MetadataResponse.Files
type IAFile struct {
	Name string `json:"name"`
	// Source string `json:"source"`
	Mtime       string          `json:"mtime"`
	RcloneMtime json.RawMessage `json:"rclone-mtime"`
	UpdateTrack json.RawMessage `json:"rclone-update-track"`
	Size        string          `json:"size"`
	Md5         string          `json:"md5"`
	Crc32       string          `json:"crc32"`
	Sha1        string          `json:"sha1"`
	Summation   string          `json:"summation"`

	rawData json.RawMessage
}

// MetadataResponse represents a subset of the JSON object returned by (frontend)/metadata/
type MetadataResponse struct {
	Files    []IAFile `json:"files"`
	ItemSize int64    `json:"item_size"`
}

// MetadataResponseRaw is the form of MetadataResponse to deal with metadata
type MetadataResponseRaw struct {
	Files    []json.RawMessage `json:"files"`
	ItemSize int64             `json:"item_size"`
}

// ModMetadataResponse represents the response for amending metadata
type ModMetadataResponse struct {
	// https://archive.org/services/docs/api/md-write.html#example
	Success bool   `json:"success"`
	Error   string `json:"error"`
}
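
// An abridged sketch of what (frontend)/metadata/:item returns, matching the
// JSON tags above (values are illustrative; note that per-file sizes and
// times come back as strings, while item_size is a number):
//
//	{
//	  "files": [
//	    {"name": "file.txt", "mtime": "1654191352", "size": "123456",
//	     "md5": "...", "crc32": "...", "sha1": "...",
//	     "rclone-mtime": "2006-01-02T15:04:05.999999999Z"}
//	  ],
//	  "item_size": 123456
//	}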

// Name of the remote (as passed into NewFs)
func (f *Fs) Name() string {
	return f.name
}

// Root of the remote (as passed into NewFs)
func (f *Fs) Root() string {
	return f.root
}

// String converts this Fs to a string
func (f *Fs) String() string {
	bucket, file := f.split("")
	if bucket == "" {
		return "Internet Archive root"
	}
	if file == "" {
		return fmt.Sprintf("Internet Archive item %s", bucket)
	}
	return fmt.Sprintf("Internet Archive item %s path %s", bucket, file)
}

// Features returns the optional features of this Fs
func (f *Fs) Features() *fs.Features {
	return f.features
}

// Hashes returns the types of hashes supported by IA
func (f *Fs) Hashes() hash.Set {
	return hash.NewHashSet(hash.MD5, hash.SHA1, hash.CRC32)
}

// Precision returns the precision of mtime that the server reports
func (f *Fs) Precision() time.Duration {
	if f.opt.WaitArchive == 0 {
		return fs.ModTimeNotSupported
	}
	return time.Nanosecond
}

// retryErrorCodes is a slice of error codes that we will retry
// See: https://docs.aws.amazon.com/AmazonS3/latest/API/ErrorResponses.html
var retryErrorCodes = []int{
	429, // Too Many Requests
	500, // Internal Server Error - "We encountered an internal error. Please try again."
	503, // Service Unavailable/Slow Down - "Reduce your request rate"
}

// NewFs constructs an Fs from the path
func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, error) {
	// Parse config into Options struct
	opt := new(Options)
	err := configstruct.Set(m, opt)
	if err != nil {
		return nil, err
	}

	// Parse the endpoints
	ep, err := url.Parse(opt.Endpoint)
	if err != nil {
		return nil, err
	}
	fe, err := url.Parse(opt.FrontEndpoint)
	if err != nil {
		return nil, err
	}

	root = strings.Trim(root, "/")

	f := &Fs{
		name: name,
		opt:  *opt,
		ctx:  ctx,
	}
	f.setRoot(root)
	f.features = (&fs.Features{
		BucketBased:   true,
		ReadMetadata:  true,
		WriteMetadata: true,
		UserMetadata:  true,
	}).Fill(ctx, f)

	f.srv = rest.NewClient(fshttp.NewClient(ctx))
	f.srv.SetRoot(ep.String())

	f.front = rest.NewClient(fshttp.NewClient(ctx))
	f.front.SetRoot(fe.String())

	if opt.AccessKeyID != "" && opt.SecretAccessKey != "" {
		auth := fmt.Sprintf("LOW %s:%s", opt.AccessKeyID, opt.SecretAccessKey)
		f.srv.SetHeader("Authorization", auth)
		f.front.SetHeader("Authorization", auth)
	}

	f.pacer = fs.NewPacer(ctx, pacer.NewS3(pacer.MinSleep(10*time.Millisecond)))

	// test if the root exists as a file
	_, err = f.NewObject(ctx, "/")
	if err == nil {
		f.setRoot(betterPathDir(root))
		return f, fs.ErrorIsFile
	}
	return f, nil
}

// setRoot changes the root of the Fs
func (f *Fs) setRoot(root string) {
	f.root = strings.Trim(root, "/")
}

// Remote returns the remote path
func (o *Object) Remote() string {
	return o.remote
}

// ModTime is the last modified time (read-only)
func (o *Object) ModTime(ctx context.Context) time.Time {
	return o.modTime
}

// Size is the file length
func (o *Object) Size() int64 {
	return o.size
}

// Fs returns the parent Fs
func (o *Object) Fs() fs.Info {
	return o.fs
}

// Hash returns the hash value presented by IA
func (o *Object) Hash(ctx context.Context, ty hash.Type) (string, error) {
	if ty == hash.MD5 {
		return o.md5, nil
	}
	if ty == hash.SHA1 {
		return o.sha1, nil
	}
	if ty == hash.CRC32 {
		return o.crc32, nil
	}
	return "", hash.ErrUnsupported
}

// Storable returns if this object is storable
func (o *Object) Storable() bool {
	return true
}

// SetModTime sets modTime on a particular file
func (o *Object) SetModTime(ctx context.Context, t time.Time) (err error) {
	bucket, reqDir := o.split()
	if bucket == "" {
		return fs.ErrorCantSetModTime
	}
	if reqDir == "" {
		return fs.ErrorCantSetModTime
	}

	// https://archive.org/services/docs/api/md-write.html
	// the following code might be useful for modifying metadata of an uploaded file
	patch := []map[string]string{
		// we should drop it first to clear all rclone-provided mtimes
		{
			"op":   "remove",
			"path": "/rclone-mtime",
		}, {
			"op":    "add",
			"path":  "/rclone-mtime",
			"value": t.Format(time.RFC3339Nano),
		}}
	res, err := json.Marshal(patch)
	if err != nil {
		return err
	}
	params := url.Values{}
	params.Add("-target", fmt.Sprintf("files/%s", reqDir))
	params.Add("-patch", string(res))
	body := []byte(params.Encode())
	bodyLen := int64(len(body))

	var resp *http.Response
	var result ModMetadataResponse
	// make a POST request to (frontend)/metadata/:item/
	opts := rest.Opts{
		Method:        "POST",
		Path:          path.Join("/metadata/", bucket),
		Body:          bytes.NewReader(body),
		ContentLength: &bodyLen,
		ContentType:   "application/x-www-form-urlencoded",
	}

	err = o.fs.pacer.Call(func() (bool, error) {
		resp, err = o.fs.front.CallJSON(ctx, &opts, nil, &result)
		return o.fs.shouldRetry(resp, err)
	})
	if err != nil {
		return err
	}

	if result.Success {
		o.modTime = t
		return nil
	}

	return errors.New(result.Error)
}
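
// For reference, the -patch form parameter built above serializes to JSON
// like the following (the timestamp is illustrative):
//
//	[{"op":"remove","path":"/rclone-mtime"},
//	 {"op":"add","path":"/rclone-mtime","value":"2006-01-02T15:04:05.999999999Z"}]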

// List files and directories in a directory
func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
	bucket, reqDir := f.split(dir)
	if bucket == "" {
		if reqDir != "" {
			return nil, fs.ErrorListBucketRequired
		}
		return entries, nil
	}
	grandparent := f.opt.Enc.ToStandardPath(strings.Trim(path.Join(bucket, reqDir), "/") + "/")

	allEntries, err := f.listAllUnconstrained(ctx, bucket)
	if err != nil {
		return entries, err
	}
	for _, ent := range allEntries {
		obj, ok := ent.(*Object)
		if ok && strings.HasPrefix(obj.remote, grandparent) {
			path := trimPathPrefix(obj.remote, grandparent, f.opt.Enc)
			if !strings.Contains(path, "/") {
				obj.remote = trimPathPrefix(obj.remote, f.root, f.opt.Enc)
				entries = append(entries, obj)
			}
		}
		dire, ok := ent.(*fs.Dir)
		if ok && strings.HasPrefix(dire.Remote(), grandparent) {
			path := trimPathPrefix(dire.Remote(), grandparent, f.opt.Enc)
			if !strings.Contains(path, "/") {
				dire.SetRemote(trimPathPrefix(dire.Remote(), f.root, f.opt.Enc))
				entries = append(entries, dire)
			}
		}
	}

	return entries, nil
}

// Mkdir is a no-op; directories on IA are virtual, much like in git repositories
func (f *Fs) Mkdir(ctx context.Context, dir string) (err error) {
	return nil
}

// Rmdir is a no-op as well, unless we're asked for recursive deletion
func (f *Fs) Rmdir(ctx context.Context, dir string) error {
	return nil
}
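
// Directory entries are synthesized from the item's flat file list (see
// listAllUnconstrained below); e.g. a file named "a/b/c.txt" implies the
// virtual directories "a" and "a/b".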

// NewObject finds the Object at remote. If it can't be found
// it returns the error fs.ErrorObjectNotFound.
func (f *Fs) NewObject(ctx context.Context, remote string) (ret fs.Object, err error) {
	bucket, filepath := f.split(remote)
	filepath = strings.Trim(filepath, "/")
	if bucket == "" {
		if filepath != "" {
			return nil, fs.ErrorListBucketRequired
		}
		return nil, fs.ErrorIsDir
	}

	grandparent := f.opt.Enc.ToStandardPath(strings.Trim(path.Join(bucket, filepath), "/"))

	allEntries, err := f.listAllUnconstrained(ctx, bucket)
	if err != nil {
		return nil, err
	}
	for _, ent := range allEntries {
		obj, ok := ent.(*Object)
		if ok && obj.remote == grandparent {
			obj.remote = trimPathPrefix(obj.remote, f.root, f.opt.Enc)
			return obj, nil
		}
	}

	return nil, fs.ErrorObjectNotFound
}

// Put uploads a file
func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
	o := &Object{
		fs:      f,
		remote:  src.Remote(),
		modTime: src.ModTime(ctx),
		size:    src.Size(),
	}

	err := o.Update(ctx, in, src, options...)
	if err == nil {
		return o, nil
	}

	return nil, err
}

// PublicLink generates a public link to the remote path (usually readable by anyone)
func (f *Fs) PublicLink(ctx context.Context, remote string, expire fs.Duration, unlink bool) (link string, err error) {
	if strings.HasSuffix(remote, "/") {
		return "", fs.ErrorCantShareDirectories
	}
	if _, err := f.NewObject(ctx, remote); err != nil {
		return "", err
	}
	bucket, bucketPath := f.split(remote)
	return path.Join(f.opt.FrontEndpoint, "/download/", bucket, quotePath(bucketPath)), nil
}
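
// The intended link shape is (front_endpoint)/download/:item/:path, i.e. the
// frontend download URL for the file; note that the expire and unlink
// parameters are ignored.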

// Copy src to this remote using server-side copy operations.
//
// This is stored with the remote path given.
//
// It returns the destination Object and a possible error.
//
// Will only be called if src.Fs().Name() == f.Name()
//
// If it isn't possible then return fs.ErrorCantCopy
func (f *Fs) Copy(ctx context.Context, src fs.Object, remote string) (_ fs.Object, err error) {
	dstBucket, dstPath := f.split(remote)
	srcObj, ok := src.(*Object)
	if !ok {
		fs.Debugf(src, "Can't copy - not same remote type")
		return nil, fs.ErrorCantCopy
	}
	srcBucket, srcPath := srcObj.split()

	if dstBucket == srcBucket && dstPath == srcPath {
		// https://github.com/jjjake/internetarchive/blob/2456376533251df9d05e0a14d796ec1ced4959f5/internetarchive/cli/ia_copy.py#L68
		fs.Debugf(src, "Can't copy - the source and destination files cannot be the same!")
		return nil, fs.ErrorCantCopy
	}

	updateTracker := random.String(32)
	headers := map[string]string{
		"x-archive-auto-make-bucket": "1",
		"x-archive-queue-derive":     "0",
		"x-archive-keep-old-version": "0",
		"x-amz-copy-source":          quotePath(path.Join("/", srcBucket, srcPath)),
		"x-amz-metadata-directive":   "COPY",
		"x-archive-filemeta-sha1":    srcObj.sha1,
		"x-archive-filemeta-md5":     srcObj.md5,
		"x-archive-filemeta-crc32":   srcObj.crc32,
		"x-archive-filemeta-size":    fmt.Sprint(srcObj.size),
		// add this too for sure
		"x-archive-filemeta-rclone-mtime":        srcObj.modTime.Format(time.RFC3339Nano),
		"x-archive-filemeta-rclone-update-track": updateTracker,
	}

	// make a PUT request at (IAS3)/:item/:path without a body
	var resp *http.Response
	opts := rest.Opts{
		Method:       "PUT",
		Path:         "/" + url.PathEscape(path.Join(dstBucket, dstPath)),
		ExtraHeaders: headers,
	}

	err = f.pacer.Call(func() (bool, error) {
		resp, err = f.srv.Call(ctx, &opts)
		return f.shouldRetry(resp, err)
	})
	if err != nil {
		return nil, err
	}

	// we can't update/find metadata here as IA queues the server-side copy
	// just like uploads/deletes.
	return f.waitFileUpload(ctx, trimPathPrefix(path.Join(dstBucket, dstPath), f.root, f.opt.Enc), updateTracker, srcObj.size)
}
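
// Because the copy is only queued server-side, completion is detected by
// polling the item metadata until the destination file carries the random
// rclone-update-track value stamped in the headers above (see waitFileUpload).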

// ListR lists the objects and directories of the Fs starting
// from dir recursively into out.
//
// dir should be "" to start from the root, and should not
// have trailing slashes.
//
// This should return ErrDirNotFound if the directory isn't
// found.
//
// It should call callback for each tranche of entries read.
// These need not be returned in any particular order. If
// callback returns an error then the listing will stop
// immediately.
//
// Don't implement this unless you have a more efficient way
// of listing recursively than doing a directory traversal.
func (f *Fs) ListR(ctx context.Context, dir string, callback fs.ListRCallback) (err error) {
	var allEntries, entries fs.DirEntries
	bucket, reqDir := f.split(dir)
	if bucket == "" {
		if reqDir != "" {
			return fs.ErrorListBucketRequired
		}
		return callback(entries)
	}
	grandparent := f.opt.Enc.ToStandardPath(strings.Trim(path.Join(bucket, reqDir), "/") + "/")

	allEntries, err = f.listAllUnconstrained(ctx, bucket)
	if err != nil {
		return err
	}
	for _, ent := range allEntries {
		obj, ok := ent.(*Object)
		if ok && strings.HasPrefix(obj.remote, grandparent) {
			obj.remote = trimPathPrefix(obj.remote, f.root, f.opt.Enc)
			entries = append(entries, obj)
		}
		dire, ok := ent.(*fs.Dir)
		if ok && strings.HasPrefix(dire.Remote(), grandparent) {
			dire.SetRemote(trimPathPrefix(dire.Remote(), f.root, f.opt.Enc))
			entries = append(entries, dire)
		}
	}

	return callback(entries)
}

// CleanUp removes all files inside history/
func (f *Fs) CleanUp(ctx context.Context) (err error) {
	bucket, _ := f.split("/")
	if bucket == "" {
		return fs.ErrorListBucketRequired
	}
	entries, err := f.listAllUnconstrained(ctx, bucket)
	if err != nil {
		return err
	}

	for _, ent := range entries {
		obj, ok := ent.(*Object)
		if ok && strings.HasPrefix(obj.remote, bucket+"/history/") {
			err = obj.Remove(ctx)
			if err != nil {
				return err
			}
		}
		// we can fully ignore directories, as they're just virtual entries to
		// comply with rclone's requirement
	}

	return nil
}

// About returns used, free, and trashed space on the remote
func (f *Fs) About(ctx context.Context) (_ *fs.Usage, err error) {
	bucket, _ := f.split("/")
	if bucket == "" {
		return nil, fs.ErrorListBucketRequired
	}

	result, err := f.requestMetadata(ctx, bucket)
	if err != nil {
		return nil, err
	}

	// perform the low-level operation here since it's ridiculous to make 2 identical requests
	var historySize int64
	for _, ent := range result.Files {
		if strings.HasPrefix(ent.Name, "history/") {
			size := parseSize(ent.Size)
			if size < 0 {
				// a parse error can be ignored since it's not fatal
				continue
			}
			historySize += size
		}
	}

	usage := &fs.Usage{
		Total:   fs.NewUsageValue(iaItemMaxSize),
		Free:    fs.NewUsageValue(iaItemMaxSize - result.ItemSize),
		Used:    fs.NewUsageValue(result.ItemSize),
		Trashed: fs.NewUsageValue(historySize), // bytes in trash
	}
	return usage, nil
}

// Open an object for read
func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) {
	var optionsFixed []fs.OpenOption
	for _, opt := range options {
		if optRange, ok := opt.(*fs.RangeOption); ok {
			// Ignore the range option if the file is empty
			if o.Size() == 0 && optRange.Start == 0 && optRange.End > 0 {
				continue
			}
		}
		optionsFixed = append(optionsFixed, opt)
	}

	var resp *http.Response
	// make a GET request to (frontend)/download/:item/:path
	opts := rest.Opts{
		Method:  "GET",
		Path:    path.Join("/download/", o.fs.root, quotePath(o.fs.opt.Enc.FromStandardPath(o.remote))),
		Options: optionsFixed,
	}
	err = o.fs.pacer.Call(func() (bool, error) {
		resp, err = o.fs.front.Call(ctx, &opts)
		return o.fs.shouldRetry(resp, err)
	})
	if err != nil {
		return nil, err
	}
	return resp.Body, nil
}

// Update the Object from in with modTime and size
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (err error) {
	bucket, bucketPath := o.split()
	modTime := src.ModTime(ctx)
	size := src.Size()
	updateTracker := random.String(32)

	// Set the mtime in the metadata.
	// The internetarchive backend does this at the header level, as IAS3 has extensions outside X-Amz-
	headers := map[string]string{
		// https://github.com/jjjake/internetarchive/blob/2456376533251df9d05e0a14d796ec1ced4959f5/internetarchive/iarequest.py#L158
		"x-amz-filemeta-rclone-mtime":        modTime.Format(time.RFC3339Nano),
		"x-amz-filemeta-rclone-update-track": updateTracker,

		// we add some more headers for intuitive actions
		"x-amz-auto-make-bucket":     "1",    // create the item if it does not exist, do nothing if it already does
		"x-archive-auto-make-bucket": "1",    // same as above, in the IAS3 original way
		"x-archive-keep-old-version": "0",    // do not keep old versions (a.k.a. trashes in other clouds)
		"x-archive-meta-mediatype":   "data", // mark the media type of the uploaded file as "data"
		"x-archive-queue-derive":     "0",    // skip the derivation process (e.g. encoding to smaller files, OCR on PDFs)
		"x-archive-cascade-delete":   "1",    // enable "cascade delete" (delete all derived files in addition to the file itself)
	}
	if size >= 0 {
		headers["Content-Length"] = fmt.Sprintf("%d", size)
		headers["x-archive-size-hint"] = fmt.Sprintf("%d", size)
	}
	var mdata fs.Metadata
	mdata, err = fs.GetMetadataOptions(ctx, o.fs, src, options)
	if err == nil && mdata != nil {
		for mk, mv := range mdata {
			mk = strings.ToLower(mk)
			if strings.HasPrefix(mk, "rclone-") {
				fs.LogPrintf(fs.LogLevelWarning, o, "reserved metadata key %s is about to be set", mk)
			} else if _, ok := roMetadataKey[mk]; ok {
				fs.LogPrintf(fs.LogLevelWarning, o, "setting or modifying read-only key %s is requested, skipping", mk)
				continue
			} else if mk == "mtime" {
				// redirect to make it work
				mk = "rclone-mtime"
			}
			headers[fmt.Sprintf("x-amz-filemeta-%s", mk)] = mv
		}
	}

	// read the md5sum if available
	var md5sumHex string
	if !o.fs.opt.DisableChecksum {
		md5sumHex, err = src.Hash(ctx, hash.MD5)
		if err == nil && matchMd5.MatchString(md5sumHex) {
			// Set the md5sum in the header on the object if
			// the user wants it
			// https://github.com/jjjake/internetarchive/blob/245637653/internetarchive/item.py#L969
			headers["Content-MD5"] = md5sumHex
		}
	}

	// make a PUT request at (IAS3)/encoded(:item/:path)
	var resp *http.Response
	opts := rest.Opts{
		Method:        "PUT",
		Path:          "/" + url.PathEscape(path.Join(bucket, bucketPath)),
		Body:          in,
		ContentLength: &size,
		ExtraHeaders:  headers,
	}

	err = o.fs.pacer.Call(func() (bool, error) {
		resp, err = o.fs.srv.Call(ctx, &opts)
		return o.fs.shouldRetry(resp, err)
	})

	// we can't update/find metadata here as IA will "ingest" uploaded file(s)
	// after the upload (you can find its progress at https://archive.org/history/ItemNameHere),
	// so we have to wait for it to finish, which needs polling (frontend)/metadata/:item
	// or scraping (frontend)/history/:item
	var newObj *Object
	if err == nil {
		newObj, err = o.fs.waitFileUpload(ctx, o.remote, updateTracker, size)
	} else {
		newObj = &Object{}
	}
	o.crc32 = newObj.crc32
	o.md5 = newObj.md5
	o.sha1 = newObj.sha1
	o.modTime = newObj.modTime
	o.size = newObj.size
	return err
}
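
// As an illustration of the metadata loop above: user metadata such as
// {"title": "foo"} (e.g. supplied via rclone's --metadata-set) is sent as
// the header "x-amz-filemeta-title: foo"; "title" here is an arbitrary
// example key.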

// Remove an object
func (o *Object) Remove(ctx context.Context) (err error) {
	bucket, bucketPath := o.split()

	// make a DELETE request at (IAS3)/:item/:path
	var resp *http.Response
	opts := rest.Opts{
		Method: "DELETE",
		Path:   "/" + url.PathEscape(path.Join(bucket, bucketPath)),
	}

	err = o.fs.pacer.Call(func() (bool, error) {
		resp, err = o.fs.srv.Call(ctx, &opts)
		return o.fs.shouldRetry(resp, err)
	})

	// deleting files can take a bit longer as
	// it'll be processed on the same queue as uploads
	if err == nil {
		err = o.fs.waitDelete(ctx, bucket, bucketPath)
	}
	return err
}

// String converts this Object to a string
func (o *Object) String() string {
	if o == nil {
		return "<nil>"
	}
	return o.remote
}

// Metadata returns all file metadata provided by Internet Archive
func (o *Object) Metadata(ctx context.Context) (m fs.Metadata, err error) {
	if o.rawData == nil {
		return nil, nil
	}
	raw := make(map[string]json.RawMessage)
	err = json.Unmarshal(o.rawData, &raw)
	if err != nil {
		// fatal: json parsing failed
		return
	}
	for k, v := range raw {
		items, err := listOrString(v)
		if len(items) == 0 || err != nil {
			// skip: an entry failed to parse
			continue
		}
		m.Set(k, items[0])
	}
	// move the old mtime to another key
	if v, ok := m["mtime"]; ok {
		m["rclone-ia-mtime"] = v
	}
	// overwrite with a correct mtime
	m["mtime"] = o.modTime.Format(time.RFC3339Nano)
	return
}

func (f *Fs) shouldRetry(resp *http.Response, err error) (bool, error) {
	if resp != nil {
		for _, e := range retryErrorCodes {
			if resp.StatusCode == e {
				return true, err
			}
		}
	}
	// Ok, not an awserr, check for generic failure conditions
	return fserrors.ShouldRetry(err), err
}

var matchMd5 = regexp.MustCompile(`^[0-9a-f]{32}$`)

// split returns bucket and bucketPath from the rootRelativePath
// relative to f.root
func (f *Fs) split(rootRelativePath string) (bucketName, bucketPath string) {
	bucketName, bucketPath = bucket.Split(path.Join(f.root, rootRelativePath))
	return f.opt.Enc.FromStandardName(bucketName), f.opt.Enc.FromStandardPath(bucketPath)
}

// split returns bucket and bucketPath from the object
func (o *Object) split() (bucket, bucketPath string) {
	return o.fs.split(o.remote)
}
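
// For instance, with f.root = "item1/dir", f.split("file.txt") returns
// ("item1", "dir/file.txt"): the first path segment is the item (bucket),
// the rest is the path within it (illustrative names, default encoding).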

func (f *Fs) requestMetadata(ctx context.Context, bucket string) (result *MetadataResponse, err error) {
	var resp *http.Response
	// make a GET request to (frontend)/metadata/:item/
	opts := rest.Opts{
		Method: "GET",
		Path:   path.Join("/metadata/", bucket),
	}

	var temp MetadataResponseRaw
	err = f.pacer.Call(func() (bool, error) {
		resp, err = f.front.CallJSON(ctx, &opts, nil, &temp)
		return f.shouldRetry(resp, err)
	})
	if err != nil {
		return
	}
	return temp.unraw()
}

// list all files/directories in the item without any filters
func (f *Fs) listAllUnconstrained(ctx context.Context, bucket string) (entries fs.DirEntries, err error) {
	result, err := f.requestMetadata(ctx, bucket)
	if err != nil {
		return nil, err
	}

	knownDirs := map[string]time.Time{
		"": time.Unix(0, 0),
	}
	for _, file := range result.Files {
		dir := strings.Trim(betterPathDir(file.Name), "/")
		nameWithBucket := path.Join(bucket, file.Name)

		mtimeTime := file.parseMtime()

		// populate children directories
		child := dir
		for {
			if _, ok := knownDirs[child]; ok {
				break
			}
			// directory
			d := fs.NewDir(f.opt.Enc.ToStandardPath(path.Join(bucket, child)), mtimeTime)
			entries = append(entries, d)

			knownDirs[child] = mtimeTime
			child = strings.Trim(betterPathDir(child), "/")
		}
		if _, ok := knownDirs[betterPathDir(file.Name)]; !ok {
			continue
		}

		size := parseSize(file.Size)

		o := makeValidObject(f, f.opt.Enc.ToStandardPath(nameWithBucket), file, mtimeTime, size)
		entries = append(entries, o)
	}

	return entries, nil
}

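// waitFileUpload polls (frontend)/metadata/:item roughly every 10 seconds
// until the file at reqPath carries the given rclone-update-track value
// (and, when both sizes are known, the expected size), or until wait_archive
// expires. A rough sketch of the handshake:
//
//	PUT (IAS3)/:item/:path with x-amz-filemeta-rclone-update-track: <random>
//	loop: GET (frontend)/metadata/:item
//	      done once files[].rclone-update-track contains <random>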
func (f *Fs) waitFileUpload(ctx context.Context, reqPath, tracker string, newSize int64) (ret *Object, err error) {
	bucket, bucketPath := f.split(reqPath)

	ret = &Object{
		fs:      f,
		remote:  trimPathPrefix(path.Join(bucket, bucketPath), f.root, f.opt.Enc),
		modTime: time.Unix(0, 0),
		size:    -1,
	}

	if f.opt.WaitArchive == 0 {
		// user doesn't want to poll, let's not
		ret2, err := f.NewObject(ctx, reqPath)
		if err == nil {
			ret2, ok := ret2.(*Object)
			if ok {
				ret = ret2
				ret.crc32 = ""
				ret.md5 = ""
				ret.sha1 = ""
				ret.size = -1
			}
		}
		return ret, nil
	}

	retC := make(chan struct {
		*Object
		error
	}, 1)
	go func() {
		isFirstTime := true
		existed := false
		for {
			if !isFirstTime {
				// depending on the queue, it takes time
				time.Sleep(10 * time.Second)
			}
			metadata, err := f.requestMetadata(ctx, bucket)
			if err != nil {
				retC <- struct {
					*Object
					error
				}{ret, err}
				return
			}

			var iaFile *IAFile
			for _, f := range metadata.Files {
				if f.Name == bucketPath {
					iaFile = &f
					break
				}
			}
			if isFirstTime {
				isFirstTime = false
				existed = iaFile != nil
			}
			if iaFile == nil {
				continue
			}
			if !existed && !isFirstTime {
				// fast path: the file didn't exist before
				retC <- struct {
					*Object
					error
				}{makeValidObject2(f, *iaFile, bucket), nil}
				return
			}

			fileTrackers, _ := listOrString(iaFile.UpdateTrack)
			trackerMatch := false
			for _, v := range fileTrackers {
				if v == tracker {
					trackerMatch = true
					break
				}
			}
			if !trackerMatch {
				continue
			}
			if !compareSize(parseSize(iaFile.Size), newSize) {
				continue
			}

			// voila!
			retC <- struct {
				*Object
				error
			}{makeValidObject2(f, *iaFile, bucket), nil}
			return
		}
	}()

	select {
	case res := <-retC:
		return res.Object, res.error
	case <-time.After(time.Duration(f.opt.WaitArchive)):
		return ret, nil
	}
}

func (f *Fs) waitDelete(ctx context.Context, bucket, bucketPath string) (err error) {
	if f.opt.WaitArchive == 0 {
		// user doesn't want to poll, let's not
		return nil
	}

	retC := make(chan error, 1)
	go func() {
		for {
			metadata, err := f.requestMetadata(ctx, bucket)
			if err != nil {
				retC <- err
				return
			}

			found := false
			for _, f := range metadata.Files {
				if f.Name == bucketPath {
					found = true
					break
				}
			}

			if !found {
				retC <- nil
				return
			}

			// depending on the queue, it takes time
			time.Sleep(10 * time.Second)
		}
	}()

	select {
	case res := <-retC:
		return res
	case <-time.After(time.Duration(f.opt.WaitArchive)):
		return nil
	}
}

func makeValidObject(f *Fs, remote string, file IAFile, mtime time.Time, size int64) *Object {
	ret := &Object{
		fs:      f,
		remote:  remote,
		modTime: mtime,
		size:    size,
		rawData: file.rawData,
	}
	// hashes from _files.xml (where summation != "") are different from those of other files
	// https://forum.rclone.org/t/internet-archive-md5-tag-in-id-files-xml-interpreted-incorrectly/31922
	if file.Summation == "" {
		ret.md5 = file.Md5
		ret.crc32 = file.Crc32
		ret.sha1 = file.Sha1
	}
	return ret
}

func makeValidObject2(f *Fs, file IAFile, bucket string) *Object {
	mtimeTime := file.parseMtime()

	size := parseSize(file.Size)

	return makeValidObject(f, trimPathPrefix(path.Join(bucket, file.Name), f.root, f.opt.Enc), file, mtimeTime, size)
}

func listOrString(jm json.RawMessage) (rmArray []string, err error) {
	// rclone-metadata can be an array or a string
	// try to deserialize it as an array first
	err = json.Unmarshal(jm, &rmArray)
	if err != nil {
		// if not, it's a string
		dst := new(string)
		err = json.Unmarshal(jm, dst)
		if err == nil {
			rmArray = []string{*dst}
		}
	}
	return
}

func (file IAFile) parseMtime() (mtime time.Time) {
	// method 1: use metadata added by rclone
	rmArray, err := listOrString(file.RcloneMtime)
	// let's take the first value we can deserialize
	for _, value := range rmArray {
		mtime, err = time.Parse(time.RFC3339Nano, value)
		if err == nil {
			break
		}
	}
	if err != nil {
		// method 2: use metadata added by IAS3
		mtime, err = swift.FloatStringToTime(file.Mtime)
	}
	if err != nil {
		// metadata files don't have some of the fields
		mtime = time.Unix(0, 0)
	}
	return mtime
}

func (mrr *MetadataResponseRaw) unraw() (_ *MetadataResponse, err error) {
	var files []IAFile
	for _, raw := range mrr.Files {
		var parsed IAFile
		err = json.Unmarshal(raw, &parsed)
		if err != nil {
			return nil, err
		}
		parsed.rawData = raw
		files = append(files, parsed)
	}
	return &MetadataResponse{
		Files:    files,
		ItemSize: mrr.ItemSize,
	}, nil
}

func compareSize(a, b int64) bool {
	if a < 0 || b < 0 {
		// we won't compare if either of them is not known
		return true
	}
	return a == b
}

func parseSize(str string) int64 {
	size, err := strconv.ParseInt(str, 10, 64)
	if err != nil {
		size = -1
	}
	return size
}

func betterPathDir(p string) string {
	d := path.Dir(p)
	if d == "." {
		return ""
	}
	return d
}

func betterPathClean(p string) string {
	d := path.Clean(p)
	if d == "." {
		return ""
	}
	return d
}

func trimPathPrefix(s, prefix string, enc encoder.MultiEncoder) string {
	// we need to clean the paths to make tests pass!
	s = betterPathClean(s)
	prefix = betterPathClean(prefix)
	if s == prefix || s == prefix+"/" {
		return ""
	}
	prefix = enc.ToStandardPath(strings.TrimRight(prefix, "/"))
	return enc.ToStandardPath(strings.TrimPrefix(s, prefix+"/"))
}

// mimics urllib.parse.quote() in Python; excludes / from url.PathEscape
func quotePath(s string) string {
	seg := strings.Split(s, "/")
	newValues := []string{}
	for _, v := range seg {
		newValues = append(newValues, url.PathEscape(v))
	}
	return strings.Join(newValues, "/")
}
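
// For example, quotePath("dir/file with spaces.txt") yields
// "dir/file%20with%20spaces.txt", whereas url.PathEscape alone would also
// escape the slash; and trimPathPrefix("item/dir/file", "item", enc) yields
// "dir/file" (illustrative inputs).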

var (
	_ fs.Fs           = &Fs{}
	_ fs.Copier       = &Fs{}
	_ fs.ListRer      = &Fs{}
	_ fs.CleanUpper   = &Fs{}
	_ fs.PublicLinker = &Fs{}
	_ fs.Abouter      = &Fs{}
	_ fs.Object       = &Object{}
	_ fs.Metadataer   = &Object{}
)