github.com/rclone/rclone@v1.66.1-0.20240517100346-7b89735ae726/fs/operations/dedupe.go (about) 1 // dedupe - gets rid of identical files remotes which can have duplicate file names (drive, mega) 2 3 package operations 4 5 import ( 6 "context" 7 "fmt" 8 "log" 9 "path" 10 "sort" 11 "strings" 12 13 "github.com/rclone/rclone/fs" 14 "github.com/rclone/rclone/fs/accounting" 15 "github.com/rclone/rclone/fs/config" 16 "github.com/rclone/rclone/fs/hash" 17 "github.com/rclone/rclone/fs/walk" 18 ) 19 20 // dedupeRename renames the objs slice to different names 21 func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) { 22 doMove := f.Features().Move 23 if doMove == nil { 24 log.Fatalf("Fs %v doesn't support Move", f) 25 } 26 ext := path.Ext(remote) 27 base := remote[:len(remote)-len(ext)] 28 29 outer: 30 for i, o := range objs { 31 suffix := 1 32 newName := fmt.Sprintf("%s-%d%s", base, i+suffix, ext) 33 _, err := f.NewObject(ctx, newName) 34 for ; err != fs.ErrorObjectNotFound; suffix++ { 35 if err != nil { 36 err = fs.CountError(err) 37 fs.Errorf(o, "Failed to check for existing object: %v", err) 38 continue outer 39 } 40 if suffix > 100 { 41 fs.Errorf(o, "Could not find an available new name") 42 continue outer 43 } 44 newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext) 45 _, err = f.NewObject(ctx, newName) 46 } 47 if !SkipDestructive(ctx, o, "rename") { 48 newObj, err := doMove(ctx, o, newName) 49 if err != nil { 50 err = fs.CountError(err) 51 fs.Errorf(o, "Failed to rename: %v", err) 52 continue 53 } 54 fs.Infof(newObj, "renamed from: %v", o) 55 } 56 } 57 } 58 59 // dedupeDeleteAllButOne deletes all but the one in keep 60 func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) { 61 count := 0 62 for i, o := range objs { 63 if i == keep { 64 continue 65 } 66 err := DeleteFile(ctx, o) 67 if err == nil { 68 count++ 69 } 70 } 71 if count > 0 { 72 fs.Logf(remote, "Deleted %d extra copies", count) 73 } 74 } 75 76 // dedupeDeleteIdentical deletes all but one of identical (by hash) copies 77 func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) { 78 ci := fs.GetConfig(ctx) 79 80 // Make map of IDs 81 IDs := make(map[string]int, len(objs)) 82 for _, o := range objs { 83 if do, ok := o.(fs.IDer); ok { 84 if ID := do.ID(); ID != "" { 85 IDs[ID]++ 86 } 87 } 88 } 89 90 // Remove duplicate IDs 91 newObjs := objs[:0] 92 for _, o := range objs { 93 if do, ok := o.(fs.IDer); ok { 94 if ID := do.ID(); ID != "" { 95 if IDs[ID] <= 1 { 96 newObjs = append(newObjs, o) 97 } else { 98 fs.Logf(o, "Ignoring as it appears %d times in the listing and deleting would lead to data loss", IDs[ID]) 99 } 100 } 101 } 102 } 103 objs = newObjs 104 105 // See how many of these duplicates are identical 106 dupesByID := make(map[string][]fs.Object, len(objs)) 107 for _, o := range objs { 108 ID := "" 109 if ci.SizeOnly && o.Size() >= 0 { 110 ID = fmt.Sprintf("size %d", o.Size()) 111 } else if ht != hash.None { 112 hashValue, err := o.Hash(ctx, ht) 113 if err == nil && hashValue != "" { 114 ID = fmt.Sprintf("%v %s", ht, hashValue) 115 } 116 } 117 if ID == "" { 118 remainingObjs = append(remainingObjs, o) 119 } else { 120 dupesByID[ID] = append(dupesByID[ID], o) 121 } 122 } 123 124 // Delete identical duplicates, filling remainingObjs with the ones remaining 125 for ID, dupes := range dupesByID { 126 remainingObjs = append(remainingObjs, dupes[0]) 127 if len(dupes) > 1 { 128 fs.Logf(remote, "Deleting %d/%d identical duplicates (%s)", len(dupes)-1, len(dupes), ID) 129 for _, o := range dupes[1:] { 130 err := DeleteFile(ctx, o) 131 if err != nil { 132 remainingObjs = append(remainingObjs, o) 133 } 134 } 135 } 136 } 137 138 return remainingObjs 139 } 140 141 // dedupeList lists the duplicates and does nothing 142 func dedupeList(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) { 143 fmt.Printf("%s: %d duplicates\n", remote, len(objs)) 144 for i, o := range objs { 145 hashValue := "" 146 if ht != hash.None { 147 var err error 148 hashValue, err = o.Hash(ctx, ht) 149 if err != nil { 150 hashValue = err.Error() 151 } 152 } 153 if byHash { 154 fmt.Printf(" %d: %12d bytes, %s, %s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), o.Remote()) 155 } else { 156 fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue) 157 } 158 } 159 } 160 161 // dedupeInteractive interactively dedupes the slice of objects 162 func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) bool { 163 dedupeList(ctx, f, ht, remote, objs, byHash) 164 commands := []string{"sSkip and do nothing", "kKeep just one (choose which in next step)"} 165 if !byHash { 166 commands = append(commands, "rRename all to be different (by changing file.jpg to file-1.jpg)") 167 } 168 commands = append(commands, "qQuit") 169 switch config.Command(commands) { 170 case 's': 171 case 'k': 172 keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs)) 173 dedupeDeleteAllButOne(ctx, keep-1, remote, objs) 174 case 'r': 175 dedupeRename(ctx, f, remote, objs) 176 case 'q': 177 return false 178 } 179 return true 180 } 181 182 // DeduplicateMode is how the dedupe command chooses what to do 183 type DeduplicateMode int 184 185 // Deduplicate modes 186 const ( 187 DeduplicateInteractive DeduplicateMode = iota // interactively ask the user 188 DeduplicateSkip // skip all conflicts 189 DeduplicateFirst // choose the first object 190 DeduplicateNewest // choose the newest object 191 DeduplicateOldest // choose the oldest object 192 DeduplicateRename // rename the objects 193 DeduplicateLargest // choose the largest object 194 DeduplicateSmallest // choose the smallest object 195 DeduplicateList // list duplicates only 196 ) 197 198 func (x DeduplicateMode) String() string { 199 switch x { 200 case DeduplicateInteractive: 201 return "interactive" 202 case DeduplicateSkip: 203 return "skip" 204 case DeduplicateFirst: 205 return "first" 206 case DeduplicateNewest: 207 return "newest" 208 case DeduplicateOldest: 209 return "oldest" 210 case DeduplicateRename: 211 return "rename" 212 case DeduplicateLargest: 213 return "largest" 214 case DeduplicateSmallest: 215 return "smallest" 216 case DeduplicateList: 217 return "list" 218 } 219 return "unknown" 220 } 221 222 // Set a DeduplicateMode from a string 223 func (x *DeduplicateMode) Set(s string) error { 224 switch strings.ToLower(s) { 225 case "interactive": 226 *x = DeduplicateInteractive 227 case "skip": 228 *x = DeduplicateSkip 229 case "first": 230 *x = DeduplicateFirst 231 case "newest": 232 *x = DeduplicateNewest 233 case "oldest": 234 *x = DeduplicateOldest 235 case "rename": 236 *x = DeduplicateRename 237 case "largest": 238 *x = DeduplicateLargest 239 case "smallest": 240 *x = DeduplicateSmallest 241 case "list": 242 *x = DeduplicateList 243 default: 244 return fmt.Errorf("unknown mode for dedupe %q", s) 245 } 246 return nil 247 } 248 249 // Type of the value 250 func (x *DeduplicateMode) Type() string { 251 return "string" 252 } 253 254 // Directory with entry count and links to parents 255 type dedupeDir struct { 256 dir fs.Directory 257 parent string 258 count int 259 } 260 261 // Map of directories by ID with recursive counts 262 type dedupeDirsMap map[string]*dedupeDir 263 264 func (dm dedupeDirsMap) get(id string) *dedupeDir { 265 d := dm[id] 266 if d == nil { 267 d = &dedupeDir{} 268 dm[id] = d 269 } 270 return d 271 } 272 273 func (dm dedupeDirsMap) increment(parent string) { 274 if parent != "" { 275 d := dm.get(parent) 276 d.count++ 277 dm.increment(d.parent) 278 } 279 } 280 281 // dedupeFindDuplicateDirs scans f for duplicate directories 282 func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) (duplicateDirs [][]*dedupeDir, err error) { 283 dirsByID := dedupeDirsMap{} 284 dirs := map[string][]*dedupeDir{} 285 286 ci := fs.GetConfig(ctx) 287 err = walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListAll, func(entries fs.DirEntries) error { 288 for _, entry := range entries { 289 tr := accounting.Stats(ctx).NewCheckingTransfer(entry, "merging") 290 291 remote := entry.Remote() 292 parentRemote := path.Dir(remote) 293 if parentRemote == "." { 294 parentRemote = "" 295 } 296 297 // Obtain ID of the object parent, if known. 298 // (This usually means that backend allows duplicate paths) 299 // Fall back to remote parent path, if unavailable. 300 var parent string 301 if entryParentIDer, ok := entry.(fs.ParentIDer); ok { 302 parent = entryParentIDer.ParentID() 303 } 304 if parent == "" { 305 parent = parentRemote 306 } 307 308 var ID string 309 if entryIDer, ok := entry.(fs.IDer); ok { 310 ID = entryIDer.ID() 311 } 312 if ID == "" { 313 ID = remote 314 } 315 316 if fsDir, ok := entry.(fs.Directory); ok { 317 d := dirsByID.get(ID) 318 d.dir = fsDir 319 d.parent = parent 320 dirs[remote] = append(dirs[remote], d) 321 } 322 323 dirsByID.increment(parent) 324 tr.Done(ctx, nil) 325 } 326 return nil 327 }) 328 if err != nil { 329 return nil, fmt.Errorf("find duplicate dirs: %w", err) 330 } 331 332 // Make sure parents are before children 333 duplicateNames := []string{} 334 for name, ds := range dirs { 335 if len(ds) > 1 { 336 duplicateNames = append(duplicateNames, name) 337 } 338 } 339 sort.Strings(duplicateNames) 340 for _, name := range duplicateNames { 341 duplicateDirs = append(duplicateDirs, dirs[name]) 342 } 343 344 return 345 } 346 347 // dedupeMergeDuplicateDirs merges all the duplicate directories found 348 func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]*dedupeDir) error { 349 mergeDirs := f.Features().MergeDirs 350 if mergeDirs == nil { 351 return fmt.Errorf("%v: can't merge directories", f) 352 } 353 dirCacheFlush := f.Features().DirCacheFlush 354 if dirCacheFlush == nil { 355 return fmt.Errorf("%v: can't flush dir cache", f) 356 } 357 for _, dedupeDirs := range duplicateDirs { 358 if SkipDestructive(ctx, dedupeDirs[0].dir, "merge duplicate directories") { 359 continue 360 } 361 362 // Put largest directory in front to minimize movements 363 fsDirs := []fs.Directory{} 364 largestCount := -1 365 largestIdx := 0 366 for i, d := range dedupeDirs { 367 fsDirs = append(fsDirs, d.dir) 368 if d.count > largestCount { 369 largestIdx = i 370 largestCount = d.count 371 } 372 } 373 fsDirs[largestIdx], fsDirs[0] = fsDirs[0], fsDirs[largestIdx] 374 375 fs.Infof(fsDirs[0], "Merging contents of duplicate directories") 376 err := mergeDirs(ctx, fsDirs) 377 if err != nil { 378 err = fs.CountError(err) 379 fs.Errorf(nil, "merge duplicate dirs: %v", err) 380 } 381 } 382 dirCacheFlush() 383 return nil 384 } 385 386 // sort oldest first 387 func sortOldestFirst(objs []fs.Object) { 388 sort.Slice(objs, func(i, j int) bool { 389 return objs[i].ModTime(context.TODO()).Before(objs[j].ModTime(context.TODO())) 390 }) 391 } 392 393 // sort smallest first 394 func sortSmallestFirst(objs []fs.Object) { 395 sort.Slice(objs, func(i, j int) bool { 396 return objs[i].Size() < objs[j].Size() 397 }) 398 } 399 400 // Deduplicate interactively finds duplicate files and offers to 401 // delete all but one or rename them to be different. Only useful with 402 // Google Drive which can have duplicate file names. 403 func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error { 404 ci := fs.GetConfig(ctx) 405 // find a hash to use 406 ht := f.Hashes().GetOne() 407 what := "names" 408 if byHash { 409 if ht == hash.None { 410 return fmt.Errorf("%v has no hashes", f) 411 } 412 what = ht.String() + " hashes" 413 } 414 fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode) 415 416 // Find duplicate directories first and fix them 417 if !byHash { 418 duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f) 419 if err != nil { 420 return err 421 } 422 if len(duplicateDirs) > 0 { 423 if mode != DeduplicateList { 424 err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs) 425 if err != nil { 426 return err 427 } 428 } else { 429 for _, dedupeDirs := range duplicateDirs { 430 remote := dedupeDirs[0].dir.Remote() 431 fmt.Printf("%s: %d duplicates of this directory\n", remote, len(dedupeDirs)) 432 } 433 } 434 } 435 } 436 437 // Now find duplicate files 438 files := map[string][]fs.Object{} 439 err := walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error { 440 entries.ForObject(func(o fs.Object) { 441 tr := accounting.Stats(ctx).NewCheckingTransfer(o, "checking") 442 defer tr.Done(ctx, nil) 443 444 var remote string 445 var err error 446 if byHash { 447 remote, err = o.Hash(ctx, ht) 448 if err != nil { 449 fs.Errorf(o, "Failed to hash: %v", err) 450 remote = "" 451 } 452 } else { 453 remote = o.Remote() 454 } 455 if remote != "" { 456 files[remote] = append(files[remote], o) 457 } 458 }) 459 return nil 460 }) 461 if err != nil { 462 return err 463 } 464 465 for remote, objs := range files { 466 if len(objs) <= 1 { 467 continue 468 } 469 fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what) 470 if !byHash && mode != DeduplicateList { 471 objs = dedupeDeleteIdentical(ctx, ht, remote, objs) 472 if len(objs) <= 1 { 473 fs.Logf(remote, "All duplicates removed") 474 continue 475 } 476 } 477 switch mode { 478 case DeduplicateInteractive: 479 if !dedupeInteractive(ctx, f, ht, remote, objs, byHash) { 480 return nil 481 } 482 case DeduplicateFirst: 483 dedupeDeleteAllButOne(ctx, 0, remote, objs) 484 case DeduplicateNewest: 485 sortOldestFirst(objs) 486 dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs) 487 case DeduplicateOldest: 488 sortOldestFirst(objs) 489 dedupeDeleteAllButOne(ctx, 0, remote, objs) 490 case DeduplicateRename: 491 dedupeRename(ctx, f, remote, objs) 492 case DeduplicateLargest: 493 sortSmallestFirst(objs) 494 dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs) 495 case DeduplicateSmallest: 496 sortSmallestFirst(objs) 497 dedupeDeleteAllButOne(ctx, 0, remote, objs) 498 case DeduplicateSkip: 499 fs.Logf(remote, "Skipping %d files with duplicate %s", len(objs), what) 500 case DeduplicateList: 501 dedupeList(ctx, f, ht, remote, objs, byHash) 502 default: 503 //skip 504 } 505 } 506 return nil 507 }