github.com/10XDev/rclone@v1.52.3-0.20200626220027-16af9ab76b2a/fs/operations/dedupe.go (about) 1 // dedupe - gets rid of identical files remotes which can have duplicate file names (drive, mega) 2 3 package operations 4 5 import ( 6 "context" 7 "fmt" 8 "log" 9 "path" 10 "sort" 11 "strings" 12 13 "github.com/pkg/errors" 14 "github.com/rclone/rclone/fs" 15 "github.com/rclone/rclone/fs/config" 16 "github.com/rclone/rclone/fs/hash" 17 "github.com/rclone/rclone/fs/walk" 18 ) 19 20 // dedupeRename renames the objs slice to different names 21 func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) { 22 doMove := f.Features().Move 23 if doMove == nil { 24 log.Fatalf("Fs %v doesn't support Move", f) 25 } 26 ext := path.Ext(remote) 27 base := remote[:len(remote)-len(ext)] 28 29 outer: 30 for i, o := range objs { 31 suffix := 1 32 newName := fmt.Sprintf("%s-%d%s", base, i+suffix, ext) 33 _, err := f.NewObject(ctx, newName) 34 for ; err != fs.ErrorObjectNotFound; suffix++ { 35 if err != nil { 36 err = fs.CountError(err) 37 fs.Errorf(o, "Failed to check for existing object: %v", err) 38 continue outer 39 } 40 if suffix > 100 { 41 fs.Errorf(o, "Could not find an available new name") 42 continue outer 43 } 44 newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext) 45 _, err = f.NewObject(ctx, newName) 46 } 47 if !fs.Config.DryRun { 48 newObj, err := doMove(ctx, o, newName) 49 if err != nil { 50 err = fs.CountError(err) 51 fs.Errorf(o, "Failed to rename: %v", err) 52 continue 53 } 54 fs.Infof(newObj, "renamed from: %v", o) 55 } else { 56 fs.Logf(remote, "Not renaming to %q as --dry-run", newName) 57 } 58 } 59 } 60 61 // dedupeDeleteAllButOne deletes all but the one in keep 62 func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) { 63 count := 0 64 for i, o := range objs { 65 if i == keep { 66 continue 67 } 68 err := DeleteFile(ctx, o) 69 if err == nil { 70 count++ 71 } 72 } 73 if count > 0 { 74 fs.Logf(remote, "Deleted %d extra copies", count) 75 } 76 } 77 78 // dedupeDeleteIdentical deletes all but one of identical (by hash) copies 79 func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) { 80 // Make map of IDs 81 IDs := make(map[string]int, len(objs)) 82 for _, o := range objs { 83 if do, ok := o.(fs.IDer); ok { 84 if ID := do.ID(); ID != "" { 85 IDs[ID]++ 86 } 87 } 88 } 89 90 // Remove duplicate IDs 91 newObjs := objs[:0] 92 for _, o := range objs { 93 if do, ok := o.(fs.IDer); ok { 94 if ID := do.ID(); ID != "" { 95 if IDs[ID] <= 1 { 96 newObjs = append(newObjs, o) 97 } else { 98 fs.Logf(o, "Ignoring as it appears %d times in the listing and deleting would lead to data loss", IDs[ID]) 99 } 100 } 101 } 102 } 103 objs = newObjs 104 105 // See how many of these duplicates are identical 106 byHash := make(map[string][]fs.Object, len(objs)) 107 for _, o := range objs { 108 md5sum, err := o.Hash(ctx, ht) 109 if err != nil || md5sum == "" { 110 remainingObjs = append(remainingObjs, o) 111 } else { 112 byHash[md5sum] = append(byHash[md5sum], o) 113 } 114 } 115 116 // Delete identical duplicates, filling remainingObjs with the ones remaining 117 for md5sum, hashObjs := range byHash { 118 remainingObjs = append(remainingObjs, hashObjs[0]) 119 if len(hashObjs) > 1 { 120 fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, md5sum) 121 for _, o := range hashObjs[1:] { 122 err := DeleteFile(ctx, o) 123 if err != nil { 124 remainingObjs = append(remainingObjs, o) 125 } 126 } 127 } 128 } 129 130 return remainingObjs 131 } 132 133 // dedupeInteractive interactively dedupes the slice of objects 134 func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) { 135 fmt.Printf("%s: %d duplicates remain\n", remote, len(objs)) 136 for i, o := range objs { 137 md5sum, err := o.Hash(ctx, ht) 138 if err != nil { 139 md5sum = err.Error() 140 } 141 fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, md5sum) 142 } 143 switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) { 144 case 's': 145 case 'k': 146 keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs)) 147 dedupeDeleteAllButOne(ctx, keep-1, remote, objs) 148 case 'r': 149 dedupeRename(ctx, f, remote, objs) 150 } 151 } 152 153 // DeduplicateMode is how the dedupe command chooses what to do 154 type DeduplicateMode int 155 156 // Deduplicate modes 157 const ( 158 DeduplicateInteractive DeduplicateMode = iota // interactively ask the user 159 DeduplicateSkip // skip all conflicts 160 DeduplicateFirst // choose the first object 161 DeduplicateNewest // choose the newest object 162 DeduplicateOldest // choose the oldest object 163 DeduplicateRename // rename the objects 164 DeduplicateLargest // choose the largest object 165 DeduplicateSmallest // choose the smallest object 166 ) 167 168 func (x DeduplicateMode) String() string { 169 switch x { 170 case DeduplicateInteractive: 171 return "interactive" 172 case DeduplicateSkip: 173 return "skip" 174 case DeduplicateFirst: 175 return "first" 176 case DeduplicateNewest: 177 return "newest" 178 case DeduplicateOldest: 179 return "oldest" 180 case DeduplicateRename: 181 return "rename" 182 case DeduplicateLargest: 183 return "largest" 184 case DeduplicateSmallest: 185 return "smallest" 186 } 187 return "unknown" 188 } 189 190 // Set a DeduplicateMode from a string 191 func (x *DeduplicateMode) Set(s string) error { 192 switch strings.ToLower(s) { 193 case "interactive": 194 *x = DeduplicateInteractive 195 case "skip": 196 *x = DeduplicateSkip 197 case "first": 198 *x = DeduplicateFirst 199 case "newest": 200 *x = DeduplicateNewest 201 case "oldest": 202 *x = DeduplicateOldest 203 case "rename": 204 *x = DeduplicateRename 205 case "largest": 206 *x = DeduplicateLargest 207 case "smallest": 208 *x = DeduplicateSmallest 209 default: 210 return errors.Errorf("Unknown mode for dedupe %q.", s) 211 } 212 return nil 213 } 214 215 // Type of the value 216 func (x *DeduplicateMode) Type() string { 217 return "string" 218 } 219 220 // dedupeFindDuplicateDirs scans f for duplicate directories 221 func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) ([][]fs.Directory, error) { 222 dirs := map[string][]fs.Directory{} 223 err := walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListDirs, func(entries fs.DirEntries) error { 224 entries.ForDir(func(d fs.Directory) { 225 dirs[d.Remote()] = append(dirs[d.Remote()], d) 226 }) 227 return nil 228 }) 229 if err != nil { 230 return nil, errors.Wrap(err, "find duplicate dirs") 231 } 232 // make sure parents are before children 233 duplicateNames := []string{} 234 for name, ds := range dirs { 235 if len(ds) > 1 { 236 duplicateNames = append(duplicateNames, name) 237 } 238 } 239 sort.Strings(duplicateNames) 240 duplicateDirs := [][]fs.Directory{} 241 for _, name := range duplicateNames { 242 duplicateDirs = append(duplicateDirs, dirs[name]) 243 } 244 return duplicateDirs, nil 245 } 246 247 // dedupeMergeDuplicateDirs merges all the duplicate directories found 248 func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]fs.Directory) error { 249 mergeDirs := f.Features().MergeDirs 250 if mergeDirs == nil { 251 return errors.Errorf("%v: can't merge directories", f) 252 } 253 dirCacheFlush := f.Features().DirCacheFlush 254 if dirCacheFlush == nil { 255 return errors.Errorf("%v: can't flush dir cache", f) 256 } 257 for _, dirs := range duplicateDirs { 258 if !fs.Config.DryRun { 259 fs.Infof(dirs[0], "Merging contents of duplicate directories") 260 err := mergeDirs(ctx, dirs) 261 if err != nil { 262 err = fs.CountError(err) 263 fs.Errorf(nil, "merge duplicate dirs: %v", err) 264 } 265 } else { 266 fs.Infof(dirs[0], "NOT Merging contents of duplicate directories as --dry-run") 267 } 268 } 269 dirCacheFlush() 270 return nil 271 } 272 273 // sort oldest first 274 func sortOldestFirst(objs []fs.Object) { 275 sort.Slice(objs, func(i, j int) bool { 276 return objs[i].ModTime(context.TODO()).Before(objs[j].ModTime(context.TODO())) 277 }) 278 } 279 280 // sort smallest first 281 func sortSmallestFirst(objs []fs.Object) { 282 sort.Slice(objs, func(i, j int) bool { 283 return objs[i].Size() < objs[j].Size() 284 }) 285 } 286 287 // Deduplicate interactively finds duplicate files and offers to 288 // delete all but one or rename them to be different. Only useful with 289 // Google Drive which can have duplicate file names. 290 func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error { 291 fs.Infof(f, "Looking for duplicates using %v mode.", mode) 292 293 // Find duplicate directories first and fix them 294 duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f) 295 if err != nil { 296 return err 297 } 298 if len(duplicateDirs) != 0 { 299 err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs) 300 if err != nil { 301 return err 302 } 303 } 304 305 // find a hash to use 306 ht := f.Hashes().GetOne() 307 308 // Now find duplicate files 309 files := map[string][]fs.Object{} 310 err = walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error { 311 entries.ForObject(func(o fs.Object) { 312 remote := o.Remote() 313 files[remote] = append(files[remote], o) 314 }) 315 return nil 316 }) 317 if err != nil { 318 return err 319 } 320 321 for remote, objs := range files { 322 if len(objs) > 1 { 323 fs.Logf(remote, "Found %d duplicates - deleting identical copies", len(objs)) 324 objs = dedupeDeleteIdentical(ctx, ht, remote, objs) 325 if len(objs) <= 1 { 326 fs.Logf(remote, "All duplicates removed") 327 continue 328 } 329 switch mode { 330 case DeduplicateInteractive: 331 dedupeInteractive(ctx, f, ht, remote, objs) 332 case DeduplicateFirst: 333 dedupeDeleteAllButOne(ctx, 0, remote, objs) 334 case DeduplicateNewest: 335 sortOldestFirst(objs) 336 dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs) 337 case DeduplicateOldest: 338 sortOldestFirst(objs) 339 dedupeDeleteAllButOne(ctx, 0, remote, objs) 340 case DeduplicateRename: 341 dedupeRename(ctx, f, remote, objs) 342 case DeduplicateLargest: 343 sortSmallestFirst(objs) 344 dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs) 345 case DeduplicateSmallest: 346 sortSmallestFirst(objs) 347 dedupeDeleteAllButOne(ctx, 0, remote, objs) 348 case DeduplicateSkip: 349 // skip 350 default: 351 //skip 352 } 353 } 354 } 355 return nil 356 }