github.com/xhghs/rclone@v1.51.1-0.20200430155106-e186a28cced8/fs/operations/dedupe.go (about) 1 // dedupe - gets rid of identical files remotes which can have duplicate file names (drive, mega) 2 3 package operations 4 5 import ( 6 "context" 7 "fmt" 8 "log" 9 "path" 10 "sort" 11 "strings" 12 13 "github.com/pkg/errors" 14 "github.com/rclone/rclone/fs" 15 "github.com/rclone/rclone/fs/config" 16 "github.com/rclone/rclone/fs/hash" 17 "github.com/rclone/rclone/fs/walk" 18 ) 19 20 // dedupeRename renames the objs slice to different names 21 func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) { 22 doMove := f.Features().Move 23 if doMove == nil { 24 log.Fatalf("Fs %v doesn't support Move", f) 25 } 26 ext := path.Ext(remote) 27 base := remote[:len(remote)-len(ext)] 28 29 outer: 30 for i, o := range objs { 31 suffix := 1 32 newName := fmt.Sprintf("%s-%d%s", base, i+suffix, ext) 33 _, err := f.NewObject(ctx, newName) 34 for ; err != fs.ErrorObjectNotFound; suffix++ { 35 if err != nil { 36 err = fs.CountError(err) 37 fs.Errorf(o, "Failed to check for existing object: %v", err) 38 continue outer 39 } 40 if suffix > 100 { 41 fs.Errorf(o, "Could not find an available new name") 42 continue outer 43 } 44 newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext) 45 _, err = f.NewObject(ctx, newName) 46 } 47 if !fs.Config.DryRun { 48 newObj, err := doMove(ctx, o, newName) 49 if err != nil { 50 err = fs.CountError(err) 51 fs.Errorf(o, "Failed to rename: %v", err) 52 continue 53 } 54 fs.Infof(newObj, "renamed from: %v", o) 55 } else { 56 fs.Logf(remote, "Not renaming to %q as --dry-run", newName) 57 } 58 } 59 } 60 61 // dedupeDeleteAllButOne deletes all but the one in keep 62 func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) { 63 count := 0 64 for i, o := range objs { 65 if i == keep { 66 continue 67 } 68 err := DeleteFile(ctx, o) 69 if err == nil { 70 count++ 71 } 72 } 73 if count > 0 { 74 fs.Logf(remote, "Deleted %d extra copies", count) 75 } 76 } 77 78 // dedupeDeleteIdentical deletes all but one of identical (by hash) copies 79 func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) { 80 // See how many of these duplicates are identical 81 byHash := make(map[string][]fs.Object, len(objs)) 82 for _, o := range objs { 83 md5sum, err := o.Hash(ctx, ht) 84 if err != nil || md5sum == "" { 85 remainingObjs = append(remainingObjs, o) 86 } else { 87 byHash[md5sum] = append(byHash[md5sum], o) 88 } 89 } 90 91 // Delete identical duplicates, filling remainingObjs with the ones remaining 92 for md5sum, hashObjs := range byHash { 93 remainingObjs = append(remainingObjs, hashObjs[0]) 94 if len(hashObjs) > 1 { 95 fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, md5sum) 96 for _, o := range hashObjs[1:] { 97 err := DeleteFile(ctx, o) 98 if err != nil { 99 remainingObjs = append(remainingObjs, o) 100 } 101 } 102 } 103 } 104 105 return remainingObjs 106 } 107 108 // dedupeInteractive interactively dedupes the slice of objects 109 func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) { 110 fmt.Printf("%s: %d duplicates remain\n", remote, len(objs)) 111 for i, o := range objs { 112 md5sum, err := o.Hash(ctx, ht) 113 if err != nil { 114 md5sum = err.Error() 115 } 116 fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, md5sum) 117 } 118 switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) { 119 case 's': 120 case 'k': 121 keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs)) 122 dedupeDeleteAllButOne(ctx, keep-1, remote, objs) 123 case 'r': 124 dedupeRename(ctx, f, remote, objs) 125 } 126 } 127 128 // DeduplicateMode is how the dedupe command chooses what to do 129 type DeduplicateMode int 130 131 // Deduplicate modes 132 const ( 133 DeduplicateInteractive DeduplicateMode = iota // interactively ask the user 134 DeduplicateSkip // skip all conflicts 135 DeduplicateFirst // choose the first object 136 DeduplicateNewest // choose the newest object 137 DeduplicateOldest // choose the oldest object 138 DeduplicateRename // rename the objects 139 DeduplicateLargest // choose the largest object 140 DeduplicateSmallest // choose the smallest object 141 ) 142 143 func (x DeduplicateMode) String() string { 144 switch x { 145 case DeduplicateInteractive: 146 return "interactive" 147 case DeduplicateSkip: 148 return "skip" 149 case DeduplicateFirst: 150 return "first" 151 case DeduplicateNewest: 152 return "newest" 153 case DeduplicateOldest: 154 return "oldest" 155 case DeduplicateRename: 156 return "rename" 157 case DeduplicateLargest: 158 return "largest" 159 case DeduplicateSmallest: 160 return "smallest" 161 } 162 return "unknown" 163 } 164 165 // Set a DeduplicateMode from a string 166 func (x *DeduplicateMode) Set(s string) error { 167 switch strings.ToLower(s) { 168 case "interactive": 169 *x = DeduplicateInteractive 170 case "skip": 171 *x = DeduplicateSkip 172 case "first": 173 *x = DeduplicateFirst 174 case "newest": 175 *x = DeduplicateNewest 176 case "oldest": 177 *x = DeduplicateOldest 178 case "rename": 179 *x = DeduplicateRename 180 case "largest": 181 *x = DeduplicateLargest 182 case "smallest": 183 *x = DeduplicateSmallest 184 default: 185 return errors.Errorf("Unknown mode for dedupe %q.", s) 186 } 187 return nil 188 } 189 190 // Type of the value 191 func (x *DeduplicateMode) Type() string { 192 return "string" 193 } 194 195 // dedupeFindDuplicateDirs scans f for duplicate directories 196 func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) ([][]fs.Directory, error) { 197 dirs := map[string][]fs.Directory{} 198 err := walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListDirs, func(entries fs.DirEntries) error { 199 entries.ForDir(func(d fs.Directory) { 200 dirs[d.Remote()] = append(dirs[d.Remote()], d) 201 }) 202 return nil 203 }) 204 if err != nil { 205 return nil, errors.Wrap(err, "find duplicate dirs") 206 } 207 // make sure parents are before children 208 duplicateNames := []string{} 209 for name, ds := range dirs { 210 if len(ds) > 1 { 211 duplicateNames = append(duplicateNames, name) 212 } 213 } 214 sort.Strings(duplicateNames) 215 duplicateDirs := [][]fs.Directory{} 216 for _, name := range duplicateNames { 217 duplicateDirs = append(duplicateDirs, dirs[name]) 218 } 219 return duplicateDirs, nil 220 } 221 222 // dedupeMergeDuplicateDirs merges all the duplicate directories found 223 func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]fs.Directory) error { 224 mergeDirs := f.Features().MergeDirs 225 if mergeDirs == nil { 226 return errors.Errorf("%v: can't merge directories", f) 227 } 228 dirCacheFlush := f.Features().DirCacheFlush 229 if dirCacheFlush == nil { 230 return errors.Errorf("%v: can't flush dir cache", f) 231 } 232 for _, dirs := range duplicateDirs { 233 if !fs.Config.DryRun { 234 fs.Infof(dirs[0], "Merging contents of duplicate directories") 235 err := mergeDirs(ctx, dirs) 236 if err != nil { 237 err = fs.CountError(err) 238 fs.Errorf(nil, "merge duplicate dirs: %v", err) 239 } 240 } else { 241 fs.Infof(dirs[0], "NOT Merging contents of duplicate directories as --dry-run") 242 } 243 } 244 dirCacheFlush() 245 return nil 246 } 247 248 // sort oldest first 249 func sortOldestFirst(objs []fs.Object) { 250 sort.Slice(objs, func(i, j int) bool { 251 return objs[i].ModTime(context.TODO()).Before(objs[j].ModTime(context.TODO())) 252 }) 253 } 254 255 // sort smallest first 256 func sortSmallestFirst(objs []fs.Object) { 257 sort.Slice(objs, func(i, j int) bool { 258 return objs[i].Size() < objs[j].Size() 259 }) 260 } 261 262 // Deduplicate interactively finds duplicate files and offers to 263 // delete all but one or rename them to be different. Only useful with 264 // Google Drive which can have duplicate file names. 265 func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error { 266 fs.Infof(f, "Looking for duplicates using %v mode.", mode) 267 268 // Find duplicate directories first and fix them 269 duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f) 270 if err != nil { 271 return err 272 } 273 if len(duplicateDirs) != 0 { 274 err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs) 275 if err != nil { 276 return err 277 } 278 } 279 280 // find a hash to use 281 ht := f.Hashes().GetOne() 282 283 // Now find duplicate files 284 files := map[string][]fs.Object{} 285 err = walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error { 286 entries.ForObject(func(o fs.Object) { 287 remote := o.Remote() 288 files[remote] = append(files[remote], o) 289 }) 290 return nil 291 }) 292 if err != nil { 293 return err 294 } 295 296 for remote, objs := range files { 297 if len(objs) > 1 { 298 fs.Logf(remote, "Found %d duplicates - deleting identical copies", len(objs)) 299 objs = dedupeDeleteIdentical(ctx, ht, remote, objs) 300 if len(objs) <= 1 { 301 fs.Logf(remote, "All duplicates removed") 302 continue 303 } 304 switch mode { 305 case DeduplicateInteractive: 306 dedupeInteractive(ctx, f, ht, remote, objs) 307 case DeduplicateFirst: 308 dedupeDeleteAllButOne(ctx, 0, remote, objs) 309 case DeduplicateNewest: 310 sortOldestFirst(objs) 311 dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs) 312 case DeduplicateOldest: 313 sortOldestFirst(objs) 314 dedupeDeleteAllButOne(ctx, 0, remote, objs) 315 case DeduplicateRename: 316 dedupeRename(ctx, f, remote, objs) 317 case DeduplicateLargest: 318 sortSmallestFirst(objs) 319 dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs) 320 case DeduplicateSmallest: 321 sortSmallestFirst(objs) 322 dedupeDeleteAllButOne(ctx, 0, remote, objs) 323 case DeduplicateSkip: 324 // skip 325 default: 326 //skip 327 } 328 } 329 } 330 return nil 331 }