github.com/ncw/rclone@v1.48.1-0.20190724201158-a35aa1360e3e/fs/operations/dedupe.go (about) 1 // dedupe - gets rid of identical files remotes which can have duplicate file names (drive, mega) 2 3 package operations 4 5 import ( 6 "context" 7 "fmt" 8 "log" 9 "path" 10 "sort" 11 "strings" 12 13 "github.com/ncw/rclone/fs" 14 "github.com/ncw/rclone/fs/config" 15 "github.com/ncw/rclone/fs/hash" 16 "github.com/ncw/rclone/fs/walk" 17 "github.com/pkg/errors" 18 "github.com/spf13/pflag" 19 ) 20 21 // dedupeRename renames the objs slice to different names 22 func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) { 23 doMove := f.Features().Move 24 if doMove == nil { 25 log.Fatalf("Fs %v doesn't support Move", f) 26 } 27 ext := path.Ext(remote) 28 base := remote[:len(remote)-len(ext)] 29 30 outer: 31 for i, o := range objs { 32 suffix := 1 33 newName := fmt.Sprintf("%s-%d%s", base, i+suffix, ext) 34 _, err := f.NewObject(ctx, newName) 35 for ; err != fs.ErrorObjectNotFound; suffix++ { 36 if err != nil { 37 fs.CountError(err) 38 fs.Errorf(o, "Failed to check for existing object: %v", err) 39 continue outer 40 } 41 if suffix > 100 { 42 fs.Errorf(o, "Could not find an available new name") 43 continue outer 44 } 45 newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext) 46 _, err = f.NewObject(ctx, newName) 47 } 48 if !fs.Config.DryRun { 49 newObj, err := doMove(ctx, o, newName) 50 if err != nil { 51 fs.CountError(err) 52 fs.Errorf(o, "Failed to rename: %v", err) 53 continue 54 } 55 fs.Infof(newObj, "renamed from: %v", o) 56 } else { 57 fs.Logf(remote, "Not renaming to %q as --dry-run", newName) 58 } 59 } 60 } 61 62 // dedupeDeleteAllButOne deletes all but the one in keep 63 func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) { 64 for i, o := range objs { 65 if i == keep { 66 continue 67 } 68 _ = DeleteFile(ctx, o) 69 } 70 fs.Logf(remote, "Deleted %d extra copies", len(objs)-1) 71 } 72 73 // dedupeDeleteIdentical deletes all but one of identical (by hash) copies 74 func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) { 75 // See how many of these duplicates are identical 76 byHash := make(map[string][]fs.Object, len(objs)) 77 for _, o := range objs { 78 md5sum, err := o.Hash(ctx, ht) 79 if err != nil || md5sum == "" { 80 remainingObjs = append(remainingObjs, o) 81 } else { 82 byHash[md5sum] = append(byHash[md5sum], o) 83 } 84 } 85 86 // Delete identical duplicates, filling remainingObjs with the ones remaining 87 for md5sum, hashObjs := range byHash { 88 if len(hashObjs) > 1 { 89 fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, md5sum) 90 for _, o := range hashObjs[1:] { 91 _ = DeleteFile(ctx, o) 92 } 93 } 94 remainingObjs = append(remainingObjs, hashObjs[0]) 95 } 96 97 return remainingObjs 98 } 99 100 // dedupeInteractive interactively dedupes the slice of objects 101 func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) { 102 fmt.Printf("%s: %d duplicates remain\n", remote, len(objs)) 103 for i, o := range objs { 104 md5sum, err := o.Hash(ctx, ht) 105 if err != nil { 106 md5sum = err.Error() 107 } 108 fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, md5sum) 109 } 110 switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) { 111 case 's': 112 case 'k': 113 keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs)) 114 dedupeDeleteAllButOne(ctx, keep-1, remote, objs) 115 case 'r': 116 dedupeRename(ctx, f, remote, objs) 117 } 118 } 119 120 type objectsSortedByModTime []fs.Object 121 122 func (objs objectsSortedByModTime) Len() int { return len(objs) } 123 func (objs objectsSortedByModTime) Swap(i, j int) { objs[i], objs[j] = objs[j], objs[i] } 124 func (objs objectsSortedByModTime) Less(i, j int) bool { 125 return objs[i].ModTime(context.TODO()).Before(objs[j].ModTime(context.TODO())) 126 } 127 128 // DeduplicateMode is how the dedupe command chooses what to do 129 type DeduplicateMode int 130 131 // Deduplicate modes 132 const ( 133 DeduplicateInteractive DeduplicateMode = iota // interactively ask the user 134 DeduplicateSkip // skip all conflicts 135 DeduplicateFirst // choose the first object 136 DeduplicateNewest // choose the newest object 137 DeduplicateOldest // choose the oldest object 138 DeduplicateRename // rename the objects 139 DeduplicateLargest // choose the largest object 140 ) 141 142 func (x DeduplicateMode) String() string { 143 switch x { 144 case DeduplicateInteractive: 145 return "interactive" 146 case DeduplicateSkip: 147 return "skip" 148 case DeduplicateFirst: 149 return "first" 150 case DeduplicateNewest: 151 return "newest" 152 case DeduplicateOldest: 153 return "oldest" 154 case DeduplicateRename: 155 return "rename" 156 case DeduplicateLargest: 157 return "largest" 158 } 159 return "unknown" 160 } 161 162 // Set a DeduplicateMode from a string 163 func (x *DeduplicateMode) Set(s string) error { 164 switch strings.ToLower(s) { 165 case "interactive": 166 *x = DeduplicateInteractive 167 case "skip": 168 *x = DeduplicateSkip 169 case "first": 170 *x = DeduplicateFirst 171 case "newest": 172 *x = DeduplicateNewest 173 case "oldest": 174 *x = DeduplicateOldest 175 case "rename": 176 *x = DeduplicateRename 177 case "largest": 178 *x = DeduplicateLargest 179 default: 180 return errors.Errorf("Unknown mode for dedupe %q.", s) 181 } 182 return nil 183 } 184 185 // Type of the value 186 func (x *DeduplicateMode) Type() string { 187 return "string" 188 } 189 190 // Check it satisfies the interface 191 var _ pflag.Value = (*DeduplicateMode)(nil) 192 193 // dedupeFindDuplicateDirs scans f for duplicate directories 194 func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) ([][]fs.Directory, error) { 195 dirs := map[string][]fs.Directory{} 196 err := walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListDirs, func(entries fs.DirEntries) error { 197 entries.ForDir(func(d fs.Directory) { 198 dirs[d.Remote()] = append(dirs[d.Remote()], d) 199 }) 200 return nil 201 }) 202 if err != nil { 203 return nil, errors.Wrap(err, "find duplicate dirs") 204 } 205 duplicateDirs := [][]fs.Directory{} 206 for _, ds := range dirs { 207 if len(ds) > 1 { 208 duplicateDirs = append(duplicateDirs, ds) 209 } 210 } 211 return duplicateDirs, nil 212 } 213 214 // dedupeMergeDuplicateDirs merges all the duplicate directories found 215 func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]fs.Directory) error { 216 mergeDirs := f.Features().MergeDirs 217 if mergeDirs == nil { 218 return errors.Errorf("%v: can't merge directories", f) 219 } 220 dirCacheFlush := f.Features().DirCacheFlush 221 if dirCacheFlush == nil { 222 return errors.Errorf("%v: can't flush dir cache", f) 223 } 224 for _, dirs := range duplicateDirs { 225 if !fs.Config.DryRun { 226 fs.Infof(dirs[0], "Merging contents of duplicate directories") 227 err := mergeDirs(ctx, dirs) 228 if err != nil { 229 return errors.Wrap(err, "merge duplicate dirs") 230 } 231 } else { 232 fs.Infof(dirs[0], "NOT Merging contents of duplicate directories as --dry-run") 233 } 234 } 235 dirCacheFlush() 236 return nil 237 } 238 239 // Deduplicate interactively finds duplicate files and offers to 240 // delete all but one or rename them to be different. Only useful with 241 // Google Drive which can have duplicate file names. 242 func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error { 243 fs.Infof(f, "Looking for duplicates using %v mode.", mode) 244 245 // Find duplicate directories first and fix them - repeat 246 // until all fixed 247 for { 248 duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f) 249 if err != nil { 250 return err 251 } 252 if len(duplicateDirs) == 0 { 253 break 254 } 255 err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs) 256 if err != nil { 257 return err 258 } 259 if fs.Config.DryRun { 260 break 261 } 262 } 263 264 // find a hash to use 265 ht := f.Hashes().GetOne() 266 267 // Now find duplicate files 268 files := map[string][]fs.Object{} 269 err := walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error { 270 entries.ForObject(func(o fs.Object) { 271 remote := o.Remote() 272 files[remote] = append(files[remote], o) 273 }) 274 return nil 275 }) 276 if err != nil { 277 return err 278 } 279 280 for remote, objs := range files { 281 if len(objs) > 1 { 282 fs.Logf(remote, "Found %d duplicates - deleting identical copies", len(objs)) 283 objs = dedupeDeleteIdentical(ctx, ht, remote, objs) 284 if len(objs) <= 1 { 285 fs.Logf(remote, "All duplicates removed") 286 continue 287 } 288 switch mode { 289 case DeduplicateInteractive: 290 dedupeInteractive(ctx, f, ht, remote, objs) 291 case DeduplicateFirst: 292 dedupeDeleteAllButOne(ctx, 0, remote, objs) 293 case DeduplicateNewest: 294 sort.Sort(objectsSortedByModTime(objs)) // sort oldest first 295 dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs) 296 case DeduplicateOldest: 297 sort.Sort(objectsSortedByModTime(objs)) // sort oldest first 298 dedupeDeleteAllButOne(ctx, 0, remote, objs) 299 case DeduplicateRename: 300 dedupeRename(ctx, f, remote, objs) 301 case DeduplicateLargest: 302 largest, largestIndex := int64(-1), -1 303 for i, obj := range objs { 304 size := obj.Size() 305 if size > largest { 306 largest, largestIndex = size, i 307 } 308 } 309 if largestIndex > -1 { 310 dedupeDeleteAllButOne(ctx, largestIndex, remote, objs) 311 } 312 case DeduplicateSkip: 313 // skip 314 default: 315 //skip 316 } 317 } 318 } 319 return nil 320 }