github.com/10XDev/rclone@v1.52.3-0.20200626220027-16af9ab76b2a/fs/operations/dedupe.go

// dedupe - gets rid of identical files on remotes which can have duplicate file names (drive, mega)

package operations

import (
	"context"
	"fmt"
	"log"
	"path"
	"sort"
	"strings"

	"github.com/pkg/errors"
	"github.com/rclone/rclone/fs"
	"github.com/rclone/rclone/fs/config"
	"github.com/rclone/rclone/fs/hash"
	"github.com/rclone/rclone/fs/walk"
)

// dedupeRename renames the objs slice to different names
func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) {
	doMove := f.Features().Move
	if doMove == nil {
		log.Fatalf("Fs %v doesn't support Move", f)
	}
	ext := path.Ext(remote)
	base := remote[:len(remote)-len(ext)]

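	// Probe for an unused name for each object by trying base-1.ext,
	// base-2.ext, ... with NewObject until a name reports
	// fs.ErrorObjectNotFound, giving up after 100 attempts.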
outer:
	for i, o := range objs {
		suffix := 1
		newName := fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
		_, err := f.NewObject(ctx, newName)
		for ; err != fs.ErrorObjectNotFound; suffix++ {
			if err != nil {
				err = fs.CountError(err)
				fs.Errorf(o, "Failed to check for existing object: %v", err)
				continue outer
			}
			if suffix > 100 {
				fs.Errorf(o, "Could not find an available new name")
				continue outer
			}
			newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
			_, err = f.NewObject(ctx, newName)
		}
		if !fs.Config.DryRun {
			newObj, err := doMove(ctx, o, newName)
			if err != nil {
				err = fs.CountError(err)
				fs.Errorf(o, "Failed to rename: %v", err)
				continue
			}
			fs.Infof(newObj, "renamed from: %v", o)
		} else {
			fs.Logf(remote, "Not renaming to %q as --dry-run", newName)
		}
	}
}

// dedupeDeleteAllButOne deletes all the objects except the one at index keep
func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) {
	count := 0
	for i, o := range objs {
		if i == keep {
			continue
		}
		err := DeleteFile(ctx, o)
		if err == nil {
			count++
		}
	}
	if count > 0 {
		fs.Logf(remote, "Deleted %d extra copies", count)
	}
}

// dedupeDeleteIdentical deletes all but one of identical (by hash) copies
func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) {
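	// On some backends the same underlying file (same backend ID) can
	// appear more than once in a listing; such entries are not real
	// duplicates and deleting one of them would lose the only copy.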
	// Make map of IDs
	IDs := make(map[string]int, len(objs))
	for _, o := range objs {
		if do, ok := o.(fs.IDer); ok {
			if ID := do.ID(); ID != "" {
				IDs[ID]++
			}
		}
	}

	// Remove duplicate IDs
	newObjs := objs[:0]
	for _, o := range objs {
		if do, ok := o.(fs.IDer); ok {
			if ID := do.ID(); ID != "" {
				if IDs[ID] <= 1 {
					newObjs = append(newObjs, o)
				} else {
					fs.Logf(o, "Ignoring as it appears %d times in the listing and deleting would lead to data loss", IDs[ID])
				}
			}
		}
	}
	objs = newObjs

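	// Objects whose hash is unavailable or empty can't be proven
	// identical, so they go straight into remainingObjs for the caller
	// to resolve.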
	// See how many of these duplicates are identical
	byHash := make(map[string][]fs.Object, len(objs))
	for _, o := range objs {
		md5sum, err := o.Hash(ctx, ht)
		if err != nil || md5sum == "" {
			remainingObjs = append(remainingObjs, o)
		} else {
			byHash[md5sum] = append(byHash[md5sum], o)
		}
	}

	// Delete identical duplicates, filling remainingObjs with the ones remaining
	for md5sum, hashObjs := range byHash {
		remainingObjs = append(remainingObjs, hashObjs[0])
		if len(hashObjs) > 1 {
			fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, md5sum)
			for _, o := range hashObjs[1:] {
				err := DeleteFile(ctx, o)
				if err != nil {
					remainingObjs = append(remainingObjs, o)
				}
			}
		}
	}

	return remainingObjs
}

// dedupeInteractive interactively dedupes the slice of objects
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) {
	fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
	for i, o := range objs {
		md5sum, err := o.Hash(ctx, ht)
		if err != nil {
			md5sum = err.Error()
		}
		fmt.Printf("  %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, md5sum)
	}
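	// Ask the user what to do; config.Command returns the leading letter
	// of the chosen option (s, k or r).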
	switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
	case 's':
	case 'k':
		keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
		dedupeDeleteAllButOne(ctx, keep-1, remote, objs)
	case 'r':
		dedupeRename(ctx, f, remote, objs)
	}
}

// DeduplicateMode is how the dedupe command chooses what to do
type DeduplicateMode int

// Deduplicate modes
const (
	DeduplicateInteractive DeduplicateMode = iota // interactively ask the user
	DeduplicateSkip                               // skip all conflicts
	DeduplicateFirst                              // choose the first object
	DeduplicateNewest                             // choose the newest object
	DeduplicateOldest                             // choose the oldest object
	DeduplicateRename                             // rename the objects
	DeduplicateLargest                            // choose the largest object
	DeduplicateSmallest                           // choose the smallest object
)

func (x DeduplicateMode) String() string {
	switch x {
	case DeduplicateInteractive:
		return "interactive"
	case DeduplicateSkip:
		return "skip"
	case DeduplicateFirst:
		return "first"
	case DeduplicateNewest:
		return "newest"
	case DeduplicateOldest:
		return "oldest"
	case DeduplicateRename:
		return "rename"
	case DeduplicateLargest:
		return "largest"
	case DeduplicateSmallest:
		return "smallest"
	}
	return "unknown"
}

// Set a DeduplicateMode from a string
func (x *DeduplicateMode) Set(s string) error {
	switch strings.ToLower(s) {
	case "interactive":
		*x = DeduplicateInteractive
	case "skip":
		*x = DeduplicateSkip
	case "first":
		*x = DeduplicateFirst
	case "newest":
		*x = DeduplicateNewest
	case "oldest":
		*x = DeduplicateOldest
	case "rename":
		*x = DeduplicateRename
	case "largest":
		*x = DeduplicateLargest
	case "smallest":
		*x = DeduplicateSmallest
	default:
		return errors.Errorf("Unknown mode for dedupe %q.", s)
	}
	return nil
}

// Type of the value
func (x *DeduplicateMode) Type() string {
	return "string"
}
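// String, Set and Type above together satisfy the pflag.Value interface, so a
// DeduplicateMode can be bound directly to a command line flag.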

// dedupeFindDuplicateDirs scans f for duplicate directories
func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) ([][]fs.Directory, error) {
	dirs := map[string][]fs.Directory{}
	err := walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListDirs, func(entries fs.DirEntries) error {
		entries.ForDir(func(d fs.Directory) {
			dirs[d.Remote()] = append(dirs[d.Remote()], d)
		})
		return nil
	})
	if err != nil {
		return nil, errors.Wrap(err, "find duplicate dirs")
	}
	// make sure parents are before children - sorting the full paths
	// lexicographically puts a directory before anything inside it
	duplicateNames := []string{}
	for name, ds := range dirs {
		if len(ds) > 1 {
			duplicateNames = append(duplicateNames, name)
		}
	}
	sort.Strings(duplicateNames)
	duplicateDirs := [][]fs.Directory{}
	for _, name := range duplicateNames {
		duplicateDirs = append(duplicateDirs, dirs[name])
	}
	return duplicateDirs, nil
}

// dedupeMergeDuplicateDirs merges all the duplicate directories found
func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]fs.Directory) error {
	mergeDirs := f.Features().MergeDirs
	if mergeDirs == nil {
		return errors.Errorf("%v: can't merge directories", f)
	}
	dirCacheFlush := f.Features().DirCacheFlush
	if dirCacheFlush == nil {
		return errors.Errorf("%v: can't flush dir cache", f)
	}
	for _, dirs := range duplicateDirs {
		if !fs.Config.DryRun {
			fs.Infof(dirs[0], "Merging contents of duplicate directories")
			err := mergeDirs(ctx, dirs)
			if err != nil {
				err = fs.CountError(err)
				fs.Errorf(nil, "merge duplicate dirs: %v", err)
			}
		} else {
			fs.Infof(dirs[0], "NOT Merging contents of duplicate directories as --dry-run")
		}
	}
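	// Flush the directory cache once all merges are done so that
	// subsequent listings see the merged directories.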
	dirCacheFlush()
	return nil
}

// sort oldest first
func sortOldestFirst(objs []fs.Object) {
	sort.Slice(objs, func(i, j int) bool {
		return objs[i].ModTime(context.TODO()).Before(objs[j].ModTime(context.TODO()))
	})
}

// sort smallest first
func sortSmallestFirst(objs []fs.Object) {
	sort.Slice(objs, func(i, j int) bool {
		return objs[i].Size() < objs[j].Size()
	})
}

// Deduplicate interactively finds duplicate files and offers to
// delete all but one or rename them to be different. Only useful
// with remotes which can have duplicate file names (e.g. Google
// Drive, Mega).
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
	fs.Infof(f, "Looking for duplicates using %v mode.", mode)

	// Find duplicate directories first and fix them
	duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
	if err != nil {
		return err
	}
	if len(duplicateDirs) != 0 {
		err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
		if err != nil {
			return err
		}
	}

	// find a hash to use
	ht := f.Hashes().GetOne()

	// Now find duplicate files
	files := map[string][]fs.Object{}
	err = walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
		entries.ForObject(func(o fs.Object) {
			remote := o.Remote()
			files[remote] = append(files[remote], o)
		})
		return nil
	})
	if err != nil {
		return err
	}

	for remote, objs := range files {
		if len(objs) > 1 {
			fs.Logf(remote, "Found %d duplicates - deleting identical copies", len(objs))
			objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
			if len(objs) <= 1 {
				fs.Logf(remote, "All duplicates removed")
				continue
			}
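			// Identical copies have already been removed above; the
			// chosen mode decides what happens to the differing copies
			// that remain.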
			switch mode {
			case DeduplicateInteractive:
				dedupeInteractive(ctx, f, ht, remote, objs)
			case DeduplicateFirst:
				dedupeDeleteAllButOne(ctx, 0, remote, objs)
			case DeduplicateNewest:
				sortOldestFirst(objs)
				dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
			case DeduplicateOldest:
				sortOldestFirst(objs)
				dedupeDeleteAllButOne(ctx, 0, remote, objs)
			case DeduplicateRename:
				dedupeRename(ctx, f, remote, objs)
			case DeduplicateLargest:
				sortSmallestFirst(objs)
				dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
			case DeduplicateSmallest:
				sortSmallestFirst(objs)
				dedupeDeleteAllButOne(ctx, 0, remote, objs)
			case DeduplicateSkip:
				// skip
			default:
				// skip
			}
		}
	}
	return nil
}
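
// Usage sketch: assuming an fs.Fs (f) and a context.Context (ctx) have been
// set up elsewhere, a non-interactive dedupe that keeps the newest copy of
// each duplicated name could be invoked as:
//
//	if err := Deduplicate(ctx, f, DeduplicateNewest); err != nil {
//		fs.Errorf(f, "dedupe failed: %v", err)
//	}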