github.com/ncw/rclone@v1.48.1-0.20190724201158-a35aa1360e3e/fs/operations/dedupe.go (about)

// dedupe - gets rid of identical files on remotes which can have duplicate file names (drive, mega)
     2  
     3  package operations
     4  
     5  import (
     6  	"context"
     7  	"fmt"
     8  	"log"
     9  	"path"
    10  	"sort"
    11  	"strings"
    12  
    13  	"github.com/ncw/rclone/fs"
    14  	"github.com/ncw/rclone/fs/config"
    15  	"github.com/ncw/rclone/fs/hash"
    16  	"github.com/ncw/rclone/fs/walk"
    17  	"github.com/pkg/errors"
    18  	"github.com/spf13/pflag"
    19  )
    20  
    21  // dedupeRename renames the objs slice to different names
    22  func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) {
    23  	doMove := f.Features().Move
    24  	if doMove == nil {
    25  		log.Fatalf("Fs %v doesn't support Move", f)
    26  	}
    27  	ext := path.Ext(remote)
    28  	base := remote[:len(remote)-len(ext)]
    29  
    30  outer:
    31  	for i, o := range objs {
    32  		suffix := 1
    33  		newName := fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
    34  		_, err := f.NewObject(ctx, newName)
    35  		for ; err != fs.ErrorObjectNotFound; suffix++ {
    36  			if err != nil {
    37  				fs.CountError(err)
    38  				fs.Errorf(o, "Failed to check for existing object: %v", err)
    39  				continue outer
    40  			}
    41  			if suffix > 100 {
    42  				fs.Errorf(o, "Could not find an available new name")
    43  				continue outer
    44  			}
    45  			newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
    46  			_, err = f.NewObject(ctx, newName)
    47  		}
    48  		if !fs.Config.DryRun {
    49  			newObj, err := doMove(ctx, o, newName)
    50  			if err != nil {
    51  				fs.CountError(err)
    52  				fs.Errorf(o, "Failed to rename: %v", err)
    53  				continue
    54  			}
    55  			fs.Infof(newObj, "renamed from: %v", o)
    56  		} else {
    57  			fs.Logf(remote, "Not renaming to %q as --dry-run", newName)
    58  		}
    59  	}
    60  }
    61  
    62  // dedupeDeleteAllButOne deletes all but the one in keep
    63  func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) {
    64  	for i, o := range objs {
    65  		if i == keep {
    66  			continue
    67  		}
    68  		_ = DeleteFile(ctx, o)
    69  	}
    70  	fs.Logf(remote, "Deleted %d extra copies", len(objs)-1)
    71  }
    72  
    73  // dedupeDeleteIdentical deletes all but one of identical (by hash) copies
    74  func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) {
    75  	// See how many of these duplicates are identical
    76  	byHash := make(map[string][]fs.Object, len(objs))
    77  	for _, o := range objs {
    78  		md5sum, err := o.Hash(ctx, ht)
    79  		if err != nil || md5sum == "" {
    80  			remainingObjs = append(remainingObjs, o)
    81  		} else {
    82  			byHash[md5sum] = append(byHash[md5sum], o)
    83  		}
    84  	}
    85  
    86  	// Delete identical duplicates, filling remainingObjs with the ones remaining
    87  	for md5sum, hashObjs := range byHash {
    88  		if len(hashObjs) > 1 {
    89  			fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, md5sum)
    90  			for _, o := range hashObjs[1:] {
    91  				_ = DeleteFile(ctx, o)
    92  			}
    93  		}
    94  		remainingObjs = append(remainingObjs, hashObjs[0])
    95  	}
    96  
    97  	return remainingObjs
    98  }
    99  
   100  // dedupeInteractive interactively dedupes the slice of objects
   101  func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) {
   102  	fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
   103  	for i, o := range objs {
   104  		md5sum, err := o.Hash(ctx, ht)
   105  		if err != nil {
   106  			md5sum = err.Error()
   107  		}
   108  		fmt.Printf("  %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, md5sum)
   109  	}
   110  	switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
   111  	case 's':
   112  	case 'k':
   113  		keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
   114  		dedupeDeleteAllButOne(ctx, keep-1, remote, objs)
   115  	case 'r':
   116  		dedupeRename(ctx, f, remote, objs)
   117  	}
   118  }
   119  
// objectsSortedByModTime sorts a slice of objects into ascending
// modification time order (oldest first) via sort.Sort.
type objectsSortedByModTime []fs.Object

// Len returns the number of objects in the slice.
func (objs objectsSortedByModTime) Len() int      { return len(objs) }

// Swap exchanges the objects at indices i and j.
func (objs objectsSortedByModTime) Swap(i, j int) { objs[i], objs[j] = objs[j], objs[i] }

// Less reports whether the object at i was modified before the one at j.
func (objs objectsSortedByModTime) Less(i, j int) bool {
	return objs[i].ModTime(context.TODO()).Before(objs[j].ModTime(context.TODO()))
}
   127  
   128  // DeduplicateMode is how the dedupe command chooses what to do
   129  type DeduplicateMode int
   130  
   131  // Deduplicate modes
   132  const (
   133  	DeduplicateInteractive DeduplicateMode = iota // interactively ask the user
   134  	DeduplicateSkip                               // skip all conflicts
   135  	DeduplicateFirst                              // choose the first object
   136  	DeduplicateNewest                             // choose the newest object
   137  	DeduplicateOldest                             // choose the oldest object
   138  	DeduplicateRename                             // rename the objects
   139  	DeduplicateLargest                            // choose the largest object
   140  )
   141  
   142  func (x DeduplicateMode) String() string {
   143  	switch x {
   144  	case DeduplicateInteractive:
   145  		return "interactive"
   146  	case DeduplicateSkip:
   147  		return "skip"
   148  	case DeduplicateFirst:
   149  		return "first"
   150  	case DeduplicateNewest:
   151  		return "newest"
   152  	case DeduplicateOldest:
   153  		return "oldest"
   154  	case DeduplicateRename:
   155  		return "rename"
   156  	case DeduplicateLargest:
   157  		return "largest"
   158  	}
   159  	return "unknown"
   160  }
   161  
   162  // Set a DeduplicateMode from a string
   163  func (x *DeduplicateMode) Set(s string) error {
   164  	switch strings.ToLower(s) {
   165  	case "interactive":
   166  		*x = DeduplicateInteractive
   167  	case "skip":
   168  		*x = DeduplicateSkip
   169  	case "first":
   170  		*x = DeduplicateFirst
   171  	case "newest":
   172  		*x = DeduplicateNewest
   173  	case "oldest":
   174  		*x = DeduplicateOldest
   175  	case "rename":
   176  		*x = DeduplicateRename
   177  	case "largest":
   178  		*x = DeduplicateLargest
   179  	default:
   180  		return errors.Errorf("Unknown mode for dedupe %q.", s)
   181  	}
   182  	return nil
   183  }
   184  
// Type of the value ("string"), as required by the pflag.Value
// interface.
func (x *DeduplicateMode) Type() string {
	return "string"
}

// Check it satisfies the pflag.Value interface at compile time
var _ pflag.Value = (*DeduplicateMode)(nil)
   192  
   193  // dedupeFindDuplicateDirs scans f for duplicate directories
   194  func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) ([][]fs.Directory, error) {
   195  	dirs := map[string][]fs.Directory{}
   196  	err := walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListDirs, func(entries fs.DirEntries) error {
   197  		entries.ForDir(func(d fs.Directory) {
   198  			dirs[d.Remote()] = append(dirs[d.Remote()], d)
   199  		})
   200  		return nil
   201  	})
   202  	if err != nil {
   203  		return nil, errors.Wrap(err, "find duplicate dirs")
   204  	}
   205  	duplicateDirs := [][]fs.Directory{}
   206  	for _, ds := range dirs {
   207  		if len(ds) > 1 {
   208  			duplicateDirs = append(duplicateDirs, ds)
   209  		}
   210  	}
   211  	return duplicateDirs, nil
   212  }
   213  
   214  // dedupeMergeDuplicateDirs merges all the duplicate directories found
   215  func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]fs.Directory) error {
   216  	mergeDirs := f.Features().MergeDirs
   217  	if mergeDirs == nil {
   218  		return errors.Errorf("%v: can't merge directories", f)
   219  	}
   220  	dirCacheFlush := f.Features().DirCacheFlush
   221  	if dirCacheFlush == nil {
   222  		return errors.Errorf("%v: can't flush dir cache", f)
   223  	}
   224  	for _, dirs := range duplicateDirs {
   225  		if !fs.Config.DryRun {
   226  			fs.Infof(dirs[0], "Merging contents of duplicate directories")
   227  			err := mergeDirs(ctx, dirs)
   228  			if err != nil {
   229  				return errors.Wrap(err, "merge duplicate dirs")
   230  			}
   231  		} else {
   232  			fs.Infof(dirs[0], "NOT Merging contents of duplicate directories as --dry-run")
   233  		}
   234  	}
   235  	dirCacheFlush()
   236  	return nil
   237  }
   238  
// Deduplicate interactively finds duplicate files and offers to
// delete all but one or rename them to be different. Only useful with
// Google Drive which can have duplicate file names.
//
// It first merges duplicate directories (repeating until none are
// found), then groups objects by remote path, deletes byte-identical
// copies (same hash), and resolves the remaining duplicates according
// to mode.
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
	fs.Infof(f, "Looking for duplicates using %v mode.", mode)

	// Find duplicate directories first and fix them - repeat
	// until all fixed
	for {
		duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
		if err != nil {
			return err
		}
		if len(duplicateDirs) == 0 {
			break
		}
		err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
		if err != nil {
			return err
		}
		// With --dry-run nothing is actually merged, so the same
		// duplicates would be found forever - stop after one pass.
		if fs.Config.DryRun {
			break
		}
	}

	// find a hash to use
	ht := f.Hashes().GetOne()

	// Now find duplicate files: group every object by its remote path
	files := map[string][]fs.Object{}
	err := walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
		entries.ForObject(func(o fs.Object) {
			remote := o.Remote()
			files[remote] = append(files[remote], o)
		})
		return nil
	})
	if err != nil {
		return err
	}

	for remote, objs := range files {
		if len(objs) > 1 {
			fs.Logf(remote, "Found %d duplicates - deleting identical copies", len(objs))
			// Delete byte-identical copies first whatever the mode
			objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
			if len(objs) <= 1 {
				fs.Logf(remote, "All duplicates removed")
				continue
			}
			// Resolve the remaining non-identical duplicates
			switch mode {
			case DeduplicateInteractive:
				dedupeInteractive(ctx, f, ht, remote, objs)
			case DeduplicateFirst:
				dedupeDeleteAllButOne(ctx, 0, remote, objs)
			case DeduplicateNewest:
				sort.Sort(objectsSortedByModTime(objs)) // sort oldest first
				dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
			case DeduplicateOldest:
				sort.Sort(objectsSortedByModTime(objs)) // sort oldest first
				dedupeDeleteAllButOne(ctx, 0, remote, objs)
			case DeduplicateRename:
				dedupeRename(ctx, f, remote, objs)
			case DeduplicateLargest:
				// Find the index of the largest object and keep that one
				largest, largestIndex := int64(-1), -1
				for i, obj := range objs {
					size := obj.Size()
					if size > largest {
						largest, largestIndex = size, i
					}
				}
				if largestIndex > -1 {
					dedupeDeleteAllButOne(ctx, largestIndex, remote, objs)
				}
			case DeduplicateSkip:
				// skip
			default:
				//skip
			}
		}
	}
	return nil
}