github.com/xhghs/rclone@v1.51.1-0.20200430155106-e186a28cced8/fs/operations/dedupe.go (about)

// dedupe - gets rid of identical files on remotes which can have duplicate file names (drive, mega)
     2  
     3  package operations
     4  
     5  import (
     6  	"context"
     7  	"fmt"
     8  	"log"
     9  	"path"
    10  	"sort"
    11  	"strings"
    12  
    13  	"github.com/pkg/errors"
    14  	"github.com/rclone/rclone/fs"
    15  	"github.com/rclone/rclone/fs/config"
    16  	"github.com/rclone/rclone/fs/hash"
    17  	"github.com/rclone/rclone/fs/walk"
    18  )
    19  
// dedupeRename renames the objs slice to different names
//
// Each object is renamed to "<base>-<N><ext>" (e.g. file.jpg ->
// file-1.jpg), probing the remote with NewObject until a name is found
// that doesn't already exist.  Requires the Fs to support Move and
// aborts the process if it doesn't.  With --dry-run it only logs what
// would have been renamed.
func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) {
	doMove := f.Features().Move
	if doMove == nil {
		log.Fatalf("Fs %v doesn't support Move", f)
	}
	ext := path.Ext(remote)
	base := remote[:len(remote)-len(ext)]

outer:
	for i, o := range objs {
		suffix := 1
		newName := fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
		_, err := f.NewObject(ctx, newName)
		// Probe candidate names until NewObject reports the name is free
		// (fs.ErrorObjectNotFound); suffix++ runs each iteration so the
		// next candidate is base-(i+suffix)ext.
		for ; err != fs.ErrorObjectNotFound; suffix++ {
			if err != nil {
				// Unexpected lookup error - count it and skip this object
				err = fs.CountError(err)
				fs.Errorf(o, "Failed to check for existing object: %v", err)
				continue outer
			}
			if suffix > 100 {
				// Give up after 100 candidates rather than probing forever
				fs.Errorf(o, "Could not find an available new name")
				continue outer
			}
			newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
			_, err = f.NewObject(ctx, newName)
		}
		if !fs.Config.DryRun {
			newObj, err := doMove(ctx, o, newName)
			if err != nil {
				err = fs.CountError(err)
				fs.Errorf(o, "Failed to rename: %v", err)
				continue
			}
			fs.Infof(newObj, "renamed from: %v", o)
		} else {
			fs.Logf(remote, "Not renaming to %q as --dry-run", newName)
		}
	}
}
    60  
    61  // dedupeDeleteAllButOne deletes all but the one in keep
    62  func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) {
    63  	count := 0
    64  	for i, o := range objs {
    65  		if i == keep {
    66  			continue
    67  		}
    68  		err := DeleteFile(ctx, o)
    69  		if err == nil {
    70  			count++
    71  		}
    72  	}
    73  	if count > 0 {
    74  		fs.Logf(remote, "Deleted %d extra copies", count)
    75  	}
    76  }
    77  
    78  // dedupeDeleteIdentical deletes all but one of identical (by hash) copies
    79  func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) {
    80  	// See how many of these duplicates are identical
    81  	byHash := make(map[string][]fs.Object, len(objs))
    82  	for _, o := range objs {
    83  		md5sum, err := o.Hash(ctx, ht)
    84  		if err != nil || md5sum == "" {
    85  			remainingObjs = append(remainingObjs, o)
    86  		} else {
    87  			byHash[md5sum] = append(byHash[md5sum], o)
    88  		}
    89  	}
    90  
    91  	// Delete identical duplicates, filling remainingObjs with the ones remaining
    92  	for md5sum, hashObjs := range byHash {
    93  		remainingObjs = append(remainingObjs, hashObjs[0])
    94  		if len(hashObjs) > 1 {
    95  			fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, md5sum)
    96  			for _, o := range hashObjs[1:] {
    97  				err := DeleteFile(ctx, o)
    98  				if err != nil {
    99  					remainingObjs = append(remainingObjs, o)
   100  				}
   101  			}
   102  		}
   103  	}
   104  
   105  	return remainingObjs
   106  }
   107  
   108  // dedupeInteractive interactively dedupes the slice of objects
   109  func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) {
   110  	fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
   111  	for i, o := range objs {
   112  		md5sum, err := o.Hash(ctx, ht)
   113  		if err != nil {
   114  			md5sum = err.Error()
   115  		}
   116  		fmt.Printf("  %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, md5sum)
   117  	}
   118  	switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
   119  	case 's':
   120  	case 'k':
   121  		keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
   122  		dedupeDeleteAllButOne(ctx, keep-1, remote, objs)
   123  	case 'r':
   124  		dedupeRename(ctx, f, remote, objs)
   125  	}
   126  }
   127  
// DeduplicateMode is how the dedupe command chooses what to do
//
// It satisfies the pflag value interface via String/Set/Type so it can
// be used directly as a command line flag.
type DeduplicateMode int

// Deduplicate modes - the zero value is interactive, and the values are
// contiguous from iota so they can be used as indices.
const (
	DeduplicateInteractive DeduplicateMode = iota // interactively ask the user
	DeduplicateSkip                               // skip all conflicts
	DeduplicateFirst                              // choose the first object
	DeduplicateNewest                             // choose the newest object
	DeduplicateOldest                             // choose the oldest object
	DeduplicateRename                             // rename the objects
	DeduplicateLargest                            // choose the largest object
	DeduplicateSmallest                           // choose the smallest object
)
   142  
   143  func (x DeduplicateMode) String() string {
   144  	switch x {
   145  	case DeduplicateInteractive:
   146  		return "interactive"
   147  	case DeduplicateSkip:
   148  		return "skip"
   149  	case DeduplicateFirst:
   150  		return "first"
   151  	case DeduplicateNewest:
   152  		return "newest"
   153  	case DeduplicateOldest:
   154  		return "oldest"
   155  	case DeduplicateRename:
   156  		return "rename"
   157  	case DeduplicateLargest:
   158  		return "largest"
   159  	case DeduplicateSmallest:
   160  		return "smallest"
   161  	}
   162  	return "unknown"
   163  }
   164  
   165  // Set a DeduplicateMode from a string
   166  func (x *DeduplicateMode) Set(s string) error {
   167  	switch strings.ToLower(s) {
   168  	case "interactive":
   169  		*x = DeduplicateInteractive
   170  	case "skip":
   171  		*x = DeduplicateSkip
   172  	case "first":
   173  		*x = DeduplicateFirst
   174  	case "newest":
   175  		*x = DeduplicateNewest
   176  	case "oldest":
   177  		*x = DeduplicateOldest
   178  	case "rename":
   179  		*x = DeduplicateRename
   180  	case "largest":
   181  		*x = DeduplicateLargest
   182  	case "smallest":
   183  		*x = DeduplicateSmallest
   184  	default:
   185  		return errors.Errorf("Unknown mode for dedupe %q.", s)
   186  	}
   187  	return nil
   188  }
   189  
// Type of the value
//
// Always returns "string" - the textual mode name is what gets shown
// for this flag's type on the command line.
func (x *DeduplicateMode) Type() string {
	return "string"
}
   194  
   195  // dedupeFindDuplicateDirs scans f for duplicate directories
   196  func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) ([][]fs.Directory, error) {
   197  	dirs := map[string][]fs.Directory{}
   198  	err := walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListDirs, func(entries fs.DirEntries) error {
   199  		entries.ForDir(func(d fs.Directory) {
   200  			dirs[d.Remote()] = append(dirs[d.Remote()], d)
   201  		})
   202  		return nil
   203  	})
   204  	if err != nil {
   205  		return nil, errors.Wrap(err, "find duplicate dirs")
   206  	}
   207  	// make sure parents are before children
   208  	duplicateNames := []string{}
   209  	for name, ds := range dirs {
   210  		if len(ds) > 1 {
   211  			duplicateNames = append(duplicateNames, name)
   212  		}
   213  	}
   214  	sort.Strings(duplicateNames)
   215  	duplicateDirs := [][]fs.Directory{}
   216  	for _, name := range duplicateNames {
   217  		duplicateDirs = append(duplicateDirs, dirs[name])
   218  	}
   219  	return duplicateDirs, nil
   220  }
   221  
   222  // dedupeMergeDuplicateDirs merges all the duplicate directories found
   223  func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]fs.Directory) error {
   224  	mergeDirs := f.Features().MergeDirs
   225  	if mergeDirs == nil {
   226  		return errors.Errorf("%v: can't merge directories", f)
   227  	}
   228  	dirCacheFlush := f.Features().DirCacheFlush
   229  	if dirCacheFlush == nil {
   230  		return errors.Errorf("%v: can't flush dir cache", f)
   231  	}
   232  	for _, dirs := range duplicateDirs {
   233  		if !fs.Config.DryRun {
   234  			fs.Infof(dirs[0], "Merging contents of duplicate directories")
   235  			err := mergeDirs(ctx, dirs)
   236  			if err != nil {
   237  				err = fs.CountError(err)
   238  				fs.Errorf(nil, "merge duplicate dirs: %v", err)
   239  			}
   240  		} else {
   241  			fs.Infof(dirs[0], "NOT Merging contents of duplicate directories as --dry-run")
   242  		}
   243  	}
   244  	dirCacheFlush()
   245  	return nil
   246  }
   247  
   248  // sort oldest first
   249  func sortOldestFirst(objs []fs.Object) {
   250  	sort.Slice(objs, func(i, j int) bool {
   251  		return objs[i].ModTime(context.TODO()).Before(objs[j].ModTime(context.TODO()))
   252  	})
   253  }
   254  
   255  // sort smallest first
   256  func sortSmallestFirst(objs []fs.Object) {
   257  	sort.Slice(objs, func(i, j int) bool {
   258  		return objs[i].Size() < objs[j].Size()
   259  	})
   260  }
   261  
// Deduplicate interactively finds duplicate files and offers to
// delete all but one or rename them to be different. Only useful with
// Google Drive which can have duplicate file names.
//
// It works in two passes: duplicate directories are merged first
// (otherwise their contents would surface as duplicate files), then
// duplicate file names are resolved according to mode.  Identical
// copies (same hash) are always deleted before mode is consulted.
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
	fs.Infof(f, "Looking for duplicates using %v mode.", mode)

	// Find duplicate directories first and fix them
	duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
	if err != nil {
		return err
	}
	if len(duplicateDirs) != 0 {
		err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
		if err != nil {
			return err
		}
	}

	// find a hash to use
	ht := f.Hashes().GetOne()

	// Now find duplicate files, grouping every object by its remote path
	files := map[string][]fs.Object{}
	err = walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
		entries.ForObject(func(o fs.Object) {
			remote := o.Remote()
			files[remote] = append(files[remote], o)
		})
		return nil
	})
	if err != nil {
		return err
	}

	for remote, objs := range files {
		if len(objs) > 1 {
			fs.Logf(remote, "Found %d duplicates - deleting identical copies", len(objs))
			// Provably identical copies need no user decision - delete
			// them first and keep only the distinct remainder.
			objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
			if len(objs) <= 1 {
				fs.Logf(remote, "All duplicates removed")
				continue
			}
			// More than one distinct copy remains - resolve per mode
			switch mode {
			case DeduplicateInteractive:
				dedupeInteractive(ctx, f, ht, remote, objs)
			case DeduplicateFirst:
				dedupeDeleteAllButOne(ctx, 0, remote, objs)
			case DeduplicateNewest:
				sortOldestFirst(objs)
				dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
			case DeduplicateOldest:
				sortOldestFirst(objs)
				dedupeDeleteAllButOne(ctx, 0, remote, objs)
			case DeduplicateRename:
				dedupeRename(ctx, f, remote, objs)
			case DeduplicateLargest:
				sortSmallestFirst(objs)
				dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
			case DeduplicateSmallest:
				sortSmallestFirst(objs)
				dedupeDeleteAllButOne(ctx, 0, remote, objs)
			case DeduplicateSkip:
				// skip
			default:
				//skip
			}
		}
	}
	return nil
}