github.com/rclone/rclone@v1.66.1-0.20240517100346-7b89735ae726/fs/operations/dedupe.go

// dedupe - gets rid of identical files on remotes which can have duplicate file names (drive, mega)

package operations

import (
	"context"
	"fmt"
	"log"
	"path"
	"sort"
	"strings"

	"github.com/rclone/rclone/fs"
	"github.com/rclone/rclone/fs/accounting"
	"github.com/rclone/rclone/fs/config"
	"github.com/rclone/rclone/fs/hash"
	"github.com/rclone/rclone/fs/walk"
)

// dedupeRename renames the objs slice to different names
func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) {
	doMove := f.Features().Move
	if doMove == nil {
		log.Fatalf("Fs %v doesn't support Move", f)
	}
	ext := path.Ext(remote)
	base := remote[:len(remote)-len(ext)]

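	// Probe for an unused name of the form base-N.ext for each object
	// (e.g. duplicates of one.txt become one-1.txt, one-2.txt, ...):
	// keep asking the remote via NewObject until it reports
	// fs.ErrorObjectNotFound, giving up on an object after about 100
	// attempts or on any other lookup error.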
outer:
	for i, o := range objs {
		suffix := 1
		newName := fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
		_, err := f.NewObject(ctx, newName)
		for ; err != fs.ErrorObjectNotFound; suffix++ {
			if err != nil {
				err = fs.CountError(err)
				fs.Errorf(o, "Failed to check for existing object: %v", err)
				continue outer
			}
			if suffix > 100 {
				fs.Errorf(o, "Could not find an available new name")
				continue outer
			}
			newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
			_, err = f.NewObject(ctx, newName)
		}
		if !SkipDestructive(ctx, o, "rename") {
			newObj, err := doMove(ctx, o, newName)
			if err != nil {
				err = fs.CountError(err)
				fs.Errorf(o, "Failed to rename: %v", err)
				continue
			}
			fs.Infof(newObj, "renamed from: %v", o)
		}
	}
}

// dedupeDeleteAllButOne deletes all but the one in keep
func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) {
	count := 0
	for i, o := range objs {
		if i == keep {
			continue
		}
		err := DeleteFile(ctx, o)
		if err == nil {
			count++
		}
	}
	if count > 0 {
		fs.Logf(remote, "Deleted %d extra copies", count)
	}
}

// dedupeDeleteIdentical deletes all but one of identical (by hash) copies
func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) {
	ci := fs.GetConfig(ctx)

	// Make map of IDs
	IDs := make(map[string]int, len(objs))
	for _, o := range objs {
		if do, ok := o.(fs.IDer); ok {
			if ID := do.ID(); ID != "" {
				IDs[ID]++
			}
		}
	}

	// Remove duplicate IDs
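	// An ID that appears more than once here is the same underlying
	// file listed several times, not a set of real duplicates;
	// deleting its "copies" would delete the only copy, so such
	// objects are dropped from consideration.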
	newObjs := objs[:0]
	for _, o := range objs {
		if do, ok := o.(fs.IDer); ok {
			if ID := do.ID(); ID != "" {
				if IDs[ID] <= 1 {
					newObjs = append(newObjs, o)
				} else {
					fs.Logf(o, "Ignoring as it appears %d times in the listing and deleting would lead to data loss", IDs[ID])
				}
			}
		}
	}
	objs = newObjs

	// See how many of these duplicates are identical
	dupesByID := make(map[string][]fs.Object, len(objs))
	for _, o := range objs {
		ID := ""
		if ci.SizeOnly && o.Size() >= 0 {
			ID = fmt.Sprintf("size %d", o.Size())
		} else if ht != hash.None {
			hashValue, err := o.Hash(ctx, ht)
			if err == nil && hashValue != "" {
				ID = fmt.Sprintf("%v %s", ht, hashValue)
			}
		}
		if ID == "" {
			remainingObjs = append(remainingObjs, o)
		} else {
			dupesByID[ID] = append(dupesByID[ID], o)
		}
	}

	// Delete identical duplicates, filling remainingObjs with the ones remaining
	for ID, dupes := range dupesByID {
		remainingObjs = append(remainingObjs, dupes[0])
		if len(dupes) > 1 {
			fs.Logf(remote, "Deleting %d/%d identical duplicates (%s)", len(dupes)-1, len(dupes), ID)
			for _, o := range dupes[1:] {
				err := DeleteFile(ctx, o)
				if err != nil {
					remainingObjs = append(remainingObjs, o)
				}
			}
		}
	}

	return remainingObjs
}

// dedupeList lists the duplicates and does nothing
func dedupeList(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) {
	fmt.Printf("%s: %d duplicates\n", remote, len(objs))
	for i, o := range objs {
		hashValue := ""
		if ht != hash.None {
			var err error
			hashValue, err = o.Hash(ctx, ht)
			if err != nil {
				hashValue = err.Error()
			}
		}
		if byHash {
			fmt.Printf("  %d: %12d bytes, %s, %s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), o.Remote())
		} else {
			fmt.Printf("  %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
		}
	}
}

// dedupeInteractive interactively dedupes the slice of objects
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) bool {
	dedupeList(ctx, f, ht, remote, objs, byHash)
	commands := []string{"sSkip and do nothing", "kKeep just one (choose which in next step)"}
	if !byHash {
		commands = append(commands, "rRename all to be different (by changing file.jpg to file-1.jpg)")
	}
	commands = append(commands, "qQuit")
	switch config.Command(commands) {
	case 's':
	case 'k':
		keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
		dedupeDeleteAllButOne(ctx, keep-1, remote, objs)
	case 'r':
		dedupeRename(ctx, f, remote, objs)
	case 'q':
		return false
	}
	return true
}

// DeduplicateMode is how the dedupe command chooses what to do
type DeduplicateMode int

// Deduplicate modes
const (
	DeduplicateInteractive DeduplicateMode = iota // interactively ask the user
	DeduplicateSkip                               // skip all conflicts
	DeduplicateFirst                              // choose the first object
	DeduplicateNewest                             // choose the newest object
	DeduplicateOldest                             // choose the oldest object
	DeduplicateRename                             // rename the objects
	DeduplicateLargest                            // choose the largest object
	DeduplicateSmallest                           // choose the smallest object
	DeduplicateList                               // list duplicates only
)
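
// Note that DeduplicateInteractive is the zero value of DeduplicateMode,
// so an unset mode defaults to asking the user interactively.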

func (x DeduplicateMode) String() string {
	switch x {
	case DeduplicateInteractive:
		return "interactive"
	case DeduplicateSkip:
		return "skip"
	case DeduplicateFirst:
		return "first"
	case DeduplicateNewest:
		return "newest"
	case DeduplicateOldest:
		return "oldest"
	case DeduplicateRename:
		return "rename"
	case DeduplicateLargest:
		return "largest"
	case DeduplicateSmallest:
		return "smallest"
	case DeduplicateList:
		return "list"
	}
	return "unknown"
}

// Set a DeduplicateMode from a string
func (x *DeduplicateMode) Set(s string) error {
	switch strings.ToLower(s) {
	case "interactive":
		*x = DeduplicateInteractive
	case "skip":
		*x = DeduplicateSkip
	case "first":
		*x = DeduplicateFirst
	case "newest":
		*x = DeduplicateNewest
	case "oldest":
		*x = DeduplicateOldest
	case "rename":
		*x = DeduplicateRename
	case "largest":
		*x = DeduplicateLargest
	case "smallest":
		*x = DeduplicateSmallest
	case "list":
		*x = DeduplicateList
	default:
		return fmt.Errorf("unknown mode for dedupe %q", s)
	}
	return nil
}

// Type of the value
func (x *DeduplicateMode) Type() string {
	return "string"
}
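
// String, Set and Type together mean that *DeduplicateMode satisfies the
// pflag.Value interface, so rclone's command layer can bind a mode
// directly to the --dedupe-mode flag (that wiring lives outside this
// file).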

// Directory with entry count and links to parents
type dedupeDir struct {
	dir    fs.Directory
	parent string
	count  int
}

// Map of directories by ID with recursive counts
type dedupeDirsMap map[string]*dedupeDir

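// get returns the dedupeDir for id, adding an empty one to the map if it
// doesn't exist yet.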
func (dm dedupeDirsMap) get(id string) *dedupeDir {
	d := dm[id]
	if d == nil {
		d = &dedupeDir{}
		dm[id] = d
	}
	return d
}

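// increment adds one to the recursive entry count of parent and each of
// its ancestors in turn.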
func (dm dedupeDirsMap) increment(parent string) {
	if parent != "" {
		d := dm.get(parent)
		d.count++
		dm.increment(d.parent)
	}
}

// dedupeFindDuplicateDirs scans f for duplicate directories
func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) (duplicateDirs [][]*dedupeDir, err error) {
	dirsByID := dedupeDirsMap{}
	dirs := map[string][]*dedupeDir{}

	ci := fs.GetConfig(ctx)
	err = walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListAll, func(entries fs.DirEntries) error {
		for _, entry := range entries {
			tr := accounting.Stats(ctx).NewCheckingTransfer(entry, "merging")

			remote := entry.Remote()
			parentRemote := path.Dir(remote)
			if parentRemote == "." {
				parentRemote = ""
			}

			// Obtain the ID of the entry's parent, if known.
			// (This usually means that the backend allows duplicate paths)
			// Fall back to the remote parent path, if unavailable.
			var parent string
			if entryParentIDer, ok := entry.(fs.ParentIDer); ok {
				parent = entryParentIDer.ParentID()
			}
			if parent == "" {
				parent = parentRemote
			}

			var ID string
			if entryIDer, ok := entry.(fs.IDer); ok {
				ID = entryIDer.ID()
			}
			if ID == "" {
				ID = remote
			}

			if fsDir, ok := entry.(fs.Directory); ok {
				d := dirsByID.get(ID)
				d.dir = fsDir
				d.parent = parent
				dirs[remote] = append(dirs[remote], d)
			}

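			// Count this entry against its parent and all ancestors so
			// that every directory gets a recursive entry count;
			// dedupeMergeDuplicateDirs later uses these counts to pick
			// the merge target.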
			dirsByID.increment(parent)
			tr.Done(ctx, nil)
		}
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("find duplicate dirs: %w", err)
	}

	// Make sure parents are before children
	duplicateNames := []string{}
	for name, ds := range dirs {
		if len(ds) > 1 {
			duplicateNames = append(duplicateNames, name)
		}
	}
	sort.Strings(duplicateNames)
	for _, name := range duplicateNames {
		duplicateDirs = append(duplicateDirs, dirs[name])
	}

	return
}

// dedupeMergeDuplicateDirs merges all the duplicate directories found
func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]*dedupeDir) error {
	mergeDirs := f.Features().MergeDirs
	if mergeDirs == nil {
		return fmt.Errorf("%v: can't merge directories", f)
	}
	dirCacheFlush := f.Features().DirCacheFlush
	if dirCacheFlush == nil {
		return fmt.Errorf("%v: can't flush dir cache", f)
	}
	for _, dedupeDirs := range duplicateDirs {
		if SkipDestructive(ctx, dedupeDirs[0].dir, "merge duplicate directories") {
			continue
		}

		// Put largest directory in front to minimize movements
		fsDirs := []fs.Directory{}
		largestCount := -1
		largestIdx := 0
		for i, d := range dedupeDirs {
			fsDirs = append(fsDirs, d.dir)
			if d.count > largestCount {
				largestIdx = i
				largestCount = d.count
			}
		}
		fsDirs[largestIdx], fsDirs[0] = fsDirs[0], fsDirs[largestIdx]
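		// MergeDirs merges the contents of all the passed directories
		// into the first one, so fsDirs[0] is the directory that
		// survives.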

		fs.Infof(fsDirs[0], "Merging contents of duplicate directories")
		err := mergeDirs(ctx, fsDirs)
		if err != nil {
			err = fs.CountError(err)
			fs.Errorf(nil, "merge duplicate dirs: %v", err)
		}
	}
	dirCacheFlush()
	return nil
}

// sort oldest first
func sortOldestFirst(objs []fs.Object) {
	sort.Slice(objs, func(i, j int) bool {
		return objs[i].ModTime(context.TODO()).Before(objs[j].ModTime(context.TODO()))
	})
}

// sort smallest first
func sortSmallestFirst(objs []fs.Object) {
	sort.Slice(objs, func(i, j int) bool {
		return objs[i].Size() < objs[j].Size()
	})
}

// Deduplicate interactively finds duplicate files and offers to
// delete all but one or rename them to be different. Only useful with
// Google Drive, which can have duplicate file names.
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error {
	ci := fs.GetConfig(ctx)
	// find a hash to use
	ht := f.Hashes().GetOne()
	what := "names"
	if byHash {
		if ht == hash.None {
			return fmt.Errorf("%v has no hashes", f)
		}
		what = ht.String() + " hashes"
	}
	fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode)

	// Find duplicate directories first and fix them
	if !byHash {
		duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
		if err != nil {
			return err
		}
		if len(duplicateDirs) > 0 {
			if mode != DeduplicateList {
				err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
				if err != nil {
					return err
				}
			} else {
				for _, dedupeDirs := range duplicateDirs {
					remote := dedupeDirs[0].dir.Remote()
					fmt.Printf("%s: %d duplicates of this directory\n", remote, len(dedupeDirs))
				}
			}
		}
	}

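	// Any duplicate directories have been merged above, so files that
	// were split across them will now show up together in the listing
	// below.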
	// Now find duplicate files
	files := map[string][]fs.Object{}
	err := walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
		entries.ForObject(func(o fs.Object) {
			tr := accounting.Stats(ctx).NewCheckingTransfer(o, "checking")
			defer tr.Done(ctx, nil)

			var remote string
			var err error
			if byHash {
				remote, err = o.Hash(ctx, ht)
				if err != nil {
					fs.Errorf(o, "Failed to hash: %v", err)
					remote = ""
				}
			} else {
				remote = o.Remote()
			}
			if remote != "" {
				files[remote] = append(files[remote], o)
			}
		})
		return nil
	})
	if err != nil {
		return err
	}

	for remote, objs := range files {
		if len(objs) <= 1 {
			continue
		}
		fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
		if !byHash && mode != DeduplicateList {
			objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
			if len(objs) <= 1 {
				fs.Logf(remote, "All duplicates removed")
				continue
			}
		}
		switch mode {
		case DeduplicateInteractive:
			if !dedupeInteractive(ctx, f, ht, remote, objs, byHash) {
				return nil
			}
		case DeduplicateFirst:
			dedupeDeleteAllButOne(ctx, 0, remote, objs)
		case DeduplicateNewest:
			sortOldestFirst(objs)
			dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
		case DeduplicateOldest:
			sortOldestFirst(objs)
			dedupeDeleteAllButOne(ctx, 0, remote, objs)
		case DeduplicateRename:
			dedupeRename(ctx, f, remote, objs)
		case DeduplicateLargest:
			sortSmallestFirst(objs)
			dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
		case DeduplicateSmallest:
			sortSmallestFirst(objs)
			dedupeDeleteAllButOne(ctx, 0, remote, objs)
		case DeduplicateSkip:
			fs.Logf(remote, "Skipping %d files with duplicate %s", len(objs), what)
		case DeduplicateList:
			dedupeList(ctx, f, ht, remote, objs, byHash)
		default:
			// skip
		}
	}
	return nil
}
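
// A minimal usage sketch (assuming an fs.Fs called f that has already been
// created, e.g. with fs.NewFs; this roughly mirrors what the dedupe
// command does and is illustrative only):
//
//	ctx := context.Background()
//	if err := Deduplicate(ctx, f, DeduplicateNewest, false); err != nil {
//		fs.Errorf(nil, "dedupe failed: %v", err)
//	}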