github.com/anishathalye/periscope@v0.3.5/internal/periscope/scan.go

package periscope

import (
	"github.com/anishathalye/periscope/internal/db"
	"github.com/anishathalye/periscope/internal/herror"
	"github.com/anishathalye/periscope/internal/par"

	"encoding/binary"
	"log"
	"os"

	"github.com/spf13/afero"
)

// ScanOptions bounds the sizes of files that a scan considers: Minimum is an
// exclusive lower bound in bytes, and Maximum is an inclusive upper bound,
// where 0 means no upper limit.
type ScanOptions struct {
	Minimum int64
	Maximum int64
}
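
// For example (illustrative values), &ScanOptions{Minimum: 0, Maximum: 0}
// considers every regular file larger than 0 bytes with no upper bound, while
// &ScanOptions{Minimum: 4096, Maximum: 1 << 20} restricts a scan to files
// larger than 4 KiB and at most 1 MiB.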

func (ps *Periscope) Scan(paths []string, options *ScanOptions) herror.Interface {
	// check that paths exist before starting any work
	absPaths := make([]string, len(paths))
	for i, path := range paths {
		abs, _, err := ps.checkFile(path, false, true, "scan", false, true)
		if err != nil {
			return err
		}
		absPaths[i] = abs
	}
	// findDuplicates streams updated FileInfos on the dupes channel; done
	// finishes the progress bar and is called once the channel is drained
	dupes, done := ps.findDuplicates(absPaths, options)
	tx, err := ps.db.Begin()
	if err != nil {
		return err
	}
	// remove previously scanned files in paths we are now searching
	for _, path := range absPaths {
		err := tx.RemoveDir(path, options.Minimum, options.Maximum)
		if err != nil {
			return err
		}
	}
	// add all the new things we've found
	for info := range dupes {
		err := tx.Add(info.(db.FileInfo))
		if err != nil {
			return err
		}
	}
	// create indexes if they don't exist already
	err = tx.CreateIndexes()
	if err != nil {
		return err
	}
	if err = tx.Commit(); err != nil {
		return err
	}
	// all results have been consumed; stop the progress bar
	done()
	return nil
}
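
// A minimal usage sketch for Scan; the constructor and its arguments here are
// assumptions for illustration, not necessarily this package's API:
//
//	ps, _ := periscope.New(...)                               // hypothetical constructor
//	opts := &periscope.ScanOptions{Minimum: 4096, Maximum: 0} // 0 = no upper bound
//	if herr := ps.Scan([]string{"/data"}, opts); herr != nil {
//		log.Fatal(herr)
//	}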

// we use this to avoid database writes; findFilesBySize finds files in the
// directories to be scanned, and it also looks up relevant files to consider
// from the database; we want to add newly scanned files to the database
// regardless, but for infos that were already there, we only want to write to
// the database if the info has been updated (we've computed a hash that
// wasn't there before)
type searchResult struct {
	info db.FileInfo
	old  bool
}
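
// For example (illustrative names): a file already recorded in the database
// is carried as
//
//	searchResult{info: knownInfo, old: true} // only emitted if a new hash gets computed
//
// while a freshly walked file starts with nil hashes and is always written:
//
//	searchResult{info: walkedInfo, old: false}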

// the return value also includes the relevant entries already in the DB
//
// we do this here so that there are no db reads in the rest of findDuplicates,
// so we can do a streaming write into the db without concurrent reads
func (ps *Periscope) findFilesBySize(paths []string, options *ScanOptions) (map[int64][]searchResult, int) {
	sizeToInfos := make(map[int64][]searchResult)
	files := 0

	bar := ps.progressBar(0, `searching: {{ counters . }} files {{ etime . }} `)

	for _, root := range paths {
		err := afero.Walk(ps.fs, root, func(path string, info os.FileInfo, err error) error {
			if err != nil {
				log.Printf("%s", err)
				return nil
			}
			if !info.Mode().IsRegular() {
				return nil
			}
			size := info.Size()
			if size > options.Minimum && (options.Maximum == 0 || size <= options.Maximum) {
				if len(sizeToInfos[size]) == 0 {
					// find all relevant files from the database, skipping the
					// ones that are included in paths; we only do this once per
					// size (after we've seen a particular size, the [size] key
					// will contain at least one element, so we won't re-do this)
					if known, err := ps.db.InfosBySize(size); err == nil {
						for _, k := range known {
							if !containedInAny(k.Path, paths) {
								sizeToInfos[size] = append(sizeToInfos[size], searchResult{info: k, old: true})
								files++
							}
						}
					}
				}
				sizeToInfos[size] = append(sizeToInfos[size], searchResult{
					info: db.FileInfo{
						Path:      path,
						Size:      size,
						ShortHash: nil,
						FullHash:  nil,
					},
					old: false,
				})
				files++
				bar.Increment()
			}
			return nil
		})
		if err != nil {
			log.Printf("Walk() returned error: %s", err)
		}
	}
	bar.Finish()
	return sizeToInfos, files
}
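
// containedInAny (defined elsewhere in this package) reports whether a path
// lies under any of the given roots. A minimal sketch of such a predicate,
// assuming cleaned absolute paths (not necessarily the package's exact
// implementation):
//
//	func containedInAnySketch(path string, roots []string) bool {
//		for _, root := range roots {
//			if path == root || strings.HasPrefix(path, root+string(os.PathSeparator)) {
//				return true
//			}
//		}
//		return false
//	}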

// searchPaths contains absolute paths with no symlinks
func (ps *Periscope) findDuplicates(searchPaths []string, options *ScanOptions) (<-chan interface{}, func()) {
	sizeToInfos, files := ps.findFilesBySize(searchPaths, options)

	bar := ps.progressBar(files, `analyzing: {{ counters . }} {{ bar . "[" "=" ">" " " "]" }} {{ etime . }} {{ rtime . "ETA %s" "%.0s" " " }} `)
	done := func() {
		bar.Finish()
	}

	return par.MapN(sizeToInfos, scanThreads, func(k, v interface{}, emit func(x interface{})) {
		size := k.(int64)
		searchResults := v.([]searchResult)

		// files may appear multiple times, if the same directory is repeated in the
		// arguments to scan; skip those
		seen := make(map[string]struct{})
		var infos []db.FileInfo
		// have we updated the data for this path (computed a new hash)? if so, we will
		// write the relevant info to the database
		var updated []bool // has infos[i] been updated?
		for _, result := range searchResults {
			path := result.info.Path
			if _, ok := seen[path]; ok {
				bar.Add(1) // no more work to do for skipped search results
				continue
			}
			seen[path] = struct{}{}
			infos = append(infos, result.info)
			updated = append(updated, !result.old)
		}

		// if there's only one file with this size, we don't need to do any hashing
		if len(infos) == 1 {
			// the following check should always be true: a size bucket is only
			// created once a newly scanned file of that size is seen, and new
			// files are always marked updated
			if updated[0] {
				emit(infos[0])
			}
			bar.Add(1)
			return
		}

		// compute short hashes for all files (skipping the ones where
		// we already have short hashes), bucketing results by short hash
		szBuf := make([]byte, 8)
		binary.LittleEndian.PutUint64(szBuf, uint64(size))
		byShortHash := make(map[[ShortHashSize]byte][]int) // indices into infos array
		for i := range infos {
			info := &infos[i]
			// compute short hash if necessary
			if info.ShortHash == nil {
				// key by size to have unique short hashes, so we can use them as
				// global identifiers: szBuf (the size as 8 little-endian bytes)
				// is mixed into the partial hash
				hash, err := ps.hashPartial(info.Path, szBuf)
				if err != nil {
					log.Printf("hashPartial() returned error: %s", err)
					bar.Add(1) // ignored; no more work to do for this file
					continue   // ignore this file
				}
				info.ShortHash = hash
				updated[i] = true
			}
			hashArr := shortHashToArray(info.ShortHash)
			byShortHash[hashArr] = append(byShortHash[hashArr], i)
		}

		// wherever there is > 1 file in a bucket, compute the full
		// hashes (skipping the ones where we already have full hashes)
		for _, indices := range byShortHash {
			if len(indices) <= 1 {
				// no need to compute full hash
				bar.Add(len(indices)) // no more work to do for these
				continue
			}
			// collide on short hash; hash full file
			for _, index := range indices {
				info := &infos[index]
				if info.FullHash == nil {
					hash, err := ps.hashFile(info.Path)
					if err != nil {
						log.Printf("hashFile() returned error: %s", err)
						bar.Add(1) // ignored; no more work to do for this file
						continue   // ignore this file
					}
					info.FullHash = hash
					updated[index] = true
				}
				bar.Add(1)
			}
		}

		// emit all files where there is new info to save to the database
		for i, info := range infos {
			if updated[i] {
				emit(info)
			}
		}
	}), done
}
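
// par.MapN (from internal/par) is assumed to apply the given function to each
// key/value pair of the map from n goroutines, streaming every value passed
// to emit on the returned channel and closing the channel once all pairs are
// processed. A minimal sketch of that contract, specialized to this map type
// and assuming "sync" is imported (not necessarily the package's exact
// implementation):
//
//	func mapNSketch(m map[int64][]searchResult, n int, f func(k, v interface{}, emit func(interface{}))) <-chan interface{} {
//		out := make(chan interface{})
//		keys := make(chan int64)
//		var wg sync.WaitGroup
//		wg.Add(n)
//		for i := 0; i < n; i++ {
//			go func() {
//				defer wg.Done()
//				for k := range keys {
//					f(k, m[k], func(x interface{}) { out <- x })
//				}
//			}()
//		}
//		go func() {
//			for k := range m {
//				keys <- k
//			}
//			close(keys)
//			wg.Wait()
//			close(out)
//		}()
//		return out
//	}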