github.com/anishathalye/periscope@v0.3.5/internal/periscope/scan.go

package periscope

import (
	"github.com/anishathalye/periscope/internal/db"
	"github.com/anishathalye/periscope/internal/herror"
	"github.com/anishathalye/periscope/internal/par"

	"encoding/binary"
	"log"
	"os"

	"github.com/spf13/afero"
)

type ScanOptions struct {
	Minimum int64
	Maximum int64
}

func (ps *Periscope) Scan(paths []string, options *ScanOptions) herror.Interface {
	// check that paths exist before starting any work
	absPaths := make([]string, len(paths))
	for i, path := range paths {
		abs, _, err := ps.checkFile(path, false, true, "scan", false, true)
		if err != nil {
			return err
		}
		absPaths[i] = abs
	}
	dupes, done := ps.findDuplicates(absPaths, options)
	tx, err := ps.db.Begin()
	if err != nil {
		return err
	}
	// remove previously scanned files in paths we are now searching
	for _, path := range absPaths {
		err := tx.RemoveDir(path, options.Minimum, options.Maximum)
		if err != nil {
			return err
		}
	}
	// add all the new things we've found
	for info := range dupes {
		err := tx.Add(info.(db.FileInfo))
		if err != nil {
			return err
		}
	}
	// create indexes if they don't exist already
	err = tx.CreateIndexes()
	if err != nil {
		return err
	}
	if err = tx.Commit(); err != nil {
		return err
	}
	done()
	return nil
}
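// Usage sketch (illustrative only, not part of the original file): a caller
// that already holds a *Periscope could invoke Scan like this; the scanHome
// helper and the 4 KiB threshold are hypothetical:
//
//	func scanHome(ps *Periscope) herror.Interface {
//		// scan everything under /home strictly larger than 4 KiB,
//		// with no upper size bound (Maximum == 0 means unbounded)
//		return ps.Scan([]string{"/home"}, &ScanOptions{Minimum: 4096, Maximum: 0})
//	}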
// we use this to avoid database writes; findFilesBySize finds files in the
// directories to be scanned, and it also looks up relevant files to consider
// from the database; we want to add newly scanned files to the database
// regardless, but for infos that were already there, we only want to write to
// the database if the info has been updated (we've computed a hash that
// wasn't there before)
type searchResult struct {
	info db.FileInfo
	old  bool
}

// the return value also includes the relevant entries already in the DB
//
// we do this here so that there are no db reads in the rest of findDuplicates,
// so we can do a streaming write into the db without concurrent reads
func (ps *Periscope) findFilesBySize(paths []string, options *ScanOptions) (map[int64][]searchResult, int) {
	sizeToInfos := make(map[int64][]searchResult)
	files := 0

	bar := ps.progressBar(0, `searching: {{ counters . }} files {{ etime . }} `)

	for _, root := range paths {
		err := afero.Walk(ps.fs, root, func(path string, info os.FileInfo, err error) error {
			if err != nil {
				log.Printf("%s", err)
				return nil
			}
			if !info.Mode().IsRegular() {
				return nil
			}
			size := info.Size()
			if size > options.Minimum && (options.Maximum == 0 || size <= options.Maximum) {
				if len(sizeToInfos[size]) == 0 {
					// find all relevant files from the database, skipping the
					// ones that are included in paths; we only do this once per
					// size (after we've seen a particular size, the [size] key
					// will contain at least one element, so we won't re-do this)
					if known, err := ps.db.InfosBySize(size); err == nil {
						for _, k := range known {
							if !containedInAny(k.Path, paths) {
								sizeToInfos[size] = append(sizeToInfos[size], searchResult{info: k, old: true})
								files++
							}
						}
					}
				}
				sizeToInfos[size] = append(sizeToInfos[size], searchResult{
					info: db.FileInfo{
						Path:      path,
						Size:      size,
						ShortHash: nil,
						FullHash:  nil,
					},
					old: false,
				})
				files++
				bar.Increment()
			}
			return nil
		})
		if err != nil {
			log.Printf("Walk() returned error: %s", err)
		}
	}
	bar.Finish()
	return sizeToInfos, files
}
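// Shape of the result (an illustrative sketch, not actual program output):
// scanning /data, which holds two 1 MiB files, when the database already
// knows about a third 1 MiB file outside /data, could yield something like
// the following (paths and hash values hypothetical):
//
//	map[int64][]searchResult{
//		1048576: {
//			{info: db.FileInfo{Path: "/backup/a", Size: 1048576, ShortHash: []byte{...}}, old: true},
//			{info: db.FileInfo{Path: "/data/a", Size: 1048576}, old: false},
//			{info: db.FileInfo{Path: "/data/b", Size: 1048576}, old: false},
//		},
//	}
//
// together with a file count of 3; old entries keep whatever hashes the
// database already has, while newly discovered files start with nil hashes.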
// searchPaths consists of absolute paths with no symlinks
func (ps *Periscope) findDuplicates(searchPaths []string, options *ScanOptions) (<-chan interface{}, func()) {
	sizeToInfos, files := ps.findFilesBySize(searchPaths, options)

	bar := ps.progressBar(files, `analyzing: {{ counters . }} {{ bar . "[" "=" ">" " " "]" }} {{ etime . }} {{ rtime . "ETA %s" "%.0s" " " }} `)
	done := func() {
		bar.Finish()
	}

	return par.MapN(sizeToInfos, scanThreads, func(k, v interface{}, emit func(x interface{})) {
		size := k.(int64)
		searchResults := v.([]searchResult)

		// files may appear multiple times if the same directory is repeated
		// in the arguments to scan; skip those
		seen := make(map[string]struct{})
		var infos []db.FileInfo
		// have we updated the data for this path (computed a new hash)? if so,
		// we will write the relevant info to the database
		var updated []bool // has infos[i] been updated?
		for _, result := range searchResults {
			path := result.info.Path
			if _, ok := seen[path]; ok {
				bar.Add(1) // no more work to do for skipped search results
				continue
			}
			seen[path] = struct{}{}
			infos = append(infos, result.info)
			// new files always need to be written to the database
			updated = append(updated, !result.old)
		}

		// if there's only one file with this size, we don't need to do any hashing
		if len(infos) == 1 {
			// this check should always be true: a size bucket only exists
			// because we saw at least one newly scanned file of that size
			if updated[0] {
				emit(infos[0])
			}
			bar.Add(1)
			return
		}

		// compute short hashes for all files (skipping the ones where
		// we already have short hashes), bucketing results by short hash
		szBuf := make([]byte, 8)
		binary.LittleEndian.PutUint64(szBuf, uint64(size))
		byShortHash := make(map[[ShortHashSize]byte][]int) // indices into infos array
		for i := range infos {
			info := &infos[i]
			// compute short hash if necessary
			if info.ShortHash == nil {
				// key by size to get unique short hashes, so we can use them as global identifiers
				hash, err := ps.hashPartial(info.Path, szBuf)
				if err != nil {
					log.Printf("hashPartial() returned error: %s", err)
					bar.Add(1) // ignored; no more work to do for this file
					continue   // ignore this file
				}
				info.ShortHash = hash
				updated[i] = true
			}
			hashArr := shortHashToArray(info.ShortHash)
			byShortHash[hashArr] = append(byShortHash[hashArr], i)
		}

		// wherever there is > 1 file in a bucket, compute the full
		// hashes (skipping the ones where we already have full hashes)
		for _, indices := range byShortHash {
			if len(indices) <= 1 {
				// no need to compute full hash
				bar.Add(len(indices)) // no more work to do for these
				continue
			}
			// collision on short hash; hash the full file
			for _, index := range indices {
				info := &infos[index]
				if info.FullHash == nil {
					hash, err := ps.hashFile(info.Path)
					if err != nil {
						log.Printf("hashFile() returned error: %s", err)
						bar.Add(1) // ignored; no more work to do for this file
						continue   // ignore this file
					}
					info.FullHash = hash
					updated[index] = true
				}
				bar.Add(1)
			}
		}

		// emit all files for which we've done some work, where there is new
		// info to save to the database
		for i, info := range infos {
			if updated[i] {
				emit(info)
			}
		}
	}), done
}
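// Note on par.MapN (inferred from its usage above; this is an assumption, not
// the package's documented contract): it appears to run the given function
// over every key/value pair of the map on a fixed number of worker
// goroutines, stream every value passed to emit on the returned channel, and
// close that channel once all workers finish, which is what lets Scan range
// over dupes until the scan completes. A minimal sketch of a helper with that
// contract, specialized to this map type for illustration (it would also need
// the "sync" import):
//
//	func mapN(m map[int64][]searchResult, n int, f func(k int64, v []searchResult, emit func(interface{}))) <-chan interface{} {
//		out := make(chan interface{})
//		type kv struct {
//			k int64
//			v []searchResult
//		}
//		work := make(chan kv)
//		var wg sync.WaitGroup
//		wg.Add(n)
//		for i := 0; i < n; i++ {
//			go func() {
//				defer wg.Done()
//				for w := range work {
//					f(w.k, w.v, func(x interface{}) { out <- x })
//				}
//			}()
//		}
//		go func() {
//			// feed all buckets to the workers, then shut everything down
//			for k, v := range m {
//				work <- kv{k, v}
//			}
//			close(work)
//			wg.Wait()
//			close(out)
//		}()
//		return out
//	}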