github.com/shivakar/gdupes@v0.0.0-20180726052558-d5c070c306d0/gdupes/process.go (about) 1 package gdupes 2 3 import ( 4 "io" 5 "os" 6 "strings" 7 "sync" 8 9 "github.com/shivakar/xxhash" 10 ) 11 12 // hashFile returns hexadecimal string output of xxhash64 for the given file 13 func hashFile(filepath string) string { 14 h := xxhash.NewXXHash64() 15 f, err := os.Open(filepath) 16 if err != nil { 17 panic(err) 18 } 19 b := make([]byte, 1024*1024) // 1 MB buffer 20 for { 21 n, err := f.Read(b) 22 23 if n > 0 { 24 h.Write(b[:n]) 25 } 26 if err == io.EOF { 27 break 28 } 29 } 30 return h.String() 31 } 32 33 // includeFileInOutput returns true if the file should be included as per 34 // the given configuration 35 func includeFileInOutput(c *Config, fm FileMeta, 36 fms FileMetaSlice) (bool, int, error) { 37 if !c.Hardlinks { 38 // Not treating hardlinks as duplicates 39 inc, idx, err := fms.ContainsInode(fm) 40 return !inc, idx, err 41 } 42 return true, -1, nil 43 } 44 45 // ProcessFiles computes hashes and updates map of hashes for files to be 46 // processed 47 func ProcessFiles(c *Config, filesToProcess <-chan string, 48 fileHashes map[string]FileMetaSlice, 49 lock *sync.Mutex, wg *sync.WaitGroup) { 50 defer wg.Done() 51 for f := range filesToProcess { 52 h := hashFile(f) 53 info, err := os.Stat(f) 54 if err != nil { 55 panic(err) 56 } 57 fm := FileMeta{Path: f, Info: info} 58 lock.Lock() 59 _, ok := fileHashes[h] 60 if !ok { 61 fileHashes[h] = []FileMeta{fm} 62 } else { 63 inc, idx, err := includeFileInOutput(c, fm, fileHashes[h]) 64 if err != nil { 65 panic(err) 66 } 67 if !inc { 68 // Check if this file has a longer filename. This is to ensure 69 // that regardless of what order hardlinks are processed in, you 70 // always get the same result 71 if strings.Compare(fm.Path, fileHashes[h][idx].Path) == 1 { 72 fileHashes[h][idx] = fm 73 } 74 lock.Unlock() 75 continue 76 } 77 fileHashes[h] = append(fileHashes[h], fm) 78 } 79 lock.Unlock() 80 } 81 }