github.com/shivakar/gdupes@v0.0.0-20180726052558-d5c070c306d0/gdupes/process.go (about)

     1  package gdupes
     2  
     3  import (
     4  	"io"
     5  	"os"
     6  	"strings"
     7  	"sync"
     8  
     9  	"github.com/shivakar/xxhash"
    10  )
    11  
    12  // hashFile returns hexadecimal string output of xxhash64 for the given file
    13  func hashFile(filepath string) string {
    14  	h := xxhash.NewXXHash64()
    15  	f, err := os.Open(filepath)
    16  	if err != nil {
    17  		panic(err)
    18  	}
    19  	b := make([]byte, 1024*1024) // 1 MB buffer
    20  	for {
    21  		n, err := f.Read(b)
    22  
    23  		if n > 0 {
    24  			h.Write(b[:n])
    25  		}
    26  		if err == io.EOF {
    27  			break
    28  		}
    29  	}
    30  	return h.String()
    31  }
    32  
    33  // includeFileInOutput returns true if the file should be included as per
    34  // the given configuration
    35  func includeFileInOutput(c *Config, fm FileMeta,
    36  	fms FileMetaSlice) (bool, int, error) {
    37  	if !c.Hardlinks {
    38  		// Not treating hardlinks as duplicates
    39  		inc, idx, err := fms.ContainsInode(fm)
    40  		return !inc, idx, err
    41  	}
    42  	return true, -1, nil
    43  }
    44  
    45  // ProcessFiles computes hashes and updates map of hashes for files to be
    46  // processed
    47  func ProcessFiles(c *Config, filesToProcess <-chan string,
    48  	fileHashes map[string]FileMetaSlice,
    49  	lock *sync.Mutex, wg *sync.WaitGroup) {
    50  	defer wg.Done()
    51  	for f := range filesToProcess {
    52  		h := hashFile(f)
    53  		info, err := os.Stat(f)
    54  		if err != nil {
    55  			panic(err)
    56  		}
    57  		fm := FileMeta{Path: f, Info: info}
    58  		lock.Lock()
    59  		_, ok := fileHashes[h]
    60  		if !ok {
    61  			fileHashes[h] = []FileMeta{fm}
    62  		} else {
    63  			inc, idx, err := includeFileInOutput(c, fm, fileHashes[h])
    64  			if err != nil {
    65  				panic(err)
    66  			}
    67  			if !inc {
    68  				// Check if this file has a longer filename. This is to ensure
    69  				// that regardless of what order hardlinks are processed in, you
    70  				// always get the same result
    71  				if strings.Compare(fm.Path, fileHashes[h][idx].Path) == 1 {
    72  					fileHashes[h][idx] = fm
    73  				}
    74  				lock.Unlock()
    75  				continue
    76  			}
    77  			fileHashes[h] = append(fileHashes[h], fm)
    78  		}
    79  		lock.Unlock()
    80  	}
    81  }