github.com/lkarlslund/stringdedup@v0.6.2/example/main.go (about)

     1  package main
     2  
     3  import (
     4  	"fmt"
     5  	"os"
     6  	"path/filepath"
     7  	"runtime"
     8  	"time"
     9  	"unsafe"
    10  
    11  	"github.com/OneOfOne/xxhash"
    12  	"github.com/lkarlslund/stringdedup"
    13  )
    14  
    15  type fileinfo struct {
    16  	folder, basename, extension string
    17  }
    18  
    19  var files, files2 []fileinfo
    20  
    21  func main() {
    22  	fmt.Println("String deduplication demonstration")
    23  	fmt.Println("---")
    24  
    25  	d := stringdedup.New(func(in []byte) uint32 {
    26  		return xxhash.Checksum32(in)
    27  	})
    28  
    29  	var memstats runtime.MemStats
    30  
    31  	runtime.ReadMemStats(&memstats)
    32  	fmt.Printf("Initial memory usage at start of program: %v objects, consuming %v bytes\n", memstats.HeapObjects, memstats.HeapInuse)
    33  	fmt.Println("---")
    34  
    35  	searchDir := "/usr"
    36  	if runtime.GOOS == "windows" {
    37  		searchDir = "c:/windows"
    38  	}
    39  
    40  	fmt.Printf("Scanning and indexing files in %v - hang on ...\n", searchDir)
    41  
    42  	filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error {
    43  		if !f.IsDir() {
    44  			folder := filepath.Dir(path)
    45  			extension := filepath.Ext(path)
    46  			basename := filepath.Base(path)
    47  			basename = basename[:len(basename)-len(extension)]
    48  			files = append(files, fileinfo{
    49  				folder:    folder,
    50  				basename:  extension,
    51  				extension: basename,
    52  			})
    53  		}
    54  		return nil
    55  	})
    56  
    57  	fmt.Println("Scanning done!")
    58  	fmt.Println("---")
    59  
    60  	runtime.GC()                       // Let garbage collector run, and see memory usage
    61  	time.Sleep(time.Millisecond * 100) // Settle down
    62  	runtime.ReadMemStats(&memstats)
    63  	fmt.Printf("Memory usage for %v fileinfo: %v object, consuming %v bytes\n", len(files), memstats.HeapObjects, memstats.HeapInuse)
    64  
    65  	undedupbytes := memstats.HeapInuse
    66  
    67  	fmt.Printf("Slice reference costs %v x %v bytes - a total of %v bytes\n", len(files), unsafe.Sizeof(fileinfo{}), len(files)*int(unsafe.Sizeof(fileinfo{})))
    68  
    69  	checksum := xxhash.New64()
    70  	for _, fi := range files {
    71  		checksum.Write([]byte(fi.folder + fi.basename + fi.extension))
    72  	}
    73  	csum := checksum.Sum64()
    74  	fmt.Printf("Validation checksum on non deduped files is %x\n", csum)
    75  	fmt.Println("---")
    76  
    77  	// NON DEDUPLICATED STATISTICS END
    78  
    79  	// A new batch of fileinfo
    80  	files2 = make([]fileinfo, len(files), cap(files))
    81  
    82  	// Lets try that again with deduplication
    83  	for i, fi := range files {
    84  		files2[i] = fileinfo{
    85  			folder:    d.S(fi.folder),
    86  			basename:  d.S(fi.basename),
    87  			extension: d.S(fi.extension),
    88  		}
    89  	}
    90  
    91  	runtime.ReadMemStats(&memstats)
    92  	fmt.Println("Both a duplicated and non-deduplicated slice is now in memory")
    93  	fmt.Printf("Double allocated memory usage for %v fileinfo: %v objects, consuming %v bytes\n", len(files2), memstats.HeapObjects, memstats.HeapInuse)
    94  
    95  	// Let garbage collector run, and see memory usage
    96  	runtime.KeepAlive(files)
    97  	files = nil
    98  	runtime.GC()
    99  	time.Sleep(time.Millisecond * 1000)
   100  
   101  	runtime.ReadMemStats(&memstats)
   102  	fmt.Println("---")
   103  	fmt.Printf("Dedup memory usage for %v fileinfo: %v objects, consuming %v bytes\n", len(files2), memstats.HeapObjects, memstats.HeapInuse)
   104  
   105  	dedupbytes := memstats.HeapInuse
   106  	fmt.Printf("Reduction in memory usage: %.2f\n", float32(dedupbytes)/float32(undedupbytes))
   107  
   108  	// Drop indexes and let's see
   109  	d.Flush()
   110  	runtime.GC()
   111  	time.Sleep(time.Millisecond * 1000)
   112  
   113  	runtime.ReadMemStats(&memstats)
   114  	fmt.Println("---")
   115  	fmt.Printf("Flushed index memory usage: %v object, consuming %v bytes\n", memstats.HeapObjects, memstats.HeapInuse)
   116  	fmt.Printf("Reduction in memory usage (after dropping indexes): %.2f\n", float32(memstats.HeapInuse)/float32(undedupbytes))
   117  
   118  	// Validate that deduped files are the same as non deduped files
   119  	checksum = xxhash.New64()
   120  	for _, fi := range files2 {
   121  		checksum.Write([]byte(fi.folder + fi.basename + fi.extension))
   122  	}
   123  	fmt.Println("---")
   124  	csum2 := checksum.Sum64()
   125  	fmt.Printf("Validation on dedup strings checksum is %x\n", csum2)
   126  	checksum = nil
   127  
   128  	if csum != csum2 {
   129  		fmt.Println("!!! VALIDATION FAILED. DEDUPED STRINGS ARE NOT THE SAME AS NON DEDUPED STRINGS !!!")
   130  	}
   131  
   132  	var bytes int
   133  	for _, file := range files2 {
   134  		bytes += len(file.basename) + len(file.extension) + len(file.folder)
   135  	}
   136  
   137  	// Let garbage collector run, and see memory usage
   138  	// Clean up stuff left by finalizers
   139  	files2 = nil
   140  	runtime.GC()
   141  	time.Sleep(time.Millisecond * 100)
   142  	runtime.GC()
   143  
   144  	runtime.ReadMemStats(&memstats)
   145  	fmt.Println("---")
   146  	fmt.Printf("Cleared memory usage: %v object, consuming %v bytes\n", memstats.HeapObjects, memstats.HeapInuse)
   147  }
   148  
   149  // func printmemstats("")