github.com/lkarlslund/stringdedup@v0.6.2/sd.go (about)

     1  package stringdedup
     2  
     3  import (
     4  	"runtime"
     5  	"sync"
     6  	"sync/atomic"
     7  	"time"
     8  	"unsafe"
     9  
    10  	gsync "github.com/SaveTheRbtz/generic-sync-map-go"
    11  	_ "go4.org/unsafe/assume-no-moving-gc"
    12  )
    13  
    14  func New[hashtype comparable](hashfunc func(in []byte) hashtype) *stringDedup[hashtype] {
    15  	var sd stringDedup[hashtype]
    16  	sd.removefromthismap = generateFinalizerFunc(&sd)
    17  	sd.hashfunc = hashfunc
    18  	return &sd
    19  }
    20  
    21  type stringDedup[hashtype comparable] struct {
    22  	stats Statistics // Statistics moved to front to ensure 64-bit alignment even on 32-bit platforms (uses atomic to update)
    23  
    24  	pointermap gsync.MapOf[uintptr, hashtype]
    25  	hashmap    gsync.MapOf[hashtype, weakdata] // key is hash, value is weakdata entry containing pointer to start of string or byte slice *header* and length
    26  
    27  	// Let dedup object keep some strings 'alive' for a period of time
    28  	KeepAlive time.Duration
    29  
    30  	keepAliveSchedLock                    sync.Mutex
    31  	keepalivemap                          gsync.MapOf[string, time.Time]
    32  	keepaliveFlusher                      *time.Timer
    33  	keepaliveitems, keepaliveitemsremoved int64
    34  
    35  	hashfunc func([]byte) hashtype
    36  
    37  	removefromthismap finalizerFunc
    38  
    39  	flushing bool
    40  
    41  	// DontValidateResults skips collisions check in returned strings
    42  	DontValidateResults bool // Disable at your own peril, hash collisions will give you wrong strings back
    43  }
    44  
    45  type Statistics struct {
    46  	ItemsAdded,
    47  	BytesInMemory,
    48  	ItemsSaved,
    49  	BytesSaved,
    50  	ItemsRemoved,
    51  	Collisions,
    52  	FirstCollisionDetected,
    53  	KeepAliveItemsAdded,
    54  	KeepAliveItemsRemoved int64
    55  }
    56  
    57  // Size returns the number of deduplicated strings currently being tracked in memory
    58  func (sd *stringDedup[hashtype]) Size() int64 {
    59  	return atomic.LoadInt64(&sd.stats.ItemsAdded) - atomic.LoadInt64(&sd.stats.ItemsRemoved)
    60  }
    61  
    62  func (sd *stringDedup[hashtype]) Statistics() Statistics {
    63  	// Not thread safe
    64  	return sd.stats
    65  }
    66  
    67  // Flush clears all state information about deduplication
    68  func (sd *stringDedup[hashtype]) Flush() {
    69  	// Clear our data
    70  	sd.flushing = true
    71  
    72  	sd.pointermap.Range(func(pointer uintptr, hash hashtype) bool {
    73  		// Don't finalize, we don't care about it any more
    74  		runtime.SetFinalizer((*byte)(unsafe.Pointer(pointer)), nil)
    75  
    76  		sd.pointermap.Delete(pointer)
    77  		sd.hashmap.Delete(hash)
    78  
    79  		atomic.AddInt64(&sd.stats.ItemsRemoved, 1)
    80  		return true
    81  	})
    82  
    83  	// Get rid of any keepalives
    84  	sd.keepalivemap.Range(func(s string, t time.Time) bool {
    85  		sd.keepalivemap.Delete(s)
    86  		atomic.AddInt64(&sd.keepaliveitemsremoved, 1)
    87  		return true
    88  	})
    89  
    90  	sd.flushing = false
    91  }
    92  
    93  // BS takes a slice of bytes, and returns a copy of it as a deduplicated string
    94  func (sd *stringDedup[hashtype]) BS(in []byte) string {
    95  	str := castBytesToString(in) // NoCopy
    96  	return sd.S(str)
    97  }
    98  
    99  func (sd *stringDedup[hashtype]) S(in string) string {
   100  	if len(in) == 0 {
   101  		// Nothing to see here, move along now
   102  		return in
   103  	}
   104  
   105  	hash := sd.hashfunc(castStringToBytes(in))
   106  
   107  	ws, loaded := sd.hashmap.Load(hash)
   108  
   109  	if loaded {
   110  		atomic.AddInt64(&sd.stats.ItemsSaved, 1)
   111  		atomic.AddInt64(&sd.stats.BytesSaved, int64(ws.length))
   112  		out := ws.String()
   113  		if !sd.DontValidateResults && out != in {
   114  			atomic.CompareAndSwapInt64(&sd.stats.FirstCollisionDetected, 0, sd.Size())
   115  			atomic.AddInt64(&sd.stats.Collisions, 1)
   116  			return in // Collision
   117  		}
   118  		return out
   119  	}
   120  
   121  	// We might recieve a static non-dynamically allocated string, so we need to make a copy
   122  	// Can we detect this somehow and avoid it?
   123  	buf := make([]byte, len(in))
   124  	copy(buf, in)
   125  	str := castBytesToString(buf)
   126  	ws = weakString(str)
   127  
   128  	sd.hashmap.Store(hash, ws)
   129  	sd.pointermap.Store(ws.data, hash)
   130  
   131  	// We need to keep the string alive
   132  	if sd.KeepAlive > 0 {
   133  		sd.keepalivemap.Store(str, time.Now().Add(sd.KeepAlive))
   134  		atomic.AddInt64(&sd.keepaliveitems, 1)
   135  		// Naughty checking without locking
   136  		if sd.keepaliveFlusher == nil {
   137  			sd.keepAliveSchedLock.Lock()
   138  			if sd.keepaliveFlusher == nil {
   139  				sd.keepaliveFlusher = time.AfterFunc(sd.KeepAlive/5, sd.flushKeepAlive)
   140  			}
   141  			sd.keepAliveSchedLock.Unlock()
   142  		}
   143  	}
   144  
   145  	atomic.AddInt64(&sd.stats.ItemsAdded, 1)
   146  	atomic.AddInt64(&sd.stats.BytesInMemory, int64(ws.length))
   147  
   148  	runtime.SetFinalizer((*byte)(unsafe.Pointer(ws.data)), sd.removefromthismap)
   149  	runtime.KeepAlive(str)
   150  	return str
   151  }
   152  
   153  func (sd *stringDedup[hashtype]) flushKeepAlive() {
   154  	var items int
   155  	now := time.Now()
   156  	sd.keepalivemap.Range(func(key string, value time.Time) bool {
   157  		if now.After(value) {
   158  			sd.keepalivemap.Delete(key)
   159  			atomic.AddInt64(&sd.keepaliveitemsremoved, 1)
   160  		} else {
   161  			items++
   162  		}
   163  		return true
   164  	})
   165  
   166  	// Reschedule ourselves if needed
   167  	sd.keepAliveSchedLock.Lock()
   168  	if items > 0 {
   169  		sd.keepaliveFlusher = time.AfterFunc(sd.KeepAlive/5, sd.flushKeepAlive)
   170  	} else {
   171  		sd.keepaliveFlusher = nil
   172  	}
   173  	sd.keepAliveSchedLock.Unlock()
   174  }
   175  
   176  type finalizerFunc func(*byte)
   177  
   178  func generateFinalizerFunc[hashtype comparable](sd *stringDedup[hashtype]) finalizerFunc {
   179  	return func(in *byte) {
   180  		if sd.flushing {
   181  			return // We're flushing, don't bother
   182  		}
   183  
   184  		pointer := uintptr(unsafe.Pointer(in))
   185  		hash, found := sd.pointermap.Load(pointer)
   186  		if !found {
   187  			panic("dedup map mismatch")
   188  
   189  		}
   190  		sd.pointermap.Delete(pointer)
   191  		sd.hashmap.Delete(hash)
   192  		atomic.AddInt64(&sd.stats.ItemsRemoved, 1)
   193  	}
   194  }