github.com/lkarlslund/stringdedup@v0.6.2/sd.go (about) 1 package stringdedup 2 3 import ( 4 "runtime" 5 "sync" 6 "sync/atomic" 7 "time" 8 "unsafe" 9 10 gsync "github.com/SaveTheRbtz/generic-sync-map-go" 11 _ "go4.org/unsafe/assume-no-moving-gc" 12 ) 13 14 func New[hashtype comparable](hashfunc func(in []byte) hashtype) *stringDedup[hashtype] { 15 var sd stringDedup[hashtype] 16 sd.removefromthismap = generateFinalizerFunc(&sd) 17 sd.hashfunc = hashfunc 18 return &sd 19 } 20 21 type stringDedup[hashtype comparable] struct { 22 stats Statistics // Statistics moved to front to ensure 64-bit alignment even on 32-bit platforms (uses atomic to update) 23 24 pointermap gsync.MapOf[uintptr, hashtype] 25 hashmap gsync.MapOf[hashtype, weakdata] // key is hash, value is weakdata entry containing pointer to start of string or byte slice *header* and length 26 27 // Let dedup object keep some strings 'alive' for a period of time 28 KeepAlive time.Duration 29 30 keepAliveSchedLock sync.Mutex 31 keepalivemap gsync.MapOf[string, time.Time] 32 keepaliveFlusher *time.Timer 33 keepaliveitems, keepaliveitemsremoved int64 34 35 hashfunc func([]byte) hashtype 36 37 removefromthismap finalizerFunc 38 39 flushing bool 40 41 // DontValidateResults skips collisions check in returned strings 42 DontValidateResults bool // Disable at your own peril, hash collisions will give you wrong strings back 43 } 44 45 type Statistics struct { 46 ItemsAdded, 47 BytesInMemory, 48 ItemsSaved, 49 BytesSaved, 50 ItemsRemoved, 51 Collisions, 52 FirstCollisionDetected, 53 KeepAliveItemsAdded, 54 KeepAliveItemsRemoved int64 55 } 56 57 // Size returns the number of deduplicated strings currently being tracked in memory 58 func (sd *stringDedup[hashtype]) Size() int64 { 59 return atomic.LoadInt64(&sd.stats.ItemsAdded) - atomic.LoadInt64(&sd.stats.ItemsRemoved) 60 } 61 62 func (sd *stringDedup[hashtype]) Statistics() Statistics { 63 // Not thread safe 64 return sd.stats 65 } 66 67 // Flush clears all state information about deduplication 68 func (sd *stringDedup[hashtype]) Flush() { 69 // Clear our data 70 sd.flushing = true 71 72 sd.pointermap.Range(func(pointer uintptr, hash hashtype) bool { 73 // Don't finalize, we don't care about it any more 74 runtime.SetFinalizer((*byte)(unsafe.Pointer(pointer)), nil) 75 76 sd.pointermap.Delete(pointer) 77 sd.hashmap.Delete(hash) 78 79 atomic.AddInt64(&sd.stats.ItemsRemoved, 1) 80 return true 81 }) 82 83 // Get rid of any keepalives 84 sd.keepalivemap.Range(func(s string, t time.Time) bool { 85 sd.keepalivemap.Delete(s) 86 atomic.AddInt64(&sd.keepaliveitemsremoved, 1) 87 return true 88 }) 89 90 sd.flushing = false 91 } 92 93 // BS takes a slice of bytes, and returns a copy of it as a deduplicated string 94 func (sd *stringDedup[hashtype]) BS(in []byte) string { 95 str := castBytesToString(in) // NoCopy 96 return sd.S(str) 97 } 98 99 func (sd *stringDedup[hashtype]) S(in string) string { 100 if len(in) == 0 { 101 // Nothing to see here, move along now 102 return in 103 } 104 105 hash := sd.hashfunc(castStringToBytes(in)) 106 107 ws, loaded := sd.hashmap.Load(hash) 108 109 if loaded { 110 atomic.AddInt64(&sd.stats.ItemsSaved, 1) 111 atomic.AddInt64(&sd.stats.BytesSaved, int64(ws.length)) 112 out := ws.String() 113 if !sd.DontValidateResults && out != in { 114 atomic.CompareAndSwapInt64(&sd.stats.FirstCollisionDetected, 0, sd.Size()) 115 atomic.AddInt64(&sd.stats.Collisions, 1) 116 return in // Collision 117 } 118 return out 119 } 120 121 // We might recieve a static non-dynamically allocated string, so we need to make a copy 122 // Can we detect this somehow and avoid it? 123 buf := make([]byte, len(in)) 124 copy(buf, in) 125 str := castBytesToString(buf) 126 ws = weakString(str) 127 128 sd.hashmap.Store(hash, ws) 129 sd.pointermap.Store(ws.data, hash) 130 131 // We need to keep the string alive 132 if sd.KeepAlive > 0 { 133 sd.keepalivemap.Store(str, time.Now().Add(sd.KeepAlive)) 134 atomic.AddInt64(&sd.keepaliveitems, 1) 135 // Naughty checking without locking 136 if sd.keepaliveFlusher == nil { 137 sd.keepAliveSchedLock.Lock() 138 if sd.keepaliveFlusher == nil { 139 sd.keepaliveFlusher = time.AfterFunc(sd.KeepAlive/5, sd.flushKeepAlive) 140 } 141 sd.keepAliveSchedLock.Unlock() 142 } 143 } 144 145 atomic.AddInt64(&sd.stats.ItemsAdded, 1) 146 atomic.AddInt64(&sd.stats.BytesInMemory, int64(ws.length)) 147 148 runtime.SetFinalizer((*byte)(unsafe.Pointer(ws.data)), sd.removefromthismap) 149 runtime.KeepAlive(str) 150 return str 151 } 152 153 func (sd *stringDedup[hashtype]) flushKeepAlive() { 154 var items int 155 now := time.Now() 156 sd.keepalivemap.Range(func(key string, value time.Time) bool { 157 if now.After(value) { 158 sd.keepalivemap.Delete(key) 159 atomic.AddInt64(&sd.keepaliveitemsremoved, 1) 160 } else { 161 items++ 162 } 163 return true 164 }) 165 166 // Reschedule ourselves if needed 167 sd.keepAliveSchedLock.Lock() 168 if items > 0 { 169 sd.keepaliveFlusher = time.AfterFunc(sd.KeepAlive/5, sd.flushKeepAlive) 170 } else { 171 sd.keepaliveFlusher = nil 172 } 173 sd.keepAliveSchedLock.Unlock() 174 } 175 176 type finalizerFunc func(*byte) 177 178 func generateFinalizerFunc[hashtype comparable](sd *stringDedup[hashtype]) finalizerFunc { 179 return func(in *byte) { 180 if sd.flushing { 181 return // We're flushing, don't bother 182 } 183 184 pointer := uintptr(unsafe.Pointer(in)) 185 hash, found := sd.pointermap.Load(pointer) 186 if !found { 187 panic("dedup map mismatch") 188 189 } 190 sd.pointermap.Delete(pointer) 191 sd.hashmap.Delete(hash) 192 atomic.AddInt64(&sd.stats.ItemsRemoved, 1) 193 } 194 }