github.com/philpearl/intern@v0.0.1/intern.go (about) 1 // Package intern is a string interning library. Pass it a string, and it will store it and return it, removing duplicates. That is, however many times you show it a string, it will only store that string once, and will always 2 // return a version of it backed by the same memory. 3 // 4 // Storage is kind to GC. It is optimised for storing a very large number of strings. 5 package intern 6 7 import ( 8 "math/bits" 9 "reflect" 10 "unsafe" 11 12 "github.com/philpearl/stringbank" 13 ) 14 15 // We use the runtime's map hash function without the overhead of using 16 // hash/maphash 17 //go:linkname runtime_memhash runtime.memhash 18 //go:noescape 19 func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr 20 21 // Intern implements the interner. Allocate it 22 type Intern struct { 23 stringbank.Stringbank 24 table table 25 oldTable table 26 count int 27 oldTableCursor int 28 } 29 30 // New creates a new interning table 31 func New(cap int) *Intern { 32 if cap < 16 { 33 cap = 16 34 } else { 35 cap = 1 << uint(64-bits.LeadingZeros(uint(cap-1))) 36 } 37 return &Intern{ 38 table: table{ 39 hashes: make([]uint32, cap), 40 indices: make([]int, cap), 41 }, 42 } 43 } 44 45 // Len returns the number of unique strings stored 46 func (i *Intern) Len() int { 47 return i.count 48 } 49 50 // Cap returns the size of the intern table 51 func (i *Intern) Cap() int { 52 return i.table.len() 53 } 54 55 // Deduplicate takes a string and returns a permanently stored version. This will always 56 // be backed by the same memory for the same string. 57 func (i *Intern) Deduplicate(val string) string { 58 return i.Get(i.Save(val)) 59 } 60 61 // Save stores a string in out deduplicated string store, and returns an integer offset 62 // for accessing it. 63 func (i *Intern) Save(val string) int { 64 // we use a hashtable where the keys are stringbank offsets, but comparisons are done on 65 // strings. There is no value to store 66 i.resize() 67 68 hash := uint32(runtime_memhash( 69 unsafe.Pointer((*reflect.StringHeader)(unsafe.Pointer(&val)).Data), 70 0, 71 uintptr(len(val)), 72 )) 73 74 if i.oldTable.len() != 0 { 75 _, index := i.findInTable(i.oldTable, val, hash) 76 if index != 0 { 77 return index - 1 78 } 79 } 80 81 cursor, index := i.findInTable(i.table, val, hash) 82 if index != 0 { 83 return index - 1 84 } 85 86 // String was not found, so we want to store it. Cursor is the index where we should 87 // store it 88 offset := i.Stringbank.Save(val) 89 i.table.hashes[cursor] = hash 90 i.table.indices[cursor] = offset + 1 91 i.count++ 92 93 return offset 94 } 95 96 // findInTable find the string val in the hash table. If the string is present, it returns the 97 // place in the table where it was found, plus the stringbank offset of the string + 1 98 func (i *Intern) findInTable(table table, val string, hashVal uint32) (cursor int, index int) { 99 l := table.len() 100 cursor = int(hashVal) & (l - 1) 101 start := cursor 102 for table.indices[cursor] != 0 { 103 if table.hashes[cursor] == hashVal { 104 if index := int(table.indices[cursor]); i.Get(index-1) == val { 105 return cursor, index 106 } 107 } 108 cursor++ 109 if cursor == l { 110 cursor = 0 111 } 112 if cursor == start { 113 panic("out of space!") 114 } 115 } 116 return cursor, 0 117 } 118 119 func (i *Intern) copyEntryToTable(table table, index int, hash uint32) { 120 l := table.len() 121 cursor := int(hash) & (l - 1) 122 start := cursor 123 for table.indices[cursor] != 0 { 124 // the entry we're copying in is guaranteed not to be already 125 // present, so we're just looking for an empty space 126 cursor++ 127 if cursor == l { 128 cursor = 0 129 } 130 if cursor == start { 131 panic("out of space (resize)!") 132 } 133 } 134 table.indices[cursor] = index 135 table.hashes[cursor] = hash 136 } 137 138 func (i *Intern) resize() { 139 if i.table.hashes == nil { 140 i.table.hashes = make([]uint32, 16) 141 i.table.indices = make([]int, 16) 142 } 143 144 if i.count < i.table.len()*3/4 && i.oldTable.len() == 0 { 145 return 146 } 147 148 if i.oldTable.hashes == nil { 149 i.oldTable, i.table = i.table, table{ 150 hashes: make([]uint32, len(i.table.hashes)*2), 151 indices: make([]int, len(i.table.indices)*2), 152 } 153 } 154 155 // We copy items between tables 16 at a time. Since we do this every time 156 // anyone writes to the table we won't run out of space in the new table 157 // before this is complete 158 l := i.oldTable.len() 159 for k := 0; k < 16; k++ { 160 if index := i.oldTable.indices[k+i.oldTableCursor]; index != 0 { 161 i.copyEntryToTable(i.table, index, i.oldTable.hashes[k+i.oldTableCursor]) 162 // The entry can exist in the old and new versions of the table without 163 // problems. If we did try to delete from the old table we'd have issues 164 // searching forward from clashing entries. 165 } 166 } 167 i.oldTableCursor += 16 168 if i.oldTableCursor >= l { 169 i.oldTable.hashes = nil 170 i.oldTable.indices = nil 171 i.oldTableCursor = 0 172 } 173 } 174 175 // table represents a hash table. We keep the indices and hashes separate in 176 // case we want to use different size types in the future 177 type table struct { 178 // We keep hashes in the table to speed up resizing, and also stepping through 179 // entries that have different hashes but hit the same bucket 180 hashes []uint32 181 // index is the index of the string in the stringbank, plus 1 so that valid 182 // entries are never zero 183 indices []int 184 } 185 186 func (t table) len() int { 187 return len(t.hashes) 188 }