github.com/philpearl/symboltab@v1.1.4/symboltab.go (about) 1 // Package symboltab is a symbol table. It converts strings to sequence numbers. This is useful 2 // for things like graph algorithms, where IDs are stored and compared a lot. 3 // 4 // symboltab is optimised for storing a lot of strings, so things are optimised for reducing 5 // work for the GC 6 package symboltab 7 8 import ( 9 "math/bits" 10 "reflect" 11 "unsafe" 12 13 "github.com/philpearl/stringbank" 14 ) 15 16 // Our space costs are 8 bytes per entry. With a load factor of 0.5 (written as 2 here for reasons) that's 17 // increased to at least 16 bytes per entry 18 const loadFactor = 2 19 20 // SymbolTab is the symbol table. Allocate it via New() 21 type SymbolTab struct { 22 sb stringbank.Stringbank 23 table table 24 oldTable table 25 count int 26 oldTableCursor int 27 ib intbank 28 } 29 30 // New creates a new SymbolTab. cap is the initial capacity of the table - it will grow 31 // automatically when needed 32 func New(cap int) *SymbolTab { 33 // want to allocate a table large enough to hold cap without growing 34 cap = cap * loadFactor 35 if cap < 16 { 36 cap = 16 37 } else { 38 cap = 1 << uint(64-bits.LeadingZeros(uint(cap-1))) 39 } 40 return &SymbolTab{ 41 table: table{ 42 hashes: make([]uint32, cap), 43 sequence: make([]int32, cap), 44 }, 45 } 46 } 47 48 // Len returns the number of unique strings stored 49 func (i *SymbolTab) Len() int { 50 return i.count 51 } 52 53 // Cap returns the size of the SymbolTab table 54 func (i *SymbolTab) Cap() int { 55 return i.table.len() 56 } 57 58 // SymbolSize contains the approximate size of string storage in the symboltable. This will be an over-estimate and 59 // includes as yet unused and wasted space 60 func (i *SymbolTab) SymbolSize() int { 61 return i.sb.Size() 62 } 63 64 // SequenceToString looks up a string by its sequence number. Obtain the sequence number 65 // for a string with StringToSequence 66 func (i *SymbolTab) SequenceToString(seq int32) string { 67 // Look up the stringbank offset for this sequence number, then get the string 68 offset := i.ib.lookup(seq) 69 return i.sb.Get(offset) 70 } 71 72 // We use the runtime's map hash function without the overhead of using 73 // hash/maphash 74 //go:linkname runtime_memhash runtime.memhash 75 //go:noescape 76 func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr 77 78 // StringToSequence looks up the string val and returns its sequence number seq. If val does 79 // not currently exist in the symbol table, it will add it if addNew is true. found indicates 80 // whether val was already present in the SymbolTab 81 func (i *SymbolTab) StringToSequence(val string, addNew bool) (seq int32, found bool) { 82 // we use a hashtable where the keys are stringbank offsets, but comparisons are done on 83 // strings. There is no value to store 84 85 hash := uint32(runtime_memhash( 86 unsafe.Pointer((*reflect.StringHeader)(unsafe.Pointer(&val)).Data), 87 0, 88 uintptr(len(val)), 89 )) 90 91 if addNew { 92 // We're going to add to the table, make sure it is big enough 93 i.resize() 94 } 95 96 if i.oldTable.len() != 0 { 97 if addNew { 98 // If we're resizing currently, then do some resizing work 99 i.resizeWork() 100 } 101 102 // The data might still be only in the old table, so look there first. If we find the 103 // data here then we can just go with that answer. But if not it may be in the new table 104 // only. Certainly if we add we want to add to the new table 105 _, sequence := i.findInTable(i.oldTable, val, hash) 106 if sequence != 0 { 107 return sequence, true 108 } 109 } 110 111 cursor, sequence := i.findInTable(i.table, val, hash) 112 if sequence != 0 { 113 return sequence, true 114 } 115 116 if !addNew { 117 return 0, false 118 } 119 120 // String was not found, so we want to store it. Cursor is the index where we should 121 // store it 122 i.count++ 123 sequence = int32(i.count) 124 i.table.hashes[cursor] = hash 125 i.table.sequence[cursor] = sequence 126 127 offset := i.sb.Save(val) 128 i.ib.save(sequence, offset) 129 130 return sequence, false 131 } 132 133 // findInTable find the string val in the hash table. If the string is present, it returns the 134 // place in the table where it was found, plus the stringbank offset of the string + 1 135 func (i *SymbolTab) findInTable(table table, val string, hashVal uint32) (cursor int, sequence int32) { 136 l := table.len() 137 if l == 0 { 138 return 0, 0 139 } 140 cursor = int(hashVal) & (l - 1) 141 start := cursor 142 for table.sequence[cursor] != 0 { 143 if table.hashes[cursor] == hashVal { 144 if seq := table.sequence[cursor]; i.sb.Get(int(i.ib.lookup(seq))) == val { 145 return cursor, table.sequence[cursor] 146 } 147 } 148 cursor++ 149 if cursor == l { 150 cursor = 0 151 } 152 if cursor == start { 153 panic("out of space!") 154 } 155 } 156 return cursor, 0 157 } 158 159 func (i *SymbolTab) copyEntryToTable(table table, hash uint32, seq int32) { 160 l := table.len() 161 cursor := int(hash) & (l - 1) 162 start := cursor 163 for table.sequence[cursor] != 0 { 164 // the entry we're copying in is guaranteed not to be already 165 // present, so we're just looking for an empty space 166 cursor++ 167 if cursor == l { 168 cursor = 0 169 } 170 if cursor == start { 171 panic("out of space (resize)!") 172 } 173 } 174 table.hashes[cursor] = hash 175 table.sequence[cursor] = seq 176 } 177 178 func (i *SymbolTab) resizeWork() { 179 // We copy items between tables 16 at a time. Since we do this every time 180 // anyone writes to the table we won't run out of space in the new table 181 // before this is complete 182 l := i.oldTable.len() 183 if l == 0 { 184 return 185 } 186 for k := 0; k < 16; k++ { 187 offset := k + i.oldTableCursor 188 if seq := i.oldTable.sequence[offset]; seq != 0 { 189 i.copyEntryToTable(i.table, i.oldTable.hashes[offset], i.oldTable.sequence[offset]) 190 // The entry can exist in the old and new versions of the table without 191 // problems. If we did try to delete from the old table we'd have issues 192 // searching forward from clashing entries. 193 } 194 } 195 i.oldTableCursor += 16 196 if i.oldTableCursor >= l { 197 // resizing is complete - clear out the old table 198 i.oldTable.hashes = nil 199 i.oldTable.sequence = nil 200 i.oldTableCursor = 0 201 } 202 } 203 204 func (i *SymbolTab) resize() { 205 if i.table.hashes == nil { 206 // Makes zero value of SymbolTab useful 207 i.table.hashes = make([]uint32, 16) 208 i.table.sequence = make([]int32, 16) 209 } 210 211 if i.count < i.table.len()/loadFactor { 212 // Not full enough to grow the table 213 return 214 } 215 216 if i.oldTable.hashes == nil { 217 // Not already resizing, so kick off the process. Note that despite all the work we do to try to be 218 // clever, just allocating these slices can cause a considerable amount of work, presumably because 219 // they are set to zero. 220 i.oldTable, i.table = i.table, table{ 221 hashes: make([]uint32, len(i.table.hashes)*2), 222 sequence: make([]int32, len(i.table.sequence)*2), 223 } 224 } 225 } 226 227 // table represents a hash table. We keep the strings and hashes separate in 228 // case we want to use different size types in the future 229 type table struct { 230 // We keep hashes in the table to speed up resizing, and also stepping through 231 // entries that have different hashes but hit the same bucket 232 hashes []uint32 233 // sequence contains the sequence numbers of the entries 234 sequence []int32 235 } 236 237 func (t table) len() int { 238 return len(t.hashes) 239 }