github.com/philpearl/symboltab@v1.1.4/symboltab.go (about)

     1  // Package symboltab is a symbol table. It converts strings to sequence numbers. This is useful
     2  // for things like graph algorithms, where IDs are stored and compared a lot.
     3  //
     4  // symboltab is optimised for storing a lot of strings, so things are optimised for reducing
     5  // work for the GC
     6  package symboltab
     7  
     8  import (
     9  	"math/bits"
    10  	"reflect"
    11  	"unsafe"
    12  
    13  	"github.com/philpearl/stringbank"
    14  )
    15  
// loadFactor is the reciprocal of the table's maximum load factor: the table is
// allowed to become 1/loadFactor (one half) full before it grows. It is written
// as 2 rather than 0.5 because it is used as an integer multiplier and divisor
// on slot counts.
//
// Our space costs are 8 bytes per slot (a uint32 hash plus an int32 sequence
// number). With a load factor of 0.5 that's increased to at least 16 bytes per
// entry.
const loadFactor = 2
    19  
    20  // SymbolTab is the symbol table. Allocate it via New()
    21  type SymbolTab struct {
    22  	sb             stringbank.Stringbank
    23  	table          table
    24  	oldTable       table
    25  	count          int
    26  	oldTableCursor int
    27  	ib             intbank
    28  }
    29  
    30  // New creates a new SymbolTab. cap is the initial capacity of the table - it will grow
    31  // automatically when needed
    32  func New(cap int) *SymbolTab {
    33  	// want to allocate a table large enough to hold cap without growing
    34  	cap = cap * loadFactor
    35  	if cap < 16 {
    36  		cap = 16
    37  	} else {
    38  		cap = 1 << uint(64-bits.LeadingZeros(uint(cap-1)))
    39  	}
    40  	return &SymbolTab{
    41  		table: table{
    42  			hashes:   make([]uint32, cap),
    43  			sequence: make([]int32, cap),
    44  		},
    45  	}
    46  }
    47  
// Len returns the number of unique strings stored in the SymbolTab. Because
// sequence numbers are issued from this counter, it is also the most recently
// issued sequence number.
func (i *SymbolTab) Len() int {
	return i.count
}
    52  
    53  // Cap returns the size of the SymbolTab table
    54  func (i *SymbolTab) Cap() int {
    55  	return i.table.len()
    56  }
    57  
// SymbolSize returns the approximate size of string storage in the symbol
// table, as reported by the underlying stringbank. This will be an
// over-estimate and includes as yet unused and wasted space.
func (i *SymbolTab) SymbolSize() int {
	return i.sb.Size()
}
    63  
    64  // SequenceToString looks up a string by its sequence number. Obtain the sequence number
    65  // for a string with StringToSequence
    66  func (i *SymbolTab) SequenceToString(seq int32) string {
    67  	// Look up the stringbank offset for this sequence number, then get the string
    68  	offset := i.ib.lookup(seq)
    69  	return i.sb.Get(offset)
    70  }
    71  
// runtime_memhash is bound to the Go runtime's memhash via go:linkname. We use
// the runtime's map hash function directly, without the overhead of using
// hash/maphash. As called in this file, p points at the bytes to hash, seed is
// the hash seed and s is the number of bytes to hash.
//
//go:linkname runtime_memhash runtime.memhash
//go:noescape
func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr
    77  
    78  // StringToSequence looks up the string val and returns its sequence number seq. If val does
    79  // not currently exist in the symbol table, it will add it if addNew is true. found indicates
    80  // whether val was already present in the SymbolTab
    81  func (i *SymbolTab) StringToSequence(val string, addNew bool) (seq int32, found bool) {
    82  	// we use a hashtable where the keys are stringbank offsets, but comparisons are done on
    83  	// strings. There is no value to store
    84  
    85  	hash := uint32(runtime_memhash(
    86  		unsafe.Pointer((*reflect.StringHeader)(unsafe.Pointer(&val)).Data),
    87  		0,
    88  		uintptr(len(val)),
    89  	))
    90  
    91  	if addNew {
    92  		// We're going to add to the table, make sure it is big enough
    93  		i.resize()
    94  	}
    95  
    96  	if i.oldTable.len() != 0 {
    97  		if addNew {
    98  			// If we're resizing currently, then do some resizing work
    99  			i.resizeWork()
   100  		}
   101  
   102  		// The data might still be only in the old table, so look there first. If we find the
   103  		// data here then we can just go with that answer. But if not it may be in the new table
   104  		// only. Certainly if we add we want to add to the new table
   105  		_, sequence := i.findInTable(i.oldTable, val, hash)
   106  		if sequence != 0 {
   107  			return sequence, true
   108  		}
   109  	}
   110  
   111  	cursor, sequence := i.findInTable(i.table, val, hash)
   112  	if sequence != 0 {
   113  		return sequence, true
   114  	}
   115  
   116  	if !addNew {
   117  		return 0, false
   118  	}
   119  
   120  	// String was not found, so we want to store it. Cursor is the index where we should
   121  	// store it
   122  	i.count++
   123  	sequence = int32(i.count)
   124  	i.table.hashes[cursor] = hash
   125  	i.table.sequence[cursor] = sequence
   126  
   127  	offset := i.sb.Save(val)
   128  	i.ib.save(sequence, offset)
   129  
   130  	return sequence, false
   131  }
   132  
   133  // findInTable find the string val in the hash table. If the string is present, it returns the
   134  // place in the table where it was found, plus the stringbank offset of the string + 1
   135  func (i *SymbolTab) findInTable(table table, val string, hashVal uint32) (cursor int, sequence int32) {
   136  	l := table.len()
   137  	if l == 0 {
   138  		return 0, 0
   139  	}
   140  	cursor = int(hashVal) & (l - 1)
   141  	start := cursor
   142  	for table.sequence[cursor] != 0 {
   143  		if table.hashes[cursor] == hashVal {
   144  			if seq := table.sequence[cursor]; i.sb.Get(int(i.ib.lookup(seq))) == val {
   145  				return cursor, table.sequence[cursor]
   146  			}
   147  		}
   148  		cursor++
   149  		if cursor == l {
   150  			cursor = 0
   151  		}
   152  		if cursor == start {
   153  			panic("out of space!")
   154  		}
   155  	}
   156  	return cursor, 0
   157  }
   158  
   159  func (i *SymbolTab) copyEntryToTable(table table, hash uint32, seq int32) {
   160  	l := table.len()
   161  	cursor := int(hash) & (l - 1)
   162  	start := cursor
   163  	for table.sequence[cursor] != 0 {
   164  		// the entry we're copying in is guaranteed not to be already
   165  		// present, so we're just looking for an empty space
   166  		cursor++
   167  		if cursor == l {
   168  			cursor = 0
   169  		}
   170  		if cursor == start {
   171  			panic("out of space (resize)!")
   172  		}
   173  	}
   174  	table.hashes[cursor] = hash
   175  	table.sequence[cursor] = seq
   176  }
   177  
   178  func (i *SymbolTab) resizeWork() {
   179  	// We copy items between tables 16 at a time. Since we do this every time
   180  	// anyone writes to the table we won't run out of space in the new table
   181  	// before this is complete
   182  	l := i.oldTable.len()
   183  	if l == 0 {
   184  		return
   185  	}
   186  	for k := 0; k < 16; k++ {
   187  		offset := k + i.oldTableCursor
   188  		if seq := i.oldTable.sequence[offset]; seq != 0 {
   189  			i.copyEntryToTable(i.table, i.oldTable.hashes[offset], i.oldTable.sequence[offset])
   190  			// The entry can exist in the old and new versions of the table without
   191  			// problems. If we did try to delete from the old table we'd have issues
   192  			// searching forward from clashing entries.
   193  		}
   194  	}
   195  	i.oldTableCursor += 16
   196  	if i.oldTableCursor >= l {
   197  		// resizing is complete - clear out the old table
   198  		i.oldTable.hashes = nil
   199  		i.oldTable.sequence = nil
   200  		i.oldTableCursor = 0
   201  	}
   202  }
   203  
   204  func (i *SymbolTab) resize() {
   205  	if i.table.hashes == nil {
   206  		// Makes zero value of SymbolTab useful
   207  		i.table.hashes = make([]uint32, 16)
   208  		i.table.sequence = make([]int32, 16)
   209  	}
   210  
   211  	if i.count < i.table.len()/loadFactor {
   212  		// Not full enough to grow the table
   213  		return
   214  	}
   215  
   216  	if i.oldTable.hashes == nil {
   217  		// Not already resizing, so kick off the process. Note that despite all the work we do to try to be
   218  		// clever, just allocating these slices can cause a considerable amount of work, presumably because
   219  		// they are set to zero.
   220  		i.oldTable, i.table = i.table, table{
   221  			hashes:   make([]uint32, len(i.table.hashes)*2),
   222  			sequence: make([]int32, len(i.table.sequence)*2),
   223  		}
   224  	}
   225  }
   226  
// table represents a hash table. Slot k of the table is described by hashes[k]
// and sequence[k]. We keep the hashes and sequence numbers in separate slices
// in case we want to use different size types in the future.
type table struct {
	// hashes holds the hash of the string in each slot. We keep hashes in the
	// table to speed up resizing, and also stepping through entries that have
	// different hashes but hit the same bucket.
	hashes []uint32
	// sequence contains the sequence numbers of the entries. Sequence numbers
	// start at 1, so a value of 0 marks an empty slot.
	sequence []int32
}
   236  
// len returns the number of slots in the table (0 for an unallocated table).
func (t table) len() int {
	return len(t.hashes)
}