github.com/philpearl/intern@v0.0.1/intern.go

github.com/philpearl/intern@v0.0.1/intern.go (about)

     1  // Package intern is a string interning library. Pass it a string, and it will store it and return it, removing duplicates. That is, however many times you show it a string, it will only store that string once, and will always
     2  // return a version of it backed by the same memory.
     3  //
     4  // Storage is kind to GC. It is optimised for storing a very large number of strings.
     5  package intern
     6  
     7  import (
     8  	"math/bits"
     9  	"reflect"
    10  	"unsafe"
    11  
    12  	"github.com/philpearl/stringbank"
    13  )
    14  
// runtime_memhash is the Go runtime's map hash function (runtime.memhash),
// bound here with go:linkname so that strings can be hashed without the
// overhead of going through hash/maphash.
//
// As used in this file, p points at the bytes to hash, seed is the hash seed
// and s is the number of bytes to hash. The declaration has no Go body here;
// the implementation is the one linked in from the runtime.
//
//go:linkname runtime_memhash runtime.memhash
//go:noescape
func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr
    20  
// Intern implements the interner. Allocate it with New to choose the initial
// table size; a zero Intern also works, because the hash table is created
// lazily on the first call to Save.
type Intern struct {
	// Stringbank stores the bytes of every interned string. Hash table
	// entries refer to strings by their Stringbank offset.
	stringbank.Stringbank
	// table is the current hash table. New entries are always added here.
	table table
	// oldTable is the previous, smaller table while an incremental resize is
	// in progress; it is empty (nil slices) at all other times.
	oldTable table
	// count is the number of unique strings stored.
	count int
	// oldTableCursor is the position in oldTable up to which entries have
	// already been copied into table during a resize.
	oldTableCursor int
}
    29  
    30  // New creates a new interning table
    31  func New(cap int) *Intern {
    32  	if cap < 16 {
    33  		cap = 16
    34  	} else {
    35  		cap = 1 << uint(64-bits.LeadingZeros(uint(cap-1)))
    36  	}
    37  	return &Intern{
    38  		table: table{
    39  			hashes:  make([]uint32, cap),
    40  			indices: make([]int, cap),
    41  		},
    42  	}
    43  }
    44  
    45  // Len returns the number of unique strings stored
    46  func (i *Intern) Len() int {
    47  	return i.count
    48  }
    49  
    50  // Cap returns the size of the intern table
    51  func (i *Intern) Cap() int {
    52  	return i.table.len()
    53  }
    54  
    55  // Deduplicate takes a string and returns a permanently stored version. This will always
    56  // be backed by the same memory for the same string.
    57  func (i *Intern) Deduplicate(val string) string {
    58  	return i.Get(i.Save(val))
    59  }
    60  
    61  // Save stores a string in out deduplicated string store, and returns an integer offset
    62  // for accessing it.
    63  func (i *Intern) Save(val string) int {
    64  	// we use a hashtable where the keys are stringbank offsets, but comparisons are done on
    65  	// strings. There is no value to store
    66  	i.resize()
    67  
    68  	hash := uint32(runtime_memhash(
    69  		unsafe.Pointer((*reflect.StringHeader)(unsafe.Pointer(&val)).Data),
    70  		0,
    71  		uintptr(len(val)),
    72  	))
    73  
    74  	if i.oldTable.len() != 0 {
    75  		_, index := i.findInTable(i.oldTable, val, hash)
    76  		if index != 0 {
    77  			return index - 1
    78  		}
    79  	}
    80  
    81  	cursor, index := i.findInTable(i.table, val, hash)
    82  	if index != 0 {
    83  		return index - 1
    84  	}
    85  
    86  	// String was not found, so we want to store it. Cursor is the index where we should
    87  	// store it
    88  	offset := i.Stringbank.Save(val)
    89  	i.table.hashes[cursor] = hash
    90  	i.table.indices[cursor] = offset + 1
    91  	i.count++
    92  
    93  	return offset
    94  }
    95  
    96  // findInTable find the string val in the hash table. If the string is present, it returns the
    97  // place in the table where it was found, plus the stringbank offset of the string + 1
    98  func (i *Intern) findInTable(table table, val string, hashVal uint32) (cursor int, index int) {
    99  	l := table.len()
   100  	cursor = int(hashVal) & (l - 1)
   101  	start := cursor
   102  	for table.indices[cursor] != 0 {
   103  		if table.hashes[cursor] == hashVal {
   104  			if index := int(table.indices[cursor]); i.Get(index-1) == val {
   105  				return cursor, index
   106  			}
   107  		}
   108  		cursor++
   109  		if cursor == l {
   110  			cursor = 0
   111  		}
   112  		if cursor == start {
   113  			panic("out of space!")
   114  		}
   115  	}
   116  	return cursor, 0
   117  }
   118  
   119  func (i *Intern) copyEntryToTable(table table, index int, hash uint32) {
   120  	l := table.len()
   121  	cursor := int(hash) & (l - 1)
   122  	start := cursor
   123  	for table.indices[cursor] != 0 {
   124  		// the entry we're copying in is guaranteed not to be already
   125  		// present, so we're just looking for an empty space
   126  		cursor++
   127  		if cursor == l {
   128  			cursor = 0
   129  		}
   130  		if cursor == start {
   131  			panic("out of space (resize)!")
   132  		}
   133  	}
   134  	table.indices[cursor] = index
   135  	table.hashes[cursor] = hash
   136  }
   137  
   138  func (i *Intern) resize() {
   139  	if i.table.hashes == nil {
   140  		i.table.hashes = make([]uint32, 16)
   141  		i.table.indices = make([]int, 16)
   142  	}
   143  
   144  	if i.count < i.table.len()*3/4 && i.oldTable.len() == 0 {
   145  		return
   146  	}
   147  
   148  	if i.oldTable.hashes == nil {
   149  		i.oldTable, i.table = i.table, table{
   150  			hashes:  make([]uint32, len(i.table.hashes)*2),
   151  			indices: make([]int, len(i.table.indices)*2),
   152  		}
   153  	}
   154  
   155  	// We copy items between tables 16 at a time. Since we do this every time
   156  	// anyone writes to the table we won't run out of space in the new table
   157  	// before this is complete
   158  	l := i.oldTable.len()
   159  	for k := 0; k < 16; k++ {
   160  		if index := i.oldTable.indices[k+i.oldTableCursor]; index != 0 {
   161  			i.copyEntryToTable(i.table, index, i.oldTable.hashes[k+i.oldTableCursor])
   162  			// The entry can exist in the old and new versions of the table without
   163  			// problems. If we did try to delete from the old table we'd have issues
   164  			// searching forward from clashing entries.
   165  		}
   166  	}
   167  	i.oldTableCursor += 16
   168  	if i.oldTableCursor >= l {
   169  		i.oldTable.hashes = nil
   170  		i.oldTable.indices = nil
   171  		i.oldTableCursor = 0
   172  	}
   173  }
   174  
   175  // table represents a hash table. We keep the indices and hashes separate in
   176  // case we want to use different size types in the future
   177  type table struct {
   178  	// We keep hashes in the table to speed up resizing, and also stepping through
   179  	// entries that have different hashes but hit the same bucket
   180  	hashes []uint32
   181  	// index is the index of the string in the stringbank, plus 1 so that valid
   182  	// entries are never zero
   183  	indices []int
   184  }
   185  
   186  func (t table) len() int {
   187  	return len(t.hashes)
   188  }