github.com/tobgu/qframe@v0.4.0/internal/grouper/grouper.go (about)

     1  package grouper
     2  
     3  import (
     4  	"math/bits"
     5  
     6  	"github.com/tobgu/qframe/internal/column"
     7  	"github.com/tobgu/qframe/internal/index"
     8  	"github.com/tobgu/qframe/internal/math/integer"
     9  )
    10  
    11  /*
    12  This package implements a basic hash table used for GroupBy and Distinct operations.
    13  
    14  Hashing is done using Go runtime memhash, collisions are handled using linear probing.
    15  
    16  When the table reaches a certain load factor it will be reallocated into a new, larger table.
    17  */
    18  
// tableEntry is a single slot in the hash table. For group by operations a slice
// of all positions in each group is stored. For distinct operations only the first
// position is stored to avoid some overhead.
type tableEntry struct {
	// ix holds the positions belonging to the group. It is only populated when
	// positions are collected, and stays nil until a second member is inserted;
	// a single-member group is represented by firstPos alone.
	ix index.Int
	// hash is the hash of the group key, stored so the entry can be re-placed
	// when the table grows and to cheaply rule out non-matching keys on insert.
	hash uint32
	// firstPos is the position that created this entry. Later insertions are
	// compared against it to decide whether they belong to the same group.
	firstPos uint32
	// occupied is true once the slot holds a group.
	occupied bool
}
    27  
// table is an open addressing hash table that maps group keys to entries.
type table struct {
	// entries are the slots of the table. Slots are selected by masking a hash
	// with len(entries)-1, so the length is assumed to be a power of two.
	entries []tableEntry
	// comparables are used both to hash a position and to compare two positions
	// for equality; together they define the group key.
	comparables []column.Comparable
	// stats accumulates collision and relocation counters while the table is used.
	stats GroupStats
	// loadFactor is groupCount / len(entries), refreshed whenever a new group is
	// inserted and divided by growthFactor when the table grows.
	loadFactor float64
	// groupCount is the number of occupied entries.
	groupCount uint32
	// collectIx controls whether every position of a group is collected in the
	// entry (true) or only the first position is kept (false).
	collectIx bool
}
    36  
// growthFactor is the factor by which the number of slots is multiplied each time
// the table grows; the stored load factor is divided by the same value. Slots are
// selected with a len-1 bit mask, so this presumably has to remain a power of two.
const growthFactor = 2
    38  
    39  func (t *table) grow() {
    40  	newLen := uint32(growthFactor * len(t.entries))
    41  	newEntries := make([]tableEntry, newLen)
    42  	bitMask := newLen - 1
    43  	for _, e := range t.entries {
    44  		for pos := e.hash & bitMask; ; pos = (pos + 1) & bitMask {
    45  			if !newEntries[pos].occupied {
    46  				newEntries[pos] = e
    47  				break
    48  			}
    49  			t.stats.RelocationCollisions++
    50  		}
    51  	}
    52  
    53  	t.stats.RelocationCount++
    54  	t.entries = newEntries
    55  	t.loadFactor = t.loadFactor / growthFactor
    56  }
    57  
    58  func (t *table) hash(i uint32) uint32 {
    59  	hashVal := uint64(0)
    60  	for _, c := range t.comparables {
    61  		hashVal = c.Hash(i, hashVal)
    62  	}
    63  
    64  	return uint32(hashVal)
    65  }
    66  
// maxLoadFactor is the load factor (groups per slot) above which the table is
// grown before the next insertion is attempted.
const maxLoadFactor = 0.5
    68  
    69  func (t *table) insertEntry(i uint32) {
    70  	if t.loadFactor > maxLoadFactor {
    71  		t.grow()
    72  	}
    73  
    74  	hashSum := t.hash(i)
    75  	bitMask := uint64(len(t.entries) - 1)
    76  	startPos := uint64(hashSum) & bitMask
    77  	var dstEntry *tableEntry
    78  	for pos := startPos; dstEntry == nil; pos = (pos + 1) & bitMask {
    79  		e := &t.entries[pos]
    80  		if !e.occupied || e.hash == hashSum && equals(t.comparables, i, e.firstPos) {
    81  			dstEntry = e
    82  		} else {
    83  			t.stats.InsertCollisions++
    84  		}
    85  	}
    86  
    87  	// Update entry
    88  	if !dstEntry.occupied {
    89  		// Eden entry
    90  		dstEntry.hash = hashSum
    91  		dstEntry.firstPos = i
    92  		dstEntry.occupied = true
    93  		t.groupCount++
    94  		t.loadFactor = float64(t.groupCount) / float64(len(t.entries))
    95  	} else {
    96  		// Existing entry
    97  		if t.collectIx {
    98  			// Small hack to reduce number of allocations under some circumstances. Delay
    99  			// creation of index slice until there are at least two entries in the group
   100  			// since we store the first position in a separate variable on the entry anyway.
   101  			if dstEntry.ix == nil {
   102  				dstEntry.ix = index.Int{dstEntry.firstPos, i}
   103  			} else {
   104  				dstEntry.ix = append(dstEntry.ix, i)
   105  			}
   106  		}
   107  	}
   108  }
   109  
   110  func newTable(sizeExp int, comparables []column.Comparable, collectIx bool) *table {
   111  	return &table{
   112  		entries:     make([]tableEntry, integer.Pow2(sizeExp)),
   113  		comparables: comparables,
   114  		collectIx:   collectIx}
   115  }
   116  
   117  func equals(comparables []column.Comparable, i, j uint32) bool {
   118  	for _, c := range comparables {
   119  		if c.Compare(i, j) != column.Equal {
   120  			return false
   121  		}
   122  	}
   123  	return true
   124  }
   125  
// GroupStats holds statistics gathered while building the hash table for a
// grouping operation.
type GroupStats struct {
	// RelocationCount is the number of times the table was grown.
	RelocationCount int
	// RelocationCollisions is the number of already occupied slots that were
	// stepped over while moving entries into a grown table.
	RelocationCollisions int
	// InsertCollisions is the number of slots occupied by a different key that
	// were stepped over while inserting positions.
	InsertCollisions int
	// GroupCount is the number of groups in the table when grouping finished.
	GroupCount int
	// LoadFactor is the load factor of the table when grouping finished.
	LoadFactor float64
}
   133  
   134  func calculateInitialSizeExp(ixLen int) int {
   135  	// Size is expressed as 2^x to keep the size a multiple of two.
   136  	// Initial size is picked fairly arbitrarily at the moment, we don't really know the distribution of
   137  	// values within the index. Guarantee a minimum initial size of 8 (2³) for sanity.
   138  	fitSize := uint64(ixLen) / 4
   139  	return integer.Max(bits.Len64(fitSize), 3)
   140  }
   141  
   142  func groupIndex(ix index.Int, comparables []column.Comparable, collectIx bool) ([]tableEntry, GroupStats) {
   143  	initialSizeExp := calculateInitialSizeExp(len(ix))
   144  	table := newTable(initialSizeExp, comparables, collectIx)
   145  	for _, i := range ix {
   146  		table.insertEntry(i)
   147  	}
   148  
   149  	stats := table.stats
   150  	stats.LoadFactor = table.loadFactor
   151  	stats.GroupCount = int(table.groupCount)
   152  	return table.entries, stats
   153  }
   154  
   155  func GroupBy(ix index.Int, comparables []column.Comparable) ([]index.Int, GroupStats) {
   156  	entries, stats := groupIndex(ix, comparables, true)
   157  	result := make([]index.Int, 0, stats.GroupCount)
   158  	for _, e := range entries {
   159  		if e.occupied {
   160  			if e.ix == nil {
   161  				result = append(result, index.Int{e.firstPos})
   162  			} else {
   163  				result = append(result, e.ix)
   164  			}
   165  		}
   166  	}
   167  
   168  	return result, stats
   169  }
   170  
   171  func Distinct(ix index.Int, comparables []column.Comparable) index.Int {
   172  	entries, stats := groupIndex(ix, comparables, false)
   173  	result := make(index.Int, 0, stats.GroupCount)
   174  	for _, e := range entries {
   175  		if e.occupied {
   176  			result = append(result, e.firstPos)
   177  		}
   178  	}
   179  
   180  	return result
   181  }