github.com/tobgu/qframe@v0.4.0/internal/grouper/grouper.go (about) 1 package grouper 2 3 import ( 4 "math/bits" 5 6 "github.com/tobgu/qframe/internal/column" 7 "github.com/tobgu/qframe/internal/index" 8 "github.com/tobgu/qframe/internal/math/integer" 9 ) 10 11 /* 12 This package implements a basic hash table used for GroupBy and Distinct operations. 13 14 Hashing is done using Go runtime memhash, collisions are handled using linear probing. 15 16 When the table reaches a certain load factor it will be reallocated into a new, larger table. 17 */ 18 19 // An entry in the hash table. For group by operations a slice of all positions each group 20 // are stored. For distinct operations only the first position is stored to avoid some overhead. 21 type tableEntry struct { 22 ix index.Int 23 hash uint32 24 firstPos uint32 25 occupied bool 26 } 27 28 type table struct { 29 entries []tableEntry 30 comparables []column.Comparable 31 stats GroupStats 32 loadFactor float64 33 groupCount uint32 34 collectIx bool 35 } 36 37 const growthFactor = 2 38 39 func (t *table) grow() { 40 newLen := uint32(growthFactor * len(t.entries)) 41 newEntries := make([]tableEntry, newLen) 42 bitMask := newLen - 1 43 for _, e := range t.entries { 44 for pos := e.hash & bitMask; ; pos = (pos + 1) & bitMask { 45 if !newEntries[pos].occupied { 46 newEntries[pos] = e 47 break 48 } 49 t.stats.RelocationCollisions++ 50 } 51 } 52 53 t.stats.RelocationCount++ 54 t.entries = newEntries 55 t.loadFactor = t.loadFactor / growthFactor 56 } 57 58 func (t *table) hash(i uint32) uint32 { 59 hashVal := uint64(0) 60 for _, c := range t.comparables { 61 hashVal = c.Hash(i, hashVal) 62 } 63 64 return uint32(hashVal) 65 } 66 67 const maxLoadFactor = 0.5 68 69 func (t *table) insertEntry(i uint32) { 70 if t.loadFactor > maxLoadFactor { 71 t.grow() 72 } 73 74 hashSum := t.hash(i) 75 bitMask := uint64(len(t.entries) - 1) 76 startPos := uint64(hashSum) & bitMask 77 var dstEntry *tableEntry 78 for pos := startPos; dstEntry == nil; pos = (pos + 1) & bitMask { 79 e := &t.entries[pos] 80 if !e.occupied || e.hash == hashSum && equals(t.comparables, i, e.firstPos) { 81 dstEntry = e 82 } else { 83 t.stats.InsertCollisions++ 84 } 85 } 86 87 // Update entry 88 if !dstEntry.occupied { 89 // Eden entry 90 dstEntry.hash = hashSum 91 dstEntry.firstPos = i 92 dstEntry.occupied = true 93 t.groupCount++ 94 t.loadFactor = float64(t.groupCount) / float64(len(t.entries)) 95 } else { 96 // Existing entry 97 if t.collectIx { 98 // Small hack to reduce number of allocations under some circumstances. Delay 99 // creation of index slice until there are at least two entries in the group 100 // since we store the first position in a separate variable on the entry anyway. 101 if dstEntry.ix == nil { 102 dstEntry.ix = index.Int{dstEntry.firstPos, i} 103 } else { 104 dstEntry.ix = append(dstEntry.ix, i) 105 } 106 } 107 } 108 } 109 110 func newTable(sizeExp int, comparables []column.Comparable, collectIx bool) *table { 111 return &table{ 112 entries: make([]tableEntry, integer.Pow2(sizeExp)), 113 comparables: comparables, 114 collectIx: collectIx} 115 } 116 117 func equals(comparables []column.Comparable, i, j uint32) bool { 118 for _, c := range comparables { 119 if c.Compare(i, j) != column.Equal { 120 return false 121 } 122 } 123 return true 124 } 125 126 type GroupStats struct { 127 RelocationCount int 128 RelocationCollisions int 129 InsertCollisions int 130 GroupCount int 131 LoadFactor float64 132 } 133 134 func calculateInitialSizeExp(ixLen int) int { 135 // Size is expressed as 2^x to keep the size a multiple of two. 136 // Initial size is picked fairly arbitrarily at the moment, we don't really know the distribution of 137 // values within the index. Guarantee a minimum initial size of 8 (2³) for sanity. 138 fitSize := uint64(ixLen) / 4 139 return integer.Max(bits.Len64(fitSize), 3) 140 } 141 142 func groupIndex(ix index.Int, comparables []column.Comparable, collectIx bool) ([]tableEntry, GroupStats) { 143 initialSizeExp := calculateInitialSizeExp(len(ix)) 144 table := newTable(initialSizeExp, comparables, collectIx) 145 for _, i := range ix { 146 table.insertEntry(i) 147 } 148 149 stats := table.stats 150 stats.LoadFactor = table.loadFactor 151 stats.GroupCount = int(table.groupCount) 152 return table.entries, stats 153 } 154 155 func GroupBy(ix index.Int, comparables []column.Comparable) ([]index.Int, GroupStats) { 156 entries, stats := groupIndex(ix, comparables, true) 157 result := make([]index.Int, 0, stats.GroupCount) 158 for _, e := range entries { 159 if e.occupied { 160 if e.ix == nil { 161 result = append(result, index.Int{e.firstPos}) 162 } else { 163 result = append(result, e.ix) 164 } 165 } 166 } 167 168 return result, stats 169 } 170 171 func Distinct(ix index.Int, comparables []column.Comparable) index.Int { 172 entries, stats := groupIndex(ix, comparables, false) 173 result := make(index.Int, 0, stats.GroupCount) 174 for _, e := range entries { 175 if e.occupied { 176 result = append(result, e.firstPos) 177 } 178 } 179 180 return result 181 }