github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/surf/builder.go (about) 1 package surf 2 3 // Builder is builder of SuRF. 4 type Builder struct { 5 sparseStartLevel uint32 6 valueSize uint32 7 totalCount int 8 9 // LOUDS-Sparse bitvecs 10 lsLabels [][]byte 11 lsHasChild [][]uint64 12 lsLoudsBits [][]uint64 13 14 // LOUDS-Dense bitvecs 15 ldLabels [][]uint64 16 ldHasChild [][]uint64 17 ldIsPrefix [][]uint64 18 19 // suffix 20 hashSuffixLen uint32 21 realSuffixLen uint32 22 suffixes [][]uint64 23 suffixCounts []uint32 24 25 // value 26 values [][]byte 27 valueCounts []uint32 28 29 // prefix 30 hasPrefix [][]uint64 31 prefixes [][][]byte 32 33 nodeCounts []uint32 34 isLastItemTerminator []bool 35 } 36 37 // NewBuilder returns a new SuRF builder. 38 func NewBuilder(valueSize uint32, hashSuffixLen, realSuffixLen uint32) *Builder { 39 return &Builder{ 40 valueSize: valueSize, 41 hashSuffixLen: hashSuffixLen, 42 realSuffixLen: realSuffixLen, 43 } 44 } 45 46 // Build returns the SuRF for added kv pairs. 47 // The bitsPerKeyHint is a size hint used when determine how many levels can use the dense-loudes format. 48 // The dense-loudes format is faster than sparse-loudes format, but may consume more space. 49 func (b *Builder) Build(keys, vals [][]byte, bitsPerKeyHint int) *SuRF { 50 b.totalCount = len(keys) 51 b.buildNodes(keys, vals, 0, 0, 0) 52 b.determineCutoffLevel(bitsPerKeyHint) 53 b.buildDense() 54 55 surf := new(SuRF) 56 surf.ld.Init(b) 57 surf.ls.Init(b) 58 return surf 59 } 60 61 // buildNodes is recursive algorithm to bulk building SuRF nodes. 62 // * We divide keys into groups by the `key[depth]`, so keys in each group shares the same prefix 63 // * If depth larger than the length if the first key in group, the key is prefix of others in group 64 // So we should append `labelTerminator` to labels and update `b.isLastItemTerminator`, then remove it from group. 65 // * Scan over keys in current group when meets different label, use the new sub group call buildNodes with level+1 recursively 66 // * If all keys in current group have the same label, this node can be compressed, use this group call buildNodes with level recursively. 67 // * If current group contains only one key constract suffix of this key and return. 68 func (b *Builder) buildNodes(keys, vals [][]byte, prefixDepth, depth, level int) { 69 b.ensureLevel(level) 70 nodeStartPos := b.numItems(level) 71 72 groupStart := 0 73 if depth >= len(keys[groupStart]) { 74 b.lsLabels[level] = append(b.lsLabels[level], labelTerminator) 75 b.isLastItemTerminator[level] = true 76 b.insertSuffix(keys[groupStart], level, depth) 77 b.insertValue(vals[groupStart], level) 78 b.moveToNextItemSlot(level) 79 groupStart++ 80 } 81 82 for groupEnd := groupStart; groupEnd <= len(keys); groupEnd++ { 83 if groupEnd < len(keys) && keys[groupStart][depth] == keys[groupEnd][depth] { 84 continue 85 } 86 87 if groupEnd == len(keys) && groupStart == 0 && groupEnd-groupStart != 1 { 88 // node at this level is one-way node, compress it to next node 89 b.buildNodes(keys, vals, prefixDepth, depth+1, level) 90 return 91 } 92 93 b.lsLabels[level] = append(b.lsLabels[level], keys[groupStart][depth]) 94 b.moveToNextItemSlot(level) 95 if groupEnd-groupStart == 1 { 96 b.insertSuffix(keys[groupStart], level, depth) 97 b.insertValue(vals[groupStart], level) 98 } else { 99 setBit(b.lsHasChild[level], b.numItems(level)-1) 100 b.buildNodes(keys[groupStart:groupEnd], vals[groupStart:groupEnd], depth+1, depth+1, level+1) 101 } 102 103 groupStart = groupEnd 104 } 105 106 // check if current node contains compressed path. 107 if depth-prefixDepth > 0 { 108 prefix := keys[0][prefixDepth:depth] 109 setBit(b.hasPrefix[level], b.nodeCounts[level]) 110 b.insertPrefix(prefix, level) 111 } 112 setBit(b.lsLoudsBits[level], nodeStartPos) 113 114 b.nodeCounts[level]++ 115 if b.nodeCounts[level]%wordSize == 0 { 116 b.hasPrefix[level] = append(b.hasPrefix[level], 0) 117 } 118 } 119 120 func (b *Builder) buildDense() { 121 var level int 122 for level = 0; uint32(level) < b.sparseStartLevel; level++ { 123 b.initDenseVectors(level) 124 if b.numItems(level) == 0 { 125 continue 126 } 127 128 var nodeID uint32 129 if b.isTerminator(level, 0) { 130 setBit(b.ldIsPrefix[level], 0) 131 } else { 132 b.setLabelAndHasChildVec(level, nodeID, 0) 133 } 134 135 var pos uint32 136 numItems := b.numItems(level) 137 for pos = 1; pos < numItems; pos++ { 138 if b.isStartOfNode(level, pos) { 139 nodeID++ 140 if b.isTerminator(level, pos) { 141 setBit(b.ldIsPrefix[level], nodeID) 142 continue 143 } 144 } 145 b.setLabelAndHasChildVec(level, nodeID, pos) 146 } 147 } 148 } 149 150 func (b *Builder) ensureLevel(level int) { 151 if level >= b.treeHeight() { 152 b.addLevel() 153 } 154 } 155 156 func (b *Builder) suffixLen() uint32 { 157 return b.hashSuffixLen + b.realSuffixLen 158 } 159 160 func (b *Builder) treeHeight() int { 161 return len(b.nodeCounts) 162 } 163 164 func (b *Builder) numItems(level int) uint32 { 165 return uint32(len(b.lsLabels[level])) 166 } 167 168 func (b *Builder) addLevel() { 169 b.lsLabels = append(b.lsLabels, []byte{}) 170 b.lsHasChild = append(b.lsHasChild, []uint64{}) 171 b.lsLoudsBits = append(b.lsLoudsBits, []uint64{}) 172 b.hasPrefix = append(b.hasPrefix, []uint64{}) 173 b.suffixes = append(b.suffixes, []uint64{}) 174 b.suffixCounts = append(b.suffixCounts, 0) 175 b.values = append(b.values, []byte{}) 176 b.valueCounts = append(b.valueCounts, 0) 177 b.prefixes = append(b.prefixes, [][]byte{}) 178 179 b.nodeCounts = append(b.nodeCounts, 0) 180 b.isLastItemTerminator = append(b.isLastItemTerminator, false) 181 182 level := b.treeHeight() - 1 183 b.lsHasChild[level] = append(b.lsHasChild[level], 0) 184 b.lsLoudsBits[level] = append(b.lsLoudsBits[level], 0) 185 b.hasPrefix[level] = append(b.hasPrefix[level], 0) 186 } 187 188 func (b *Builder) moveToNextItemSlot(level int) { 189 if b.numItems(level)%wordSize == 0 { 190 b.lsHasChild[level] = append(b.lsHasChild[level], 0) 191 b.lsLoudsBits[level] = append(b.lsLoudsBits[level], 0) 192 } 193 } 194 195 func (b *Builder) insertSuffix(key []byte, level, depth int) { 196 if level >= b.treeHeight() { 197 b.addLevel() 198 } 199 suffix := constructSuffix(key, uint32(depth)+1, b.realSuffixLen, b.hashSuffixLen) 200 201 suffixLen := b.suffixLen() 202 pos := b.suffixCounts[level] * suffixLen 203 if pos == uint32(len(b.suffixes[level])*wordSize) { 204 b.suffixes[level] = append(b.suffixes[level], 0) 205 } 206 wordID := pos / wordSize 207 offset := pos % wordSize 208 remain := wordSize - offset 209 b.suffixes[level][wordID] |= suffix << offset 210 if suffixLen > remain { 211 b.suffixes[level] = append(b.suffixes[level], suffix>>remain) 212 } 213 b.suffixCounts[level]++ 214 } 215 216 func (b *Builder) insertValue(value []byte, level int) { 217 b.values[level] = append(b.values[level], value[:b.valueSize]...) 218 b.valueCounts[level]++ 219 } 220 221 func (b *Builder) insertPrefix(prefix []byte, level int) { 222 b.prefixes[level] = append(b.prefixes[level], append([]byte{}, prefix...)) 223 } 224 225 func (b *Builder) determineCutoffLevel(bitsPerKeyHint int) { 226 height := b.treeHeight() 227 if height == 0 { 228 return 229 } 230 231 sizeHint := uint64(b.totalCount * bitsPerKeyHint) 232 suffixSize := uint64(b.totalCount) * uint64(b.suffixLen()) 233 var prefixSize uint64 234 for _, l := range b.prefixes { 235 for _, p := range l { 236 prefixSize += uint64(len(p)) * 8 237 } 238 } 239 for _, nc := range b.nodeCounts { 240 prefixSize += uint64(nc) 241 } 242 243 var level int 244 // Begins from last level to make the height of dense levels as large as possible. 245 for level = height - 1; level > 0; level-- { 246 ds := b.denseSizeNoSuffix(level) 247 ss := b.sparseSizeNoSuffix(level) 248 sz := ds + ss + suffixSize + prefixSize 249 if sz <= sizeHint { 250 break 251 } 252 } 253 b.sparseStartLevel = uint32(level) 254 } 255 256 func (b *Builder) denseSizeNoSuffix(level int) uint64 { 257 var total uint64 258 for l := 0; l < level; l++ { 259 total += uint64(2 * denseFanout * b.nodeCounts[l]) 260 if l > 0 { 261 total += uint64(b.nodeCounts[l-1]) 262 } 263 } 264 return total 265 } 266 267 func (b *Builder) sparseSizeNoSuffix(level int) uint64 { 268 var total uint64 269 height := b.treeHeight() 270 for l := level; l < height; l++ { 271 n := uint64(len(b.lsLabels[l])) 272 total += n*8 + 2*n 273 } 274 return total 275 } 276 277 func (b *Builder) setLabelAndHasChildVec(level int, nodeID, pos uint32) { 278 label := b.lsLabels[level][pos] 279 setBit(b.ldLabels[level], nodeID*denseFanout+uint32(label)) 280 if readBit(b.lsHasChild[level], pos) { 281 setBit(b.ldHasChild[level], nodeID*denseFanout+uint32(label)) 282 } 283 } 284 285 func (b *Builder) initDenseVectors(level int) { 286 vecLength := b.nodeCounts[level] * (denseFanout / wordSize) 287 prefixVecLen := b.nodeCounts[level] / wordSize 288 if b.nodeCounts[level]%wordSize != 0 { 289 prefixVecLen++ 290 } 291 292 b.ldLabels = append(b.ldLabels, make([]uint64, vecLength)) 293 b.ldHasChild = append(b.ldHasChild, make([]uint64, vecLength)) 294 b.ldIsPrefix = append(b.ldIsPrefix, make([]uint64, prefixVecLen)) 295 } 296 297 func (b *Builder) isStartOfNode(level int, pos uint32) bool { 298 return readBit(b.lsLoudsBits[level], pos) 299 } 300 301 func (b *Builder) isTerminator(level int, pos uint32) bool { 302 label := b.lsLabels[level][pos] 303 return (label == labelTerminator) && !readBit(b.lsHasChild[level], pos) 304 }