github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/surf/builder.go (about)

     1  package surf
     2  
     3  // Builder is builder of SuRF.
     4  type Builder struct {
     5  	sparseStartLevel uint32
     6  	valueSize        uint32
     7  	totalCount       int
     8  
     9  	// LOUDS-Sparse bitvecs
    10  	lsLabels    [][]byte
    11  	lsHasChild  [][]uint64
    12  	lsLoudsBits [][]uint64
    13  
    14  	// LOUDS-Dense bitvecs
    15  	ldLabels   [][]uint64
    16  	ldHasChild [][]uint64
    17  	ldIsPrefix [][]uint64
    18  
    19  	// suffix
    20  	hashSuffixLen uint32
    21  	realSuffixLen uint32
    22  	suffixes      [][]uint64
    23  	suffixCounts  []uint32
    24  
    25  	// value
    26  	values      [][]byte
    27  	valueCounts []uint32
    28  
    29  	// prefix
    30  	hasPrefix [][]uint64
    31  	prefixes  [][][]byte
    32  
    33  	nodeCounts           []uint32
    34  	isLastItemTerminator []bool
    35  }
    36  
    37  // NewBuilder returns a new SuRF builder.
    38  func NewBuilder(valueSize uint32, hashSuffixLen, realSuffixLen uint32) *Builder {
    39  	return &Builder{
    40  		valueSize:     valueSize,
    41  		hashSuffixLen: hashSuffixLen,
    42  		realSuffixLen: realSuffixLen,
    43  	}
    44  }
    45  
    46  // Build returns the SuRF for added kv pairs.
    47  // The bitsPerKeyHint is a size hint used when determine how many levels can use the dense-loudes format.
    48  // The dense-loudes format is faster than sparse-loudes format, but may consume more space.
    49  func (b *Builder) Build(keys, vals [][]byte, bitsPerKeyHint int) *SuRF {
    50  	b.totalCount = len(keys)
    51  	b.buildNodes(keys, vals, 0, 0, 0)
    52  	b.determineCutoffLevel(bitsPerKeyHint)
    53  	b.buildDense()
    54  
    55  	surf := new(SuRF)
    56  	surf.ld.Init(b)
    57  	surf.ls.Init(b)
    58  	return surf
    59  }
    60  
    61  // buildNodes is recursive algorithm to bulk building SuRF nodes.
    62  //	* We divide keys into groups by the `key[depth]`, so keys in each group shares the same prefix
    63  //	* If depth larger than the length if the first key in group, the key is prefix of others in group
    64  //	  So we should append `labelTerminator` to labels and update `b.isLastItemTerminator`, then remove it from group.
    65  //	* Scan over keys in current group when meets different label, use the new sub group call buildNodes with level+1 recursively
    66  //	* If all keys in current group have the same label, this node can be compressed, use this group call buildNodes with level recursively.
    67  //	* If current group contains only one key constract suffix of this key and return.
    68  func (b *Builder) buildNodes(keys, vals [][]byte, prefixDepth, depth, level int) {
    69  	b.ensureLevel(level)
    70  	nodeStartPos := b.numItems(level)
    71  
    72  	groupStart := 0
    73  	if depth >= len(keys[groupStart]) {
    74  		b.lsLabels[level] = append(b.lsLabels[level], labelTerminator)
    75  		b.isLastItemTerminator[level] = true
    76  		b.insertSuffix(keys[groupStart], level, depth)
    77  		b.insertValue(vals[groupStart], level)
    78  		b.moveToNextItemSlot(level)
    79  		groupStart++
    80  	}
    81  
    82  	for groupEnd := groupStart; groupEnd <= len(keys); groupEnd++ {
    83  		if groupEnd < len(keys) && keys[groupStart][depth] == keys[groupEnd][depth] {
    84  			continue
    85  		}
    86  
    87  		if groupEnd == len(keys) && groupStart == 0 && groupEnd-groupStart != 1 {
    88  			// node at this level is one-way node, compress it to next node
    89  			b.buildNodes(keys, vals, prefixDepth, depth+1, level)
    90  			return
    91  		}
    92  
    93  		b.lsLabels[level] = append(b.lsLabels[level], keys[groupStart][depth])
    94  		b.moveToNextItemSlot(level)
    95  		if groupEnd-groupStart == 1 {
    96  			b.insertSuffix(keys[groupStart], level, depth)
    97  			b.insertValue(vals[groupStart], level)
    98  		} else {
    99  			setBit(b.lsHasChild[level], b.numItems(level)-1)
   100  			b.buildNodes(keys[groupStart:groupEnd], vals[groupStart:groupEnd], depth+1, depth+1, level+1)
   101  		}
   102  
   103  		groupStart = groupEnd
   104  	}
   105  
   106  	// check if current node contains compressed path.
   107  	if depth-prefixDepth > 0 {
   108  		prefix := keys[0][prefixDepth:depth]
   109  		setBit(b.hasPrefix[level], b.nodeCounts[level])
   110  		b.insertPrefix(prefix, level)
   111  	}
   112  	setBit(b.lsLoudsBits[level], nodeStartPos)
   113  
   114  	b.nodeCounts[level]++
   115  	if b.nodeCounts[level]%wordSize == 0 {
   116  		b.hasPrefix[level] = append(b.hasPrefix[level], 0)
   117  	}
   118  }
   119  
   120  func (b *Builder) buildDense() {
   121  	var level int
   122  	for level = 0; uint32(level) < b.sparseStartLevel; level++ {
   123  		b.initDenseVectors(level)
   124  		if b.numItems(level) == 0 {
   125  			continue
   126  		}
   127  
   128  		var nodeID uint32
   129  		if b.isTerminator(level, 0) {
   130  			setBit(b.ldIsPrefix[level], 0)
   131  		} else {
   132  			b.setLabelAndHasChildVec(level, nodeID, 0)
   133  		}
   134  
   135  		var pos uint32
   136  		numItems := b.numItems(level)
   137  		for pos = 1; pos < numItems; pos++ {
   138  			if b.isStartOfNode(level, pos) {
   139  				nodeID++
   140  				if b.isTerminator(level, pos) {
   141  					setBit(b.ldIsPrefix[level], nodeID)
   142  					continue
   143  				}
   144  			}
   145  			b.setLabelAndHasChildVec(level, nodeID, pos)
   146  		}
   147  	}
   148  }
   149  
   150  func (b *Builder) ensureLevel(level int) {
   151  	if level >= b.treeHeight() {
   152  		b.addLevel()
   153  	}
   154  }
   155  
   156  func (b *Builder) suffixLen() uint32 {
   157  	return b.hashSuffixLen + b.realSuffixLen
   158  }
   159  
   160  func (b *Builder) treeHeight() int {
   161  	return len(b.nodeCounts)
   162  }
   163  
   164  func (b *Builder) numItems(level int) uint32 {
   165  	return uint32(len(b.lsLabels[level]))
   166  }
   167  
   168  func (b *Builder) addLevel() {
   169  	b.lsLabels = append(b.lsLabels, []byte{})
   170  	b.lsHasChild = append(b.lsHasChild, []uint64{})
   171  	b.lsLoudsBits = append(b.lsLoudsBits, []uint64{})
   172  	b.hasPrefix = append(b.hasPrefix, []uint64{})
   173  	b.suffixes = append(b.suffixes, []uint64{})
   174  	b.suffixCounts = append(b.suffixCounts, 0)
   175  	b.values = append(b.values, []byte{})
   176  	b.valueCounts = append(b.valueCounts, 0)
   177  	b.prefixes = append(b.prefixes, [][]byte{})
   178  
   179  	b.nodeCounts = append(b.nodeCounts, 0)
   180  	b.isLastItemTerminator = append(b.isLastItemTerminator, false)
   181  
   182  	level := b.treeHeight() - 1
   183  	b.lsHasChild[level] = append(b.lsHasChild[level], 0)
   184  	b.lsLoudsBits[level] = append(b.lsLoudsBits[level], 0)
   185  	b.hasPrefix[level] = append(b.hasPrefix[level], 0)
   186  }
   187  
   188  func (b *Builder) moveToNextItemSlot(level int) {
   189  	if b.numItems(level)%wordSize == 0 {
   190  		b.lsHasChild[level] = append(b.lsHasChild[level], 0)
   191  		b.lsLoudsBits[level] = append(b.lsLoudsBits[level], 0)
   192  	}
   193  }
   194  
   195  func (b *Builder) insertSuffix(key []byte, level, depth int) {
   196  	if level >= b.treeHeight() {
   197  		b.addLevel()
   198  	}
   199  	suffix := constructSuffix(key, uint32(depth)+1, b.realSuffixLen, b.hashSuffixLen)
   200  
   201  	suffixLen := b.suffixLen()
   202  	pos := b.suffixCounts[level] * suffixLen
   203  	if pos == uint32(len(b.suffixes[level])*wordSize) {
   204  		b.suffixes[level] = append(b.suffixes[level], 0)
   205  	}
   206  	wordID := pos / wordSize
   207  	offset := pos % wordSize
   208  	remain := wordSize - offset
   209  	b.suffixes[level][wordID] |= suffix << offset
   210  	if suffixLen > remain {
   211  		b.suffixes[level] = append(b.suffixes[level], suffix>>remain)
   212  	}
   213  	b.suffixCounts[level]++
   214  }
   215  
   216  func (b *Builder) insertValue(value []byte, level int) {
   217  	b.values[level] = append(b.values[level], value[:b.valueSize]...)
   218  	b.valueCounts[level]++
   219  }
   220  
   221  func (b *Builder) insertPrefix(prefix []byte, level int) {
   222  	b.prefixes[level] = append(b.prefixes[level], append([]byte{}, prefix...))
   223  }
   224  
   225  func (b *Builder) determineCutoffLevel(bitsPerKeyHint int) {
   226  	height := b.treeHeight()
   227  	if height == 0 {
   228  		return
   229  	}
   230  
   231  	sizeHint := uint64(b.totalCount * bitsPerKeyHint)
   232  	suffixSize := uint64(b.totalCount) * uint64(b.suffixLen())
   233  	var prefixSize uint64
   234  	for _, l := range b.prefixes {
   235  		for _, p := range l {
   236  			prefixSize += uint64(len(p)) * 8
   237  		}
   238  	}
   239  	for _, nc := range b.nodeCounts {
   240  		prefixSize += uint64(nc)
   241  	}
   242  
   243  	var level int
   244  	// Begins from last level to make the height of dense levels as large as possible.
   245  	for level = height - 1; level > 0; level-- {
   246  		ds := b.denseSizeNoSuffix(level)
   247  		ss := b.sparseSizeNoSuffix(level)
   248  		sz := ds + ss + suffixSize + prefixSize
   249  		if sz <= sizeHint {
   250  			break
   251  		}
   252  	}
   253  	b.sparseStartLevel = uint32(level)
   254  }
   255  
   256  func (b *Builder) denseSizeNoSuffix(level int) uint64 {
   257  	var total uint64
   258  	for l := 0; l < level; l++ {
   259  		total += uint64(2 * denseFanout * b.nodeCounts[l])
   260  		if l > 0 {
   261  			total += uint64(b.nodeCounts[l-1])
   262  		}
   263  	}
   264  	return total
   265  }
   266  
   267  func (b *Builder) sparseSizeNoSuffix(level int) uint64 {
   268  	var total uint64
   269  	height := b.treeHeight()
   270  	for l := level; l < height; l++ {
   271  		n := uint64(len(b.lsLabels[l]))
   272  		total += n*8 + 2*n
   273  	}
   274  	return total
   275  }
   276  
   277  func (b *Builder) setLabelAndHasChildVec(level int, nodeID, pos uint32) {
   278  	label := b.lsLabels[level][pos]
   279  	setBit(b.ldLabels[level], nodeID*denseFanout+uint32(label))
   280  	if readBit(b.lsHasChild[level], pos) {
   281  		setBit(b.ldHasChild[level], nodeID*denseFanout+uint32(label))
   282  	}
   283  }
   284  
   285  func (b *Builder) initDenseVectors(level int) {
   286  	vecLength := b.nodeCounts[level] * (denseFanout / wordSize)
   287  	prefixVecLen := b.nodeCounts[level] / wordSize
   288  	if b.nodeCounts[level]%wordSize != 0 {
   289  		prefixVecLen++
   290  	}
   291  
   292  	b.ldLabels = append(b.ldLabels, make([]uint64, vecLength))
   293  	b.ldHasChild = append(b.ldHasChild, make([]uint64, vecLength))
   294  	b.ldIsPrefix = append(b.ldIsPrefix, make([]uint64, prefixVecLen))
   295  }
   296  
   297  func (b *Builder) isStartOfNode(level int, pos uint32) bool {
   298  	return readBit(b.lsLoudsBits[level], pos)
   299  }
   300  
   301  func (b *Builder) isTerminator(level int, pos uint32) bool {
   302  	label := b.lsLabels[level][pos]
   303  	return (label == labelTerminator) && !readBit(b.lsHasChild[level], pos)
   304  }