github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/hashtable.go

     1  // Copyright 2020 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package colexec
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/col/coldata"
    18  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    23  )
    24  
    25  // TODO(yuzefovich): support rehashing instead of large fixed bucket size.
    26  const hashTableNumBuckets = 1 << 16
    27  
    28  // hashTableBuildMode represents different modes in which the hashTable can be
    29  // built.
    30  type hashTableBuildMode int
    31  
    32  const (
    33  	// hashTableFullBuildMode is the mode where hashTable buffers all input
    34  	// tuples and populates first and next arrays for each hash bucket.
    35  	hashTableFullBuildMode hashTableBuildMode = iota
    36  
    37  	// hashTableDistinctBuildMode is the mode where hashTable only buffers
    38  	// distinct tuples and discards the duplicates.
    39  	hashTableDistinctBuildMode
    40  )
    41  
    42  // hashTableProbeMode represents different modes of probing the hashTable.
    43  type hashTableProbeMode int
    44  
    45  const (
    46  	// hashTableDefaultProbeMode is the default probing mode of the hashTable.
    47  	hashTableDefaultProbeMode hashTableProbeMode = iota
    48  
    49  	// hashTableDeletingProbeMode is the mode of probing the hashTable in which
    50  	// it "deletes" the tuples from itself once they are matched against
    51  	// probing tuples.
    52  	// For example, if we have a hashTable consisting of tuples {1, 1}, {1, 2},
    53  	// {2, 3}, and the probing tuples are {1, 4}, {1, 5}, {1, 6}, then we get
    54  	// the following when probing on the first column only:
    55  	//   {1, 4} -> {1, 1}   | hashTable = {1, 2}, {2, 3}
    56  	//   {1, 5} -> {1, 2}   | hashTable = {2, 3}
    57  	//   {1, 6} -> no match | hashTable = {2, 3}
    58  	// Note that the output of such probing is not fully deterministic when
    59  	// tuples contain non-equality columns.
    60  	hashTableDeletingProbeMode
    61  )
    62  
    63  // hashTableBuildBuffer stores the information related to the build table.
    64  type hashTableBuildBuffer struct {
    65  	// first stores the first keyID of the key that resides in each bucket.
    66  	// This keyID is used to determine the corresponding equality column key as
    67  	// well as output column values.
    68  	first []uint64
    69  
    70  	// next is a densely-packed list that stores the keyID of the next key in the
    71  	// hash table bucket chain, where an id of 0 is reserved to represent the
    72  	// end of the chain.
    73  	next []uint64
    74  }
    75  
    76  // hashTableProbeBuffer stores the information related to the probe table.
    77  type hashTableProbeBuffer struct {
    78  	// first stores the first keyID of the key that resides in each bucket.
    79  	// This keyID is used to determine the corresponding equality column key as
    80  	// well as output column values.
    81  	first []uint64
    82  
    83  	// next is a densely-packed list that stores the keyID of the next key in the
    84  	// hash table bucket chain, where an id of 0 is reserved to represent the
    85  	// end of the chain.
    86  	next []uint64
    87  
    88  	// headID stores the first build table keyID that matched with the probe batch
    89  	// key at any given index.
    90  	headID []uint64
    91  
    92  	// differs stores whether the key at any index differs from the build table
    93  	// key.
    94  	differs []bool
    95  
    96  	// distinct stores whether the key in the probe batch is distinct in the build
    97  	// table.
    98  	distinct []bool
    99  
   100  	// keys stores the equality columns on the probe table for a single batch.
   101  	keys []coldata.Vec
   102  	// buckets is used to store the computed hash value of each key in a single
   103  	// batch.
   104  	buckets []uint64
   105  
   106  	// groupID stores the keyID that maps to the joining rows of the build table.
   107  	// The ith element of groupID stores the keyID of the build table that
   108  	// corresponds to the ith key in the probe table.
   109  	groupID []uint64
   110  	// toCheck stores the indices of the eqCol rows that have yet to be found or
   111  	// rejected.
   112  	toCheck []uint64
   113  
   114  	// hashBuffer stores the hash values of each tuple in the probe table. It will
   115  	// be dynamically updated when the hashTable is built in distinct mode.
   116  	hashBuffer []uint64
   117  }
   118  
   119  // hashTable is a structure used by the hash joiner to store the build table
   120  // batches. Keys are stored according to the encoding of the equality column,
   121  // which points to the corresponding output keyID. The keyID is calculated
   122  // using the below equation:
   123  //
   124  // keyID = keys.indexOf(key) + 1
   125  //
   126  // and inversely:
   127  //
   128  // keys[keyID - 1] = key
   129  //
   130  // The table can then be probed in column batches to find at most one matching
   131  // row per column batch row.
   132  type hashTable struct {
   133  	allocator *colmem.Allocator
   134  
   135  	// buildScratch contains the scratch buffers required for the build table.
   136  	buildScratch hashTableBuildBuffer
   137  
   138  	// probeScratch contains the scratch buffers required for the probe table.
   139  	probeScratch hashTableProbeBuffer
   140  
   141  	// same and visited are only used when the hashTable contains non-distinct
   142  	// keys.
   143  	//
   144  	// same is a densely-packed list that stores the keyID of the next key in the
   145  	// hash table that has the same value as the current key. The headID of the key
   146  	// is the first key of that value found in the next linked list. This field
   147  	// will be lazily populated by the prober.
   148  	same []uint64
   149  	// visited represents whether each of the corresponding keys has been touched
   150  	// by the prober.
   151  	visited []bool
   152  
   153  	// vals stores the union of the equality and output columns of the build
   154  	// table. A key tuple is defined as the elements in each row of vals that
   155  	// make up the equality columns. The ID of a key at any index of vals is
   156  	// index + 1.
   157  	vals *appendOnlyBufferedBatch
   158  	// keyCols stores the indices of vals which are key columns.
   159  	keyCols []uint32
   160  
   161  	// numBuckets is the number of buckets the hashTable employs. This is
   162  	// equivalent to the size of first.
   163  	numBuckets uint64
   164  
   165  	// allowNullEquality determines if NULL keys should be treated as equal to
   166  	// each other.
   167  	allowNullEquality bool
   168  
   169  	overloadHelper overloadHelper
   170  	datumAlloc     sqlbase.DatumAlloc
   171  	cancelChecker  CancelChecker
   172  
   173  	buildMode hashTableBuildMode
   174  	probeMode hashTableProbeMode
   175  }
   176  
   177  var _ resetter = &hashTable{}
   178  
   179  func newHashTable(
   180  	allocator *colmem.Allocator,
   181  	numBuckets uint64,
   182  	sourceTypes []*types.T,
   183  	eqCols []uint32,
   184  	allowNullEquality bool,
   185  	buildMode hashTableBuildMode,
   186  	probeMode hashTableProbeMode,
   187  ) *hashTable {
   188  	if !allowNullEquality && probeMode == hashTableDeletingProbeMode {
   189  		// At the moment, we don't have a use case for such behavior, so let's
   190  		// assert that it is not requested.
   191  		colexecerror.InternalError("hashTableDeletingProbeMode is supported only when null equality is allowed")
   192  	}
   193  	ht := &hashTable{
   194  		allocator: allocator,
   195  
   196  		buildScratch: hashTableBuildBuffer{
   197  			first: make([]uint64, numBuckets),
   198  		},
   199  
   200  		probeScratch: hashTableProbeBuffer{
   201  			keys:    make([]coldata.Vec, len(eqCols)),
   202  			buckets: make([]uint64, coldata.BatchSize()),
   203  			groupID: make([]uint64, coldata.BatchSize()),
   204  			headID:  make([]uint64, coldata.BatchSize()),
   205  			toCheck: make([]uint64, coldata.BatchSize()),
   206  			differs: make([]bool, coldata.BatchSize()),
   207  		},
   208  
   209  		vals:              newAppendOnlyBufferedBatch(allocator, sourceTypes, 0 /* initialSize */),
   210  		keyCols:           eqCols,
   211  		numBuckets:        numBuckets,
   212  		allowNullEquality: allowNullEquality,
   213  		buildMode:         buildMode,
   214  		probeMode:         probeMode,
   215  	}
   216  
   217  	if buildMode == hashTableDistinctBuildMode {
   218  		ht.probeScratch.first = make([]uint64, numBuckets)
   219  		ht.probeScratch.next = make([]uint64, coldata.BatchSize()+1)
   220  		ht.buildScratch.next = make([]uint64, 1, coldata.BatchSize()+1)
   221  		ht.probeScratch.hashBuffer = make([]uint64, coldata.BatchSize())
   222  		ht.probeScratch.distinct = make([]bool, coldata.BatchSize())
   223  	}
   224  
   225  	return ht
   226  }
   227  
   228  // build executes the entirety of the hash table build phase using the input
   229  // as the build source. The input is entirely consumed in the process.
   230  func (ht *hashTable) build(ctx context.Context, input colexecbase.Operator) {
   231  	nKeyCols := len(ht.keyCols)
   232  
   233  	switch ht.buildMode {
   234  	case hashTableFullBuildMode:
   235  		for {
   236  			batch := input.Next(ctx)
   237  			if batch.Length() == 0 {
   238  				break
   239  			}
   240  
   241  			ht.allocator.PerformOperation(ht.vals.ColVecs(), func() {
   242  				ht.vals.append(batch, 0 /* startIdx */, batch.Length())
   243  			})
   244  		}
   245  
   246  		keyCols := make([]coldata.Vec, nKeyCols)
   247  		for i := 0; i < nKeyCols; i++ {
   248  			keyCols[i] = ht.vals.ColVec(int(ht.keyCols[i]))
   249  		}
   250  
   251  		// ht.buildScratch.next is used to store the computed hash value of each key.
   252  		ht.buildScratch.next = maybeAllocateUint64Array(ht.buildScratch.next, ht.vals.Length()+1)
   253  		ht.computeBuckets(ctx, ht.buildScratch.next[1:], keyCols, ht.vals.Length(), nil)
   254  		ht.buildNextChains(ctx, ht.buildScratch.first, ht.buildScratch.next, 1, ht.vals.Length())
   255  	case hashTableDistinctBuildMode:
   256  		for {
   257  			batch := input.Next(ctx)
   258  			if batch.Length() == 0 {
   259  				break
   260  			}
   261  
   262  			srcVecs := batch.ColVecs()
   263  
   264  			for i := 0; i < nKeyCols; i++ {
   265  				ht.probeScratch.keys[i] = srcVecs[ht.keyCols[i]]
   266  			}
   267  
   268  			ht.computeBuckets(ctx, ht.probeScratch.next[1:], ht.probeScratch.keys, batch.Length(), batch.Selection())
   269  			copy(ht.probeScratch.hashBuffer, ht.probeScratch.next[1:])
   270  
   271  			// We should not zero out the entire `first` buffer here since its size
   272  			// is the same as the hash range (2^16 by default). The size of the
   273  			// hashBuffer is the same as the batch size, which is often a lot smaller
   274  			// than the hash range. Since we are only concerned with the tuples in
   275  			// the hashBuffer, we only need to zero out the entries of `first` that
   276  			// correspond to hash values occurring in the hashBuffer.
   277  			for _, hash := range ht.probeScratch.hashBuffer[:batch.Length()] {
   278  				ht.probeScratch.first[hash] = 0
   279  			}
   280  
   281  			ht.buildNextChains(ctx, ht.probeScratch.first, ht.probeScratch.next, 1, batch.Length())
   282  
   283  			ht.removeDuplicates(batch, ht.probeScratch.keys, ht.probeScratch.first, ht.probeScratch.next, ht.checkProbeForDistinct)
   284  
   285  			numBuffered := ht.vals.Length()
   286  			// We only check for duplicates when there is at least one buffered
   287  			// tuple.
   288  			if numBuffered > 0 {
   289  				ht.removeDuplicates(batch, ht.probeScratch.keys, ht.buildScratch.first, ht.buildScratch.next, ht.checkBuildForDistinct)
   290  			}
   291  
   292  			ht.allocator.PerformOperation(ht.vals.ColVecs(), func() {
   293  				ht.vals.append(batch, 0 /* startIdx */, batch.Length())
   294  			})
   295  
   296  			ht.buildScratch.next = append(ht.buildScratch.next, ht.probeScratch.hashBuffer[:batch.Length()]...)
   297  			ht.buildNextChains(ctx, ht.buildScratch.first, ht.buildScratch.next, numBuffered+1, batch.Length())
   298  		}
   299  	default:
   300  		colexecerror.InternalError(fmt.Sprintf("unsupported hash table build mode: %d", ht.buildMode))
   301  	}
   302  }
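
// The following is an illustrative sketch, not part of the original file: it
// shows how a caller (such as the hash joiner) might construct a hashTable and
// run the build phase in full build mode. The allocator, source types, and
// input operator are assumed to be supplied by the caller; equality on the
// first column is an arbitrary choice for the example.
func exampleBuildHashTable(
	ctx context.Context,
	allocator *colmem.Allocator,
	sourceTypes []*types.T,
	input colexecbase.Operator,
) *hashTable {
	ht := newHashTable(
		allocator,
		hashTableNumBuckets,
		sourceTypes,
		[]uint32{0}, /* eqCols */
		false,       /* allowNullEquality */
		hashTableFullBuildMode,
		hashTableDefaultProbeMode,
	)
	// build fully consumes the input, buffering all tuples into ht.vals and
	// populating the first/next chains that probing relies on.
	ht.build(ctx, input)
	return ht
}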
   303  
   304  // removeDuplicates checks the tuples in the probe table against another table
   305  // and updates the selection vector of the probe table to only include distinct
   306  // tuples. The table removeDuplicates checks against is specified by the
   307  // `first` and `next` vectors and by `duplicatesChecker`. `duplicatesChecker`
   308  // takes a slice of key columns of the probe table, the number of tuples to
   309  // check, and the selection vector of the probe table, and returns the number
   310  // of tuples that still need to be checked in the next iteration. It populates
   311  // ht.probeScratch.headID to point to the keyIDs that need to be included in
   312  // the probe table's selection vector.
   313  // NOTE: *first* and *next* vectors should be properly populated.
   314  func (ht *hashTable) removeDuplicates(
   315  	batch coldata.Batch,
   316  	keyCols []coldata.Vec,
   317  	first, next []uint64,
   318  	duplicatesChecker func([]coldata.Vec, uint64, []int) uint64,
   319  ) {
   320  	nToCheck := uint64(batch.Length())
   321  	sel := batch.Selection()
   322  
   323  	for i := uint64(0); i < nToCheck; i++ {
   324  		ht.probeScratch.groupID[i] = first[ht.probeScratch.hashBuffer[i]]
   325  		ht.probeScratch.toCheck[i] = i
   326  	}
   327  
   328  	for nToCheck > 0 {
   329  		// Continue searching for the build table matching keys while the toCheck
   330  		// array is non-empty.
   331  		nToCheck = duplicatesChecker(keyCols, nToCheck, sel)
   332  		ht.findNext(next, nToCheck)
   333  	}
   334  
   335  	ht.updateSel(batch)
   336  }
   337  
   338  // checkCols performs a column by column checkCol on the key columns.
   339  func (ht *hashTable) checkCols(
   340  	probeVecs, buildVecs []coldata.Vec, buildKeyCols []uint32, nToCheck uint64, probeSel []int,
   341  ) {
   342  	switch ht.probeMode {
   343  	case hashTableDefaultProbeMode:
   344  		for i := range ht.keyCols {
   345  			ht.checkCol(probeVecs[i], buildVecs[buildKeyCols[i]], i, nToCheck, probeSel)
   346  		}
   347  	case hashTableDeletingProbeMode:
   348  		for i := range ht.keyCols {
   349  			ht.checkColDeleting(probeVecs[i], buildVecs[buildKeyCols[i]], i, nToCheck, probeSel)
   350  		}
   351  	default:
   352  		colexecerror.InternalError(fmt.Sprintf("unsupported hash table probe mode: %d", ht.probeMode))
   353  	}
   354  }
   355  
   356  // checkColsForDistinctTuples performs a column by column check to find distinct
   357  // tuples in the probe table that are not present in the build table.
   358  func (ht *hashTable) checkColsForDistinctTuples(
   359  	probeVecs []coldata.Vec, nToCheck uint64, probeSel []int,
   360  ) {
   361  	buildVecs := ht.vals.ColVecs()
   362  	for i := range ht.keyCols {
   363  		probeVec := probeVecs[i]
   364  		buildVec := buildVecs[ht.keyCols[i]]
   365  
   366  		ht.checkColForDistinctTuples(probeVec, buildVec, nToCheck, probeSel)
   367  	}
   368  }
   369  
   370  // computeBuckets computes the hash value of each key and stores the result in
   371  // buckets.
   372  func (ht *hashTable) computeBuckets(
   373  	ctx context.Context, buckets []uint64, keys []coldata.Vec, nKeys int, sel []int,
   374  ) {
   375  	initHash(buckets, nKeys, defaultInitHashValue)
   376  
   377  	if nKeys == 0 {
   378  		// No work to do - avoid doing the loops below.
   379  		return
   380  	}
   381  
   382  	for i := range ht.keyCols {
   383  		rehash(ctx, buckets, keys[i], nKeys, sel, ht.cancelChecker, ht.overloadHelper, &ht.datumAlloc)
   384  	}
   385  
   386  	finalizeHash(buckets, nKeys, ht.numBuckets)
   387  }
   388  
   389  // buildNextChains builds the hash map from the computed hash values.
   390  func (ht *hashTable) buildNextChains(
   391  	ctx context.Context, first, next []uint64, offset, batchSize int,
   392  ) {
   393  	// The loop direction here is reversed to ensure that when we are building
   394  	// the next chain for the probe table, the keyIDs in each equality chain
   395  	// inside `next` are strictly in ascending order. This is crucial to ensure
   396  	// that, when built in distinct mode, the hash table emits the first distinct
   397  	// tuple it encounters. When the next chain is built for the build side, this
   398  	// invariant no longer holds for the equality chains inside `next`. This is
   399  	// ok, however, for the build side since all tuples buffered on the build
   400  	// side are already distinct; therefore, we can be sure that when we emit a
   401  	// tuple, there cannot be other tuples with the same key.
   402  	for id := offset + batchSize - 1; id >= offset; id-- {
   403  		ht.cancelChecker.check(ctx)
   404  		// The keyID is stored into the corresponding hash bucket at the front of
   405  		// the next chain.
   406  		hash := next[id]
   407  		firstKeyID := first[hash]
   408  		// This is to ensure that `first` always points to the tuple with the
   409  		// smallest keyID in each equality chain. firstKeyID == 0 means this is the
   410  		// first tuple that we have encountered with the given hash value.
   411  		if firstKeyID == 0 || uint64(id) < firstKeyID {
   412  			next[id] = first[hash]
   413  			first[hash] = uint64(id)
   414  		} else {
   415  			next[id] = next[firstKeyID]
   416  			next[firstKeyID] = uint64(id)
   417  		}
   418  	}
   419  }
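
// An illustrative trace of the loop above (not part of the original file):
// with offset = 1, batchSize = 3, and next[1:] = {5, 5, 5} (all three tuples
// hash to bucket 5), iterating id = 3, 2, 1 gives
//   id = 3: first[5] = 3, next[3] = 0
//   id = 2: first[5] = 2, next[2] = 3
//   id = 1: first[5] = 1, next[1] = 2
// so the equality chain rooted at first[5] is 1 -> 2 -> 3, i.e. the keyIDs
// appear in ascending order, as required for distinct mode.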
   420  
   421  // maybeAllocate* methods make sure that the passed-in slice is allocated, has
   422  // the desired length, and is zeroed out.
   423  func maybeAllocateUint64Array(array []uint64, length int) []uint64 {
   424  	if array == nil || cap(array) < length {
   425  		return make([]uint64, length)
   426  	}
   427  	array = array[:length]
   428  	for n := 0; n < length; n += copy(array[n:], zeroUint64Column) {
   429  	}
   430  	return array
   431  }
   432  
   433  func maybeAllocateBoolArray(array []bool, length int) []bool {
   434  	if array == nil || cap(array) < length {
   435  		return make([]bool, length)
   436  	}
   437  	array = array[:length]
   438  	for n := 0; n < length; n += copy(array[n:], zeroBoolColumn) {
   439  	}
   440  	return array
   441  }
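
// The loops above zero a reused slice in chunks: copy returns the number of
// elements written (at most len(zeroUint64Column) or len(zeroBoolColumn) per
// iteration), so n advances until the whole slice has been overwritten. Below
// is a minimal standalone sketch of the same idiom, assuming a local zero
// buffer instead of the package-level zero columns; it is illustrative only
// and not part of the original file.
func zeroUint64Slice(dst []uint64) {
	// Any fixed-size zero-valued buffer works as the copy source.
	var zeroes [256]uint64
	for n := 0; n < len(dst); n += copy(dst[n:], zeroes[:]) {
	}
}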
   442  
   443  func (ht *hashTable) maybeAllocateSameAndVisited() {
   444  	ht.same = maybeAllocateUint64Array(ht.same, ht.vals.Length()+1)
   445  	ht.visited = maybeAllocateBoolArray(ht.visited, ht.vals.Length()+1)
   446  	// Since keyID = 0 is reserved for end of list, it can be marked as visited
   447  	// at the beginning.
   448  	ht.visited[0] = true
   449  }
   450  
   451  // lookupInitial finds the corresponding hash table buckets for the equality
   452  // column of the batch and stores the results in groupID. It also initializes
   453  // toCheck with all indices in the range [0, batchSize).
   454  func (ht *hashTable) lookupInitial(ctx context.Context, batchSize int, sel []int) {
   455  	ht.computeBuckets(ctx, ht.probeScratch.buckets, ht.probeScratch.keys, batchSize, sel)
   456  	for i := 0; i < batchSize; i++ {
   457  		ht.probeScratch.groupID[i] = ht.buildScratch.first[ht.probeScratch.buckets[i]]
   458  		ht.probeScratch.toCheck[i] = uint64(i)
   459  	}
   460  }
   461  
   462  // findNext determines the id of the next key inside the groupID buckets for
   463  // each equality column key in toCheck.
   464  func (ht *hashTable) findNext(next []uint64, nToCheck uint64) {
   465  	for i := uint64(0); i < nToCheck; i++ {
   466  		ht.probeScratch.groupID[ht.probeScratch.toCheck[i]] =
   467  			next[ht.probeScratch.groupID[ht.probeScratch.toCheck[i]]]
   468  	}
   469  }
   470  
   471  // reset resets the hashTable for reuse.
   472  // NOTE: memory that already has been allocated for ht.vals is *not* released.
   473  // However, resetting the length of ht.vals to zero doesn't confuse the
   474  // allocator - it is smart enough to look at the capacities of the allocated
   475  // vectors, and the capacities would stay the same until an actual new
   476  // allocation is needed, and at that time the allocator will update the memory
   477  // account accordingly.
   478  func (ht *hashTable) reset(_ context.Context) {
   479  	for n := 0; n < len(ht.buildScratch.first); n += copy(ht.buildScratch.first[n:], zeroUint64Column) {
   480  	}
   481  	ht.vals.ResetInternalBatch()
   482  	ht.vals.SetLength(0)
   483  	// The next, same, and visited slices are reset separately before they are
   484  	// used (these slices are not used in all of the code paths).
   485  	// ht.probeScratch.buckets doesn't need to be reset because buckets are
   486  	// always initialized when computing the hash.
   487  	copy(ht.probeScratch.groupID[:coldata.BatchSize()], zeroUint64Column)
   488  	// ht.probeScratch.toCheck doesn't need to be reset because it is populated
   489  	// manually every time before checking the columns.
   490  	copy(ht.probeScratch.headID[:coldata.BatchSize()], zeroUint64Column)
   491  	copy(ht.probeScratch.differs[:coldata.BatchSize()], zeroBoolColumn)
   492  	copy(ht.probeScratch.distinct, zeroBoolColumn)
   493  }