github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/external_hash_joiner.go (about)

     1  // Copyright 2020 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package colexec
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"math"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/col/coldata"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/colcontainer"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    25  	"github.com/cockroachdb/cockroach/pkg/util/log"
    26  	"github.com/cockroachdb/cockroach/pkg/util/mon"
    27  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    28  	"github.com/cockroachdb/errors"
    29  	"github.com/marusama/semaphore"
    30  )
    31  
// externalHashJoinerState indicates the current state of the external hash
// joiner. The operator's Next method is a state machine that switches on this
// value.
type externalHashJoinerState int

const (
	// externalHJInitialPartitioning indicates that the operator is currently
	// reading batches from both inputs and distributing tuples to different
	// partitions based on the hash values. Once both inputs are exhausted, the
	// external hash joiner transitions to externalHJJoinNewPartition state.
	// This is the zero value and the state that Init puts the operator in.
	externalHJInitialPartitioning externalHashJoinerState = iota
	// externalHJRecursivePartitioning indicates that the operator is recursively
	// partitioning one of the existing partitions (that is too big to join at
	// once). It will do so using a different hash function and will spill newly
	// created partitions to disk. We also keep track whether repartitioning
	// reduces the size of the partitions in question - if we see that the newly
	// created largest partition is about the same in size as the "parent"
	// partition (the percentage difference is less than
	// externalHJRecursivePartitioningSizeDecreaseThreshold), it is likely that
	// the partition consists of the tuples not distinct on the equality columns,
	// so we fall back to using a combination of sort and merge join to process
	// such partition. After repartitioning, the operator transitions to
	// externalHJJoinNewPartition state.
	externalHJRecursivePartitioning
	// externalHJJoinNewPartition indicates that the operator should choose a
	// partition index and join the corresponding partitions from both sides
	// using the in-memory hash joiner. We will only join the partitions if the
	// right side partition fits into memory (because in-memory hash joiner will
	// fully buffer the right side but will process left side in the streaming
	// fashion). If there are no partition indices that the operator can join, it
	// transitions into externalHJRecursivePartitioning state. If there are no
	// partition indices to join using in-memory hash joiner, but there are
	// indices to join using sort + merge join strategy, the operator transitions
	// to externalHJSortMergeNewPartition state. If there are no partition
	// indices left at all to join, the operator transitions to
	// externalHJFinished state.
	externalHJJoinNewPartition
	// externalHJJoining indicates that the operator is currently joining tuples
	// from the corresponding partitions from both sides. An in-memory hash join
	// operator is used to perform the join. Once the in-memory operator returns
	// a zero-length batch (indicating that full output for the current
	// partitions has been emitted), the external hash joiner transitions to
	// externalHJJoinNewPartition state.
	externalHJJoining
	// externalHJSortMergeNewPartition indicates that the operator should choose
	// a partition index to join using sort + merge join strategy. If there are
	// no partition indices for this strategy left, the operator transitions to
	// externalHJFinished state.
	externalHJSortMergeNewPartition
	// externalHJSortMergeJoining indicates that the operator is currently
	// joining tuples from the corresponding partitions from both sides using
	// (disk-backed) sort + merge join strategy. Once the merge joiner returns
	// a zero-length batch (indicating that full output for the current
	// partitions has been emitted), the external hash joiner transitions to
	// externalHJSortMergeNewPartition state.
	externalHJSortMergeJoining
	// externalHJFinished indicates that the external hash joiner has emitted all
	// tuples already and only zero-length batch will be emitted from now on.
	externalHJFinished
)
    91  
const (
	// externalHJRecursivePartitioningSizeDecreaseThreshold determines by how
	// much the newly-created partitions in the recursive partitioning stage
	// should be smaller than the "parent" partition in order to consider the
	// repartitioning "successful". If this threshold is not met, then this newly
	// created partition will be added to sort + merge join list (which, in a
	// sense, serves as the base case for "recursion"). The value is a fraction,
	// i.e. 0.05 means a 5% decrease in size.
	externalHJRecursivePartitioningSizeDecreaseThreshold = 0.05
	// externalHJDiskQueuesMemFraction determines the fraction of the available
	// RAM that is allocated for the in-memory cache of disk queues.
	externalHJDiskQueuesMemFraction = 0.5
	// We need at least two buckets per side to make progress. However, the
	// minimum number of partitions necessary is the number of partitions in use
	// during a fallback to sort and merge join. We'll be using the minimum
	// necessary per input + 2 (1 for each spilling queue that the merge joiner
	// uses). For clarity this is what happens:
	// - The 2 partitions that need to be sorted + merged will use an FD each: 2
	//   FDs. Meanwhile, each sorter will use up to externalSorterMinPartitions to
	//   sort and partition this input. At this stage 2 + 2 *
	//   externalSorterMinPartitions FDs are used.
	// - Once the inputs (the hash joiner partitions) are finished, both FDs will
	//   be released. The merge joiner will now be in use, which uses two
	//   spillingQueues with 1 FD each for a total of 2. Since each sorter will
	//   use externalSorterMinPartitions, the FDs used at this stage are 2 +
	//   (2 * externalSorterMinPartitions) as well. Note that as soon as the
	//   sorter emits its first batch, it must be the case that the input to it
	//   has returned a zero batch, and thus the FD has been closed.
	//
	// sortMergeNonSortMinFDsOpen is the "2" from the description above: the
	// number of FDs needed during the sort + merge join fallback in addition to
	// the ones used by the two sorters.
	sortMergeNonSortMinFDsOpen = 2
	// externalHJMinPartitions is the lower bound that newExternalHashJoiner
	// enforces on the maximum number of active partitions.
	externalHJMinPartitions    = sortMergeNonSortMinFDsOpen + (externalSorterMinPartitions * 2)
	// externalHJMinimalMaxRightPartitionSize determines the minimum value for
	// maxRightPartitionSizeToJoin variable of the external hash joiner.
	externalHJMinimalMaxRightPartitionSize = 64 << 10 /* 64 KiB */
)
   125  
// externalHashJoiner is an operator that performs Grace hash join algorithm
// and can spill to disk. The high level view is that it partitions the left
// and right side into large buckets by a hash function A, writes those buckets
// to disk, then iterates through pairs of those buckets and does a normal hash
// join with a different hash function B.
//
// In order to get different hash functions, we're using the same family of
// hash functions that in-memory hash joiner uses, but we will seed it with a
// different initial hash value.
//
// The operator works in two phases.
//
// Phase 1: partitioning
// In this phase, we iterate through both sides of the join, hashing every row
// using a hash function A that produces n partitions. This will produce n
// partitions for each side of the join, which will be persisted to disk
// separately. As memory fills up, each of these partitions is flushed to disk
// repeatedly until the inputs are exhausted.
//
// Phase 2: join
// Now, we retrieve pairs of partitions from disk and join each pair using the
// ordinary hash join algorithm (and a different hash function B). Since we're
// performing an equality join, we can guarantee that each row on the left side
// of the join, if it has a match, will be in the same partition on the right
// side of the join. So, it's safe to do the join in pieces, partition by
// partition.
//
// If one of the partitions itself runs out of memory, we can recursively apply
// this algorithm. The partition will be divided into sub-partitions by a new
// hash function, spilled to disk, and so on. If repartitioning doesn't reduce
// size of the partitions sufficiently, then such partitions will be handled
// using the combination of disk-backed sort and merge join operators.
type externalHashJoiner struct {
	twoInputNode
	NonExplainable
	closerHelper

	// mu is used to protect against concurrent IdempotentClose and Next calls,
	// which are currently allowed.
	// TODO(asubiotto): Explore calling IdempotentClose from the same goroutine as
	//  Next, which will simplify this model.
	mu syncutil.Mutex

	// state is the current state of the state machine that Next runs.
	state              externalHashJoinerState
	// unlimitedAllocator is the allocator passed into newExternalHashJoiner; it
	// is used for the scratch batches and is shared with the internal
	// components created by the constructor.
	unlimitedAllocator *colmem.Allocator
	// spec describes the join (source types and equality columns of both
	// sides, join type).
	spec               hashJoinerSpec
	// diskQueueCfg is the configuration the partitioned disk queues were
	// created with.
	diskQueueCfg       colcontainer.DiskQueueCfg

	// fdState is used to acquire file descriptors up front.
	fdState struct {
		// fdSemaphore is the semaphore that file descriptors are acquired from.
		fdSemaphore semaphore.Semaphore
		// acquiredFDs is the number of file descriptors that this operator has
		// acquired from fdSemaphore itself (zero until the first acquisition in
		// Next).
		acquiredFDs int
	}

	// Partitioning phase variables.
	// leftPartitioner and rightPartitioner store the partitions of the left
	// and right sides, respectively.
	leftPartitioner  colcontainer.PartitionedQueue
	rightPartitioner colcontainer.PartitionedQueue
	// tupleDistributor assigns tuples of a batch to buckets based on the hash
	// of the equality columns. It is created in Init.
	tupleDistributor *tupleHashDistributor
	// maxNumberActivePartitions determines the maximum number of active
	// partitions that the operator is allowed to have. This number is computed
	// semi-dynamically and will influence the choice of numBuckets value.
	maxNumberActivePartitions int
	// numBuckets is the number of buckets that a partition is divided into.
	numBuckets int
	// partitionsToJoinUsingInMemHash is a map from partitionIdx to a utility
	// struct. This map contains all partition indices that need to be joined
	// using the in-memory hash joiner. If the partition is too big, it will be
	// tried to be repartitioned; if during repartitioning the size doesn't
	// decrease enough, it will be added to partitionsToJoinUsingSortMerge.
	partitionsToJoinUsingInMemHash map[int]*externalHJPartitionInfo
	// partitionsToJoinUsingSortMerge contains all partition indices that need to
	// be joined using sort + merge join strategy. Partition indices will be
	// added into this slice if recursive partitioning doesn't seem to make
	// progress on partitions' size reduction.
	partitionsToJoinUsingSortMerge []int
	// partitionIdxOffset stores the first "available" partition index to use.
	// During the partitioning step, all tuples will go into one of the buckets
	// in [partitionIdxOffset, partitionIdxOffset + numBuckets) range.
	partitionIdxOffset int
	// numRepartitions tracks the number of times the external hash joiner had to
	// recursively repartition another partition because the latter was too big
	// to join.
	numRepartitions int
	// scratch and recursiveScratch are helper structs. scratch is used by
	// partitionBatch as the destination of the copied tuples, and
	// recursiveScratch is used during recursive partitioning as the batch that
	// tuples of the parent partition are dequeued into.
	scratch, recursiveScratch struct {
		// Input sources can have different schemas, so when distributing tuples
		// (i.e. copying them into scratch batch to be spilled) we might need two
		// different batches. When the schemas are the same, leftBatch and
		// rightBatch point to the same batch.
		leftBatch, rightBatch coldata.Batch
	}

	// Join phase variables.
	// leftJoinerInput and rightJoinerInput expose the partitioners as
	// operators; they serve as the inputs to both inMemHashJoiner and the
	// sorters of diskBackedSortMerge.
	leftJoinerInput, rightJoinerInput *partitionerToOperator
	// inMemHashJoiner is the in-memory hash joiner used to join a pair of
	// partitions.
	inMemHashJoiner                   *hashJoiner
	// diskBackedSortMerge is a side chain of disk-backed sorters that feed into
	// disk-backed merge joiner which the external hash joiner can fall back to.
	diskBackedSortMerge resettableOperator

	memState struct {
		// maxRightPartitionSizeToJoin indicates the maximum memory size of a
		// partition on the right side that we're ok with joining without having to
		// repartition it. We pay attention only to the right side because in-memory
		// hash joiner will buffer the whole right input before processing the left
		// input in a "streaming" fashion.
		maxRightPartitionSizeToJoin int64
	}

	testingKnobs struct {
		// numForcedRepartitions is a number of times that the external hash joiner
		// is forced to recursively repartition (even if it is otherwise not
		// needed) before it proceeds to actual join partitions.
		numForcedRepartitions int
		// delegateFDAcquisitions, if true, means that a test wants to force the
		// PartitionedDiskQueues to track the number of file descriptors the hash
		// joiner will open/close. This disables the default behavior of acquiring
		// all file descriptors up front in Next.
		delegateFDAcquisitions bool
	}
}
   245  
// Compile-time assertion that *externalHashJoiner implements closableOperator.
var _ closableOperator = &externalHashJoiner{}
   247  
// externalHJPartitionInfo is a utility struct that tracks the memory size
// information about the right side of a single partition. It is the value type
// of externalHashJoiner.partitionsToJoinUsingInMemHash.
type externalHJPartitionInfo struct {
	// rightMemSize is the accumulated (proportional) memory size of all
	// right-side batches that have been enqueued into this partition.
	rightMemSize       int64
	// rightParentMemSize is the rightMemSize of the partition that this
	// partition was created from during recursive partitioning. It is
	// math.MaxInt64 for the partitions created during the initial partitioning.
	rightParentMemSize int64
}
   252  
// joinSide identifies one of the two inputs of the join.
type joinSide int

const (
	// leftSide refers to the left input of the join.
	leftSide joinSide = iota
	// rightSide refers to the right input of the join.
	rightSide
)
   259  
   260  // newExternalHashJoiner returns a disk-backed hash joiner.
   261  // - unlimitedAllocator must have been created with a memory account derived
   262  // from an unlimited memory monitor. It will be used by several internal
   263  // components of the external hash joiner which is responsible for making sure
   264  // that the components stay within the memory limit.
   265  // - numForcedRepartitions is a number of times that the external hash joiner
   266  // is forced to recursively repartition (even if it is otherwise not needed).
   267  // This should be non-zero only in tests.
   268  // - delegateFDAcquisitions specifies whether the external hash joiner should
   269  // let the partitioned disk queues acquire file descriptors instead of acquiring
   270  // them up front in Next. Should be true only in tests.
   271  func newExternalHashJoiner(
   272  	unlimitedAllocator *colmem.Allocator,
   273  	spec hashJoinerSpec,
   274  	leftInput, rightInput colexecbase.Operator,
   275  	memoryLimit int64,
   276  	diskQueueCfg colcontainer.DiskQueueCfg,
   277  	fdSemaphore semaphore.Semaphore,
   278  	createReusableDiskBackedSorter func(input colexecbase.Operator, inputTypes []*types.T, orderingCols []execinfrapb.Ordering_Column, maxNumberPartitions int) (colexecbase.Operator, error),
   279  	numForcedRepartitions int,
   280  	delegateFDAcquisitions bool,
   281  	diskAcc *mon.BoundAccount,
   282  ) colexecbase.Operator {
   283  	if diskQueueCfg.CacheMode != colcontainer.DiskQueueCacheModeClearAndReuseCache {
   284  		colexecerror.InternalError(errors.Errorf("external hash joiner instantiated with suboptimal disk queue cache mode: %d", diskQueueCfg.CacheMode))
   285  	}
   286  	partitionedDiskQueueSemaphore := fdSemaphore
   287  	if !delegateFDAcquisitions {
   288  		// To avoid deadlocks with other disk queues, we manually attempt to acquire
   289  		// the maximum number of descriptors all at once in Next. Passing in a nil
   290  		// semaphore indicates that the caller will do the acquiring.
   291  		partitionedDiskQueueSemaphore = nil
   292  	}
   293  	leftPartitioner := colcontainer.NewPartitionedDiskQueue(
   294  		spec.left.sourceTypes, diskQueueCfg, partitionedDiskQueueSemaphore, colcontainer.PartitionerStrategyDefault, diskAcc,
   295  	)
   296  	leftJoinerInput := newPartitionerToOperator(
   297  		unlimitedAllocator, spec.left.sourceTypes, leftPartitioner, 0, /* partitionIdx */
   298  	)
   299  	rightPartitioner := colcontainer.NewPartitionedDiskQueue(
   300  		spec.right.sourceTypes, diskQueueCfg, partitionedDiskQueueSemaphore, colcontainer.PartitionerStrategyDefault, diskAcc,
   301  	)
   302  	rightJoinerInput := newPartitionerToOperator(
   303  		unlimitedAllocator, spec.right.sourceTypes, rightPartitioner, 0, /* partitionIdx */
   304  	)
   305  	// With the default limit of 256 file descriptors, this results in 16
   306  	// partitions. This is a hard maximum of partitions that will be used by the
   307  	// external hash joiner. Below we check whether we have enough RAM to support
   308  	// the caches of this number of partitions.
   309  	// TODO(yuzefovich): this number should be tuned.
   310  	maxNumberActivePartitions := fdSemaphore.GetLimit() / 16
   311  	if diskQueueCfg.BufferSizeBytes > 0 {
   312  		diskQueuesTotalMemLimit := int(float64(memoryLimit) * externalHJDiskQueuesMemFraction)
   313  		numDiskQueuesThatFit := diskQueuesTotalMemLimit / diskQueueCfg.BufferSizeBytes
   314  		if numDiskQueuesThatFit < maxNumberActivePartitions {
   315  			maxNumberActivePartitions = numDiskQueuesThatFit
   316  		}
   317  	}
   318  	if maxNumberActivePartitions < externalHJMinPartitions {
   319  		maxNumberActivePartitions = externalHJMinPartitions
   320  	}
   321  	diskQueuesMemUsed := maxNumberActivePartitions * diskQueueCfg.BufferSizeBytes
   322  	makeOrderingCols := func(eqCols []uint32) []execinfrapb.Ordering_Column {
   323  		res := make([]execinfrapb.Ordering_Column, len(eqCols))
   324  		for i, colIdx := range eqCols {
   325  			res[i].ColIdx = colIdx
   326  		}
   327  		return res
   328  	}
   329  	// We need to allocate 2 FDs for reading the partitions (reused by the merge
   330  	// joiner) that we need to join using sort + merge join strategy, and all
   331  	// others are divided between the two inputs.
   332  	externalSorterMaxNumberPartitions := (maxNumberActivePartitions - sortMergeNonSortMinFDsOpen) / 2
   333  	if externalSorterMaxNumberPartitions < externalSorterMinPartitions {
   334  		// This code gets a maximum number of partitions based on the semaphore
   335  		// limit. In tests, this limit is set artificially low to catch any
   336  		// violations of the limit, resulting in possibly computing a low number of
   337  		// partitions for the sorter, which we overwrite here.
   338  		externalSorterMaxNumberPartitions = externalSorterMinPartitions
   339  	}
   340  	leftOrdering := makeOrderingCols(spec.left.eqCols)
   341  	leftPartitionSorter, err := createReusableDiskBackedSorter(
   342  		leftJoinerInput, spec.left.sourceTypes, leftOrdering, externalSorterMaxNumberPartitions,
   343  	)
   344  	if err != nil {
   345  		colexecerror.InternalError(err)
   346  	}
   347  	rightOrdering := makeOrderingCols(spec.right.eqCols)
   348  	rightPartitionSorter, err := createReusableDiskBackedSorter(
   349  		rightJoinerInput, spec.right.sourceTypes, rightOrdering, externalSorterMaxNumberPartitions,
   350  	)
   351  	if err != nil {
   352  		colexecerror.InternalError(err)
   353  	}
   354  	diskBackedSortMerge, err := newMergeJoinOp(
   355  		unlimitedAllocator, memoryLimit, diskQueueCfg,
   356  		partitionedDiskQueueSemaphore, spec.joinType, leftPartitionSorter, rightPartitionSorter,
   357  		spec.left.sourceTypes, spec.right.sourceTypes, leftOrdering, rightOrdering,
   358  		diskAcc,
   359  	)
   360  	if err != nil {
   361  		colexecerror.InternalError(err)
   362  	}
   363  	ehj := &externalHashJoiner{
   364  		twoInputNode:              newTwoInputNode(leftInput, rightInput),
   365  		unlimitedAllocator:        unlimitedAllocator,
   366  		spec:                      spec,
   367  		diskQueueCfg:              diskQueueCfg,
   368  		leftPartitioner:           leftPartitioner,
   369  		rightPartitioner:          rightPartitioner,
   370  		maxNumberActivePartitions: maxNumberActivePartitions,
   371  		// In the initial partitioning state we will use half of available
   372  		// partitions to write the partitioned input from the left side and another
   373  		// half for the right side.
   374  		// TODO(yuzefovich): figure out whether we should care about
   375  		// hj.numBuckets being a power of two (finalizeHash step is faster if so).
   376  		numBuckets:                     maxNumberActivePartitions / 2,
   377  		partitionsToJoinUsingInMemHash: make(map[int]*externalHJPartitionInfo),
   378  		partitionsToJoinUsingSortMerge: make([]int, 0),
   379  		leftJoinerInput:                leftJoinerInput,
   380  		rightJoinerInput:               rightJoinerInput,
   381  		inMemHashJoiner: newHashJoiner(
   382  			unlimitedAllocator, spec, leftJoinerInput, rightJoinerInput,
   383  		).(*hashJoiner),
   384  		diskBackedSortMerge: diskBackedSortMerge,
   385  	}
   386  	ehj.fdState.fdSemaphore = fdSemaphore
   387  	// To simplify the accounting, we will assume that the in-memory hash
   388  	// joiner's memory usage is equal to the size of the right partition to be
   389  	// joined (which will be fully buffered). This is an underestimate because a
   390  	// single batch from the left partition will be read at a time as well as an
   391  	// output batch will be used, but that shouldn't matter in the grand scheme
   392  	// of things.
   393  	ehj.memState.maxRightPartitionSizeToJoin = memoryLimit - int64(diskQueuesMemUsed)
   394  	if ehj.memState.maxRightPartitionSizeToJoin < externalHJMinimalMaxRightPartitionSize {
   395  		ehj.memState.maxRightPartitionSizeToJoin = externalHJMinimalMaxRightPartitionSize
   396  	}
   397  	ehj.scratch.leftBatch = unlimitedAllocator.NewMemBatch(spec.left.sourceTypes)
   398  	ehj.recursiveScratch.leftBatch = unlimitedAllocator.NewMemBatch(spec.left.sourceTypes)
   399  	sameSourcesSchema := len(spec.left.sourceTypes) == len(spec.right.sourceTypes)
   400  	for i, leftType := range spec.left.sourceTypes {
   401  		if i < len(spec.right.sourceTypes) && !leftType.Identical(spec.right.sourceTypes[i]) {
   402  			sameSourcesSchema = false
   403  		}
   404  	}
   405  	if sameSourcesSchema {
   406  		// The schemas of both sources are the same, so we can reuse the left
   407  		// scratch batch.
   408  		ehj.scratch.rightBatch = ehj.scratch.leftBatch
   409  		ehj.recursiveScratch.rightBatch = ehj.recursiveScratch.leftBatch
   410  	} else {
   411  		ehj.scratch.rightBatch = unlimitedAllocator.NewMemBatch(spec.right.sourceTypes)
   412  		ehj.recursiveScratch.rightBatch = unlimitedAllocator.NewMemBatch(spec.right.sourceTypes)
   413  	}
   414  	ehj.testingKnobs.numForcedRepartitions = numForcedRepartitions
   415  	ehj.testingKnobs.delegateFDAcquisitions = delegateFDAcquisitions
   416  	return ehj
   417  }
   418  
   419  func (hj *externalHashJoiner) Init() {
   420  	hj.inputOne.Init()
   421  	hj.inputTwo.Init()
   422  	// In the join phase, hash join operator will use the default init hash
   423  	// value, so in order to use a "different" hash function in the partitioning
   424  	// phase we use a different init hash value.
   425  	hj.tupleDistributor = newTupleHashDistributor(
   426  		defaultInitHashValue+1, hj.numBuckets,
   427  	)
   428  	hj.state = externalHJInitialPartitioning
   429  }
   430  
   431  func (hj *externalHashJoiner) partitionBatch(
   432  	ctx context.Context, batch coldata.Batch, side joinSide, parentMemSize int64,
   433  ) {
   434  	batchLen := batch.Length()
   435  	if batchLen == 0 {
   436  		return
   437  	}
   438  	scratchBatch := hj.scratch.leftBatch
   439  	sourceSpec := hj.spec.left
   440  	partitioner := hj.leftPartitioner
   441  	if side == rightSide {
   442  		scratchBatch = hj.scratch.rightBatch
   443  		sourceSpec = hj.spec.right
   444  		partitioner = hj.rightPartitioner
   445  	}
   446  	selections := hj.tupleDistributor.distribute(
   447  		ctx, batch, sourceSpec.sourceTypes, sourceSpec.eqCols,
   448  	)
   449  	for idx, sel := range selections {
   450  		partitionIdx := hj.partitionIdxOffset + idx
   451  		if len(sel) > 0 {
   452  			scratchBatch.ResetInternalBatch()
   453  			// The partitioner expects the batches without a selection vector, so we
   454  			// need to copy the tuples according to the selection vector into a
   455  			// scratch batch.
   456  			colVecs := scratchBatch.ColVecs()
   457  			hj.unlimitedAllocator.PerformOperation(colVecs, func() {
   458  				for i, colvec := range colVecs {
   459  					colvec.Copy(coldata.CopySliceArgs{
   460  						SliceArgs: coldata.SliceArgs{
   461  							Src:       batch.ColVec(i),
   462  							Sel:       sel,
   463  							SrcEndIdx: len(sel),
   464  						},
   465  					})
   466  				}
   467  				scratchBatch.SetLength(len(sel))
   468  			})
   469  			if err := partitioner.Enqueue(ctx, partitionIdx, scratchBatch); err != nil {
   470  				colexecerror.InternalError(err)
   471  			}
   472  			partitionInfo, ok := hj.partitionsToJoinUsingInMemHash[partitionIdx]
   473  			if !ok {
   474  				partitionInfo = &externalHJPartitionInfo{}
   475  				hj.partitionsToJoinUsingInMemHash[partitionIdx] = partitionInfo
   476  			}
   477  			if side == rightSide {
   478  				partitionInfo.rightParentMemSize = parentMemSize
   479  				// We cannot use allocator's methods directly because those
   480  				// look at the capacities of the vectors, and in our case only
   481  				// first len(sel) tuples belong to the "current" batch.
   482  				partitionInfo.rightMemSize += colmem.GetProportionalBatchMemSize(scratchBatch, int64(len(sel)))
   483  			}
   484  		}
   485  	}
   486  }
   487  
   488  func (hj *externalHashJoiner) Next(ctx context.Context) coldata.Batch {
   489  	hj.mu.Lock()
   490  	defer hj.mu.Unlock()
   491  StateChanged:
   492  	for {
   493  		switch hj.state {
   494  		case externalHJInitialPartitioning:
   495  			leftBatch := hj.inputOne.Next(ctx)
   496  			rightBatch := hj.inputTwo.Next(ctx)
   497  			if leftBatch.Length() == 0 && rightBatch.Length() == 0 {
   498  				// Both inputs have been partitioned and spilled, so we transition to
   499  				// "joining" phase. Close all the open write file descriptors.
   500  				//
   501  				// TODO(yuzefovich): this will also clear the cache once the new PR is
   502  				// in. This means we will reallocate a cache whenever reading from the
   503  				// partitions. What I think we might want to do is not close the
   504  				// partitions here. Instead, we move on to joining, which will switch
   505  				// all of these reserved file descriptors to read in the best case (no
   506  				// repartitioning) and reuse the cache. Only if we need to repartition
   507  				// should we CloseAllOpenWriteFileDescriptors of both sides. It might
   508  				// also be more efficient to Dequeue from the partitions you'll read
   509  				// from before doing that to exempt them from releasing their FDs to
   510  				// the semaphore.
   511  				if err := hj.leftPartitioner.CloseAllOpenWriteFileDescriptors(ctx); err != nil {
   512  					colexecerror.InternalError(err)
   513  				}
   514  				if err := hj.rightPartitioner.CloseAllOpenWriteFileDescriptors(ctx); err != nil {
   515  					colexecerror.InternalError(err)
   516  				}
   517  				hj.inMemHashJoiner.Init()
   518  				hj.partitionIdxOffset += hj.numBuckets
   519  				hj.state = externalHJJoinNewPartition
   520  				continue
   521  			}
   522  			if !hj.testingKnobs.delegateFDAcquisitions && hj.fdState.acquiredFDs == 0 {
   523  				toAcquire := hj.maxNumberActivePartitions
   524  				if err := hj.fdState.fdSemaphore.Acquire(ctx, toAcquire); err != nil {
   525  					colexecerror.InternalError(err)
   526  				}
   527  				hj.fdState.acquiredFDs = toAcquire
   528  			}
   529  			hj.partitionBatch(ctx, leftBatch, leftSide, math.MaxInt64)
   530  			hj.partitionBatch(ctx, rightBatch, rightSide, math.MaxInt64)
   531  
   532  		case externalHJRecursivePartitioning:
   533  			hj.numRepartitions++
   534  			if log.V(2) && hj.numRepartitions%10 == 0 {
   535  				log.Infof(ctx,
   536  					"external hash joiner is performing %d'th repartition", hj.numRepartitions,
   537  				)
   538  			}
   539  			// In order to use a different hash function when repartitioning, we need
   540  			// to increase the seed value of the tuple distributor.
   541  			hj.tupleDistributor.initHashValue++
   542  			// We're actively will be using hj.numBuckets + 1 partitions (because
   543  			// we're repartitioning one side at a time), so we can set hj.numBuckets
   544  			// higher than in the initial partitioning step.
   545  			// TODO(yuzefovich): figure out whether we should care about
   546  			// hj.numBuckets being a power of two (finalizeHash step is faster if so).
   547  			hj.numBuckets = hj.maxNumberActivePartitions - 1
   548  			hj.tupleDistributor.resetNumOutputs(hj.numBuckets)
   549  			for parentPartitionIdx, parentPartitionInfo := range hj.partitionsToJoinUsingInMemHash {
   550  				for _, side := range []joinSide{leftSide, rightSide} {
   551  					batch := hj.recursiveScratch.leftBatch
   552  					partitioner := hj.leftPartitioner
   553  					memSize := int64(math.MaxInt64)
   554  					if side == rightSide {
   555  						batch = hj.recursiveScratch.rightBatch
   556  						partitioner = hj.rightPartitioner
   557  						memSize = parentPartitionInfo.rightMemSize
   558  					}
   559  					for {
   560  						if err := partitioner.Dequeue(ctx, parentPartitionIdx, batch); err != nil {
   561  							colexecerror.InternalError(err)
   562  						}
   563  						if batch.Length() == 0 {
   564  							break
   565  						}
   566  						hj.partitionBatch(ctx, batch, side, memSize)
   567  					}
   568  					// We're done reading from this partition, and it will never be read
   569  					// from again, so we can close it.
   570  					if err := partitioner.CloseInactiveReadPartitions(ctx); err != nil {
   571  						colexecerror.InternalError(err)
   572  					}
   573  					// We're done writing to the newly created partitions.
   574  					// TODO(yuzefovich): we should not release the descriptors here. The
   575  					// invariant should be: we're entering
   576  					// externalHJRecursivePartitioning, at that stage we have at most
   577  					// numBuckets*2 file descriptors open. At the top of the state
   578  					// transition, close all open write file descriptors, which should
   579  					// reduce the open descriptors to 0. Now we open the two read'
   580  					// partitions for 2 file descriptors and whatever number of write
   581  					// partitions we want. This'll allow us to remove the call to
   582  					// CloseAllOpen... in the first state as well.
   583  					if err := partitioner.CloseAllOpenWriteFileDescriptors(ctx); err != nil {
   584  						colexecerror.InternalError(err)
   585  					}
   586  				}
   587  				for idx := 0; idx < hj.numBuckets; idx++ {
   588  					newPartitionIdx := hj.partitionIdxOffset + idx
   589  					if partitionInfo, ok := hj.partitionsToJoinUsingInMemHash[newPartitionIdx]; ok {
   590  						before, after := partitionInfo.rightParentMemSize, partitionInfo.rightMemSize
   591  						if before > 0 {
   592  							sizeDecrease := 1.0 - float64(after)/float64(before)
   593  							if sizeDecrease < externalHJRecursivePartitioningSizeDecreaseThreshold {
   594  								// We will need to join this partition using sort + merge
   595  								// join strategy.
   596  								hj.partitionsToJoinUsingSortMerge = append(hj.partitionsToJoinUsingSortMerge, newPartitionIdx)
   597  								delete(hj.partitionsToJoinUsingInMemHash, newPartitionIdx)
   598  							}
   599  						}
   600  					}
   601  				}
   602  				// We have successfully repartitioned the partitions with index
   603  				// 'parentPartitionIdx' on both sides, so we delete that index from the
   604  				// map and proceed on joining the newly created partitions.
   605  				delete(hj.partitionsToJoinUsingInMemHash, parentPartitionIdx)
   606  				hj.partitionIdxOffset += hj.numBuckets
   607  				hj.state = externalHJJoinNewPartition
   608  				continue StateChanged
   609  			}
   610  
   611  		case externalHJJoinNewPartition:
   612  			if hj.testingKnobs.numForcedRepartitions > 0 && len(hj.partitionsToJoinUsingInMemHash) > 0 {
   613  				hj.testingKnobs.numForcedRepartitions--
   614  				hj.state = externalHJRecursivePartitioning
   615  				continue
   616  			}
   617  			// Find next partition that we can join without having to recursively
   618  			// repartition.
   619  			for partitionIdx, partitionInfo := range hj.partitionsToJoinUsingInMemHash {
   620  				if partitionInfo.rightMemSize <= hj.memState.maxRightPartitionSizeToJoin {
   621  					// Update the inputs to in-memory hash joiner and reset the latter.
   622  					hj.leftJoinerInput.partitionIdx = partitionIdx
   623  					hj.rightJoinerInput.partitionIdx = partitionIdx
   624  					hj.inMemHashJoiner.reset(ctx)
   625  					delete(hj.partitionsToJoinUsingInMemHash, partitionIdx)
   626  					hj.state = externalHJJoining
   627  					continue StateChanged
   628  				}
   629  			}
   630  			if len(hj.partitionsToJoinUsingInMemHash) == 0 {
   631  				// All partitions to join using the hash joiner have been processed.
   632  				if len(hj.partitionsToJoinUsingSortMerge) > 0 {
   633  					// But there are still some partitions to join using sort + merge
   634  					// join strategy.
   635  					hj.diskBackedSortMerge.Init()
   636  					if log.V(2) {
   637  						log.Infof(ctx,
   638  							"external hash joiner will join %d partitions using sort + merge join",
   639  							len(hj.partitionsToJoinUsingSortMerge),
   640  						)
   641  					}
   642  					hj.state = externalHJSortMergeNewPartition
   643  					continue
   644  				}
   645  				// All partitions have been processed, so we transition to finished
   646  				// state.
   647  				hj.state = externalHJFinished
   648  				continue
   649  			}
   650  			// We have partitions that we cannot join without recursively
   651  			// repartitioning first, so we transition to the corresponding state.
   652  			hj.state = externalHJRecursivePartitioning
   653  			continue
   654  
   655  		case externalHJJoining:
   656  			b := hj.inMemHashJoiner.Next(ctx)
   657  			if b.Length() == 0 {
   658  				// We're done joining these partitions, so we close them and transition
   659  				// to joining new ones.
   660  				if err := hj.leftPartitioner.CloseInactiveReadPartitions(ctx); err != nil {
   661  					colexecerror.InternalError(err)
   662  				}
   663  				if err := hj.rightPartitioner.CloseInactiveReadPartitions(ctx); err != nil {
   664  					colexecerror.InternalError(err)
   665  				}
   666  				hj.state = externalHJJoinNewPartition
   667  				continue
   668  			}
   669  			return b
   670  
   671  		case externalHJSortMergeNewPartition:
   672  			if len(hj.partitionsToJoinUsingSortMerge) == 0 {
   673  				// All partitions have been processed, so we transition to finished
   674  				// state.
   675  				hj.state = externalHJFinished
   676  				continue
   677  			}
   678  			partitionIdx := hj.partitionsToJoinUsingSortMerge[0]
   679  			hj.partitionsToJoinUsingSortMerge = hj.partitionsToJoinUsingSortMerge[1:]
   680  			// Update the inputs to sort + merge joiner and reset that chain.
   681  			hj.leftJoinerInput.partitionIdx = partitionIdx
   682  			hj.rightJoinerInput.partitionIdx = partitionIdx
   683  			hj.diskBackedSortMerge.reset(ctx)
   684  			hj.state = externalHJSortMergeJoining
   685  			continue
   686  
   687  		case externalHJSortMergeJoining:
   688  			b := hj.diskBackedSortMerge.Next(ctx)
   689  			if b.Length() == 0 {
   690  				// We're done joining these partitions, so we close them and transition
   691  				// to joining new ones.
   692  				if err := hj.leftPartitioner.CloseInactiveReadPartitions(ctx); err != nil {
   693  					colexecerror.InternalError(err)
   694  				}
   695  				if err := hj.rightPartitioner.CloseInactiveReadPartitions(ctx); err != nil {
   696  					colexecerror.InternalError(err)
   697  				}
   698  				hj.state = externalHJSortMergeNewPartition
   699  				continue
   700  			}
   701  			return b
   702  
   703  		case externalHJFinished:
   704  			if err := hj.idempotentCloseLocked(ctx); err != nil {
   705  				colexecerror.InternalError(err)
   706  			}
   707  			return coldata.ZeroBatch
   708  		default:
   709  			colexecerror.InternalError(fmt.Sprintf("unexpected externalHashJoinerState %d", hj.state))
   710  		}
   711  	}
   712  }
   713  
   714  func (hj *externalHashJoiner) IdempotentClose(ctx context.Context) error {
   715  	hj.mu.Lock()
   716  	defer hj.mu.Unlock()
   717  	return hj.idempotentCloseLocked(ctx)
   718  }
   719  
   720  func (hj *externalHashJoiner) idempotentCloseLocked(ctx context.Context) error {
   721  	if !hj.close() {
   722  		return nil
   723  	}
   724  	var retErr error
   725  	if err := hj.leftPartitioner.Close(ctx); err != nil {
   726  		retErr = err
   727  	}
   728  	if err := hj.rightPartitioner.Close(ctx); err != nil && retErr == nil {
   729  		retErr = err
   730  	}
   731  	if c, ok := hj.diskBackedSortMerge.(IdempotentCloser); ok {
   732  		if err := c.IdempotentClose(ctx); err != nil && retErr == nil {
   733  			retErr = err
   734  		}
   735  	}
   736  	if !hj.testingKnobs.delegateFDAcquisitions && hj.fdState.acquiredFDs > 0 {
   737  		hj.fdState.fdSemaphore.Release(hj.fdState.acquiredFDs)
   738  		hj.fdState.acquiredFDs = 0
   739  	}
   740  	return retErr
   741  }