github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/distinct_test.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package colexec
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"math"
    17  	"testing"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/col/coldata"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    23  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    24  	"github.com/cockroachdb/cockroach/pkg/util/randutil"
    25  )
    26  
    27  func TestDistinct(t *testing.T) {
    28  	defer leaktest.AfterTest(t)()
    29  	rng, _ := randutil.NewPseudoRand()
    30  	tcs := []struct {
    31  		distinctCols            []uint32
    32  		typs                    []*types.T
    33  		tuples                  []tuple
    34  		expected                []tuple
    35  		isOrderedOnDistinctCols bool
    36  	}{
    37  		{
    38  			distinctCols: []uint32{0, 1, 2},
    39  			typs:         []*types.T{types.Float, types.Int, types.String, types.Int},
    40  			tuples: tuples{
    41  				{nil, nil, nil, nil},
    42  				{nil, nil, nil, nil},
    43  				{nil, nil, "30", nil},
    44  				{1.0, 2, "30", 4},
    45  				{1.0, 2, "30", 4},
    46  				{2.0, 2, "30", 4},
    47  				{2.0, 3, "30", 4},
    48  				{2.0, 3, "40", 4},
    49  				{2.0, 3, "40", 4},
    50  			},
    51  			expected: tuples{
    52  				{nil, nil, nil, nil},
    53  				{nil, nil, "30", nil},
    54  				{1.0, 2, "30", 4},
    55  				{2.0, 2, "30", 4},
    56  				{2.0, 3, "30", 4},
    57  				{2.0, 3, "40", 4},
    58  			},
    59  			isOrderedOnDistinctCols: true,
    60  		},
    61  		{
    62  			distinctCols: []uint32{1, 0, 2},
    63  			typs:         []*types.T{types.Float, types.Int, types.Bytes, types.Int},
    64  			tuples: tuples{
    65  				{nil, nil, nil, nil},
    66  				{nil, nil, nil, nil},
    67  				{nil, nil, "30", nil},
    68  				{1.0, 2, "30", 4},
    69  				{1.0, 2, "30", 4},
    70  				{2.0, 2, "30", 4},
    71  				{2.0, 3, "30", 4},
    72  				{2.0, 3, "40", 4},
    73  				{2.0, 3, "40", 4},
    74  			},
    75  			expected: tuples{
    76  				{nil, nil, nil, nil},
    77  				{nil, nil, "30", nil},
    78  				{1.0, 2, "30", 4},
    79  				{2.0, 2, "30", 4},
    80  				{2.0, 3, "30", 4},
    81  				{2.0, 3, "40", 4},
    82  			},
    83  			isOrderedOnDistinctCols: true,
    84  		},
    85  		{
    86  			distinctCols: []uint32{0, 1, 2},
    87  			typs:         []*types.T{types.Float, types.Int, types.String, types.Int},
    88  			tuples: tuples{
    89  				{1.0, 2, "30", 4},
    90  				{1.0, 2, "30", 4},
    91  				{nil, nil, nil, nil},
    92  				{nil, nil, nil, nil},
    93  				{2.0, 2, "30", 4},
    94  				{2.0, 3, "30", 4},
    95  				{nil, nil, "30", nil},
    96  				{2.0, 3, "40", 4},
    97  				{2.0, 3, "40", 4},
    98  			},
    99  			expected: tuples{
   100  				{1.0, 2, "30", 4},
   101  				{nil, nil, nil, nil},
   102  				{2.0, 2, "30", 4},
   103  				{2.0, 3, "30", 4},
   104  				{nil, nil, "30", nil},
   105  				{2.0, 3, "40", 4},
   106  			},
   107  		},
   108  		{
   109  			distinctCols: []uint32{0},
   110  			typs:         []*types.T{types.Int, types.Bytes},
   111  			tuples: tuples{
   112  				{1, "a"},
   113  				{2, "b"},
   114  				{3, "c"},
   115  				{nil, "d"},
   116  				{5, "e"},
   117  				{6, "f"},
   118  				{1, "1"},
   119  				{2, "2"},
   120  				{3, "3"},
   121  			},
   122  			expected: tuples{
   123  				{1, "a"},
   124  				{2, "b"},
   125  				{3, "c"},
   126  				{nil, "d"},
   127  				{5, "e"},
   128  				{6, "f"},
   129  			},
   130  		},
   131  		{
   132  			// This is to test hashTable deduplication with various batch size
   133  			// boundaries and ensure it always emits the first tuple it encountered.
   134  			distinctCols: []uint32{0},
   135  			typs:         []*types.T{types.Int, types.String},
   136  			tuples: tuples{
   137  				{1, "1"},
   138  				{1, "2"},
   139  				{1, "3"},
   140  				{1, "4"},
   141  				{1, "5"},
   142  				{2, "6"},
   143  				{2, "7"},
   144  				{2, "8"},
   145  				{2, "9"},
   146  				{2, "10"},
   147  				{0, "11"},
   148  				{0, "12"},
   149  				{0, "13"},
   150  				{1, "14"},
   151  				{1, "15"},
   152  				{1, "16"},
   153  			},
   154  			expected: tuples{
   155  				{1, "1"},
   156  				{2, "6"},
   157  				{0, "11"},
   158  			},
   159  		},
   160  		{
   161  			distinctCols: []uint32{0},
   162  			typs:         []*types.T{types.Jsonb, types.String},
   163  			tuples: tuples{
   164  				{`{"id": 1}`, "a"},
   165  				{`{"id": 2}`, "b"},
   166  				{`{"id": 3}`, "c"},
   167  				{`{"id": 1}`, "1"},
   168  				{`{"id": null}`, "d"},
   169  				{`{"id": 2}`, "2"},
   170  				{`{"id": 5}`, "e"},
   171  				{`{"id": 6}`, "f"},
   172  				{`{"id": 3}`, "3"},
   173  			},
   174  			expected: tuples{
   175  				{`{"id": 1}`, "a"},
   176  				{`{"id": 2}`, "b"},
   177  				{`{"id": 3}`, "c"},
   178  				{`{"id": null}`, "d"},
   179  				{`{"id": 5}`, "e"},
   180  				{`{"id": 6}`, "f"},
   181  			},
   182  		},
   183  	}
   184  
   185  	for _, tc := range tcs {
   186  		for _, numOfBuckets := range []uint64{1, 3, 5, hashTableNumBuckets} {
   187  			t.Run(fmt.Sprintf("unordered/numOfBuckets=%d", numOfBuckets), func(t *testing.T) {
   188  				runTestsWithTyps(t, []tuples{tc.tuples}, [][]*types.T{tc.typs}, tc.expected, orderedVerifier,
   189  					func(input []colexecbase.Operator) (colexecbase.Operator, error) {
   190  						return NewUnorderedDistinct(
   191  							testAllocator, input[0], tc.distinctCols, tc.typs,
   192  							numOfBuckets), nil
   193  					})
   194  			})
   195  		}
   196  		if tc.isOrderedOnDistinctCols {
   197  			for numOrderedCols := 1; numOrderedCols < len(tc.distinctCols); numOrderedCols++ {
   198  				t.Run(fmt.Sprintf("partiallyOrdered/ordCols=%d", numOrderedCols), func(t *testing.T) {
   199  					orderedCols := make([]uint32, numOrderedCols)
   200  					for i, j := range rng.Perm(len(tc.distinctCols))[:numOrderedCols] {
   201  						orderedCols[i] = tc.distinctCols[j]
   202  					}
   203  					runTestsWithTyps(t, []tuples{tc.tuples}, [][]*types.T{tc.typs}, tc.expected, orderedVerifier,
   204  						func(input []colexecbase.Operator) (colexecbase.Operator, error) {
   205  							return newPartiallyOrderedDistinct(
   206  								testAllocator, input[0], tc.distinctCols,
   207  								orderedCols, tc.typs,
   208  							)
   209  						})
   210  				})
   211  			}
   212  			t.Run("ordered", func(t *testing.T) {
   213  				runTestsWithTyps(t, []tuples{tc.tuples}, [][]*types.T{tc.typs}, tc.expected, orderedVerifier,
   214  					func(input []colexecbase.Operator) (colexecbase.Operator, error) {
   215  						return NewOrderedDistinct(input[0], tc.distinctCols, tc.typs)
   216  					})
   217  			})
   218  		}
   219  	}
   220  }
   221  
   222  func BenchmarkDistinct(b *testing.B) {
   223  	rng, _ := randutil.NewPseudoRand()
   224  	ctx := context.Background()
   225  
   226  	distinctConstructors := []func(*colmem.Allocator, colexecbase.Operator, []uint32, int, []*types.T) (colexecbase.Operator, error){
   227  		func(allocator *colmem.Allocator, input colexecbase.Operator, distinctCols []uint32, numOrderedCols int, typs []*types.T) (colexecbase.Operator, error) {
   228  			return NewUnorderedDistinct(allocator, input, distinctCols, typs, hashTableNumBuckets), nil
   229  		},
   230  		func(allocator *colmem.Allocator, input colexecbase.Operator, distinctCols []uint32, numOrderedCols int, typs []*types.T) (colexecbase.Operator, error) {
   231  			return newPartiallyOrderedDistinct(allocator, input, distinctCols, distinctCols[:numOrderedCols], typs)
   232  		},
   233  		func(allocator *colmem.Allocator, input colexecbase.Operator, distinctCols []uint32, numOrderedCols int, typs []*types.T) (colexecbase.Operator, error) {
   234  			return NewOrderedDistinct(input, distinctCols, typs)
   235  		},
   236  	}
   237  	distinctNames := []string{"Unordered", "PartiallyOrdered", "Ordered"}
   238  	orderedColsFraction := []float64{0, 0.5, 1.0}
   239  	for _, hasNulls := range []bool{false, true} {
   240  		for _, newTupleProbability := range []float64{0.001, 0.01, 0.1} {
   241  			for _, nBatches := range []int{1 << 2, 1 << 6} {
   242  				for _, nCols := range []int{2, 4} {
   243  					typs := make([]*types.T, nCols)
   244  					for i := range typs {
   245  						typs[i] = types.Int
   246  					}
   247  					batch := testAllocator.NewMemBatch(typs)
   248  					batch.SetLength(coldata.BatchSize())
   249  					distinctCols := []uint32{0, 1, 2, 3}[:nCols]
   250  					// We have the following equation:
   251  					//   newTupleProbability = 1 - (1 - newValueProbability) ^ nCols,
   252  					// so applying some manipulations we get:
   253  					//   newValueProbability = 1 - (1 - newTupleProbability) ^ (1 / nCols).
   254  					newValueProbability := 1.0 - math.Pow(1-newTupleProbability, 1.0/float64(nCols))
   255  					for i := range distinctCols {
   256  						col := batch.ColVec(i).Int64()
   257  						col[0] = 0
   258  						for j := 1; j < coldata.BatchSize(); j++ {
   259  							col[j] = col[j-1]
   260  							if rng.Float64() < newValueProbability {
   261  								col[j]++
   262  							}
   263  						}
   264  						nulls := batch.ColVec(i).Nulls()
   265  						if hasNulls {
   266  							nulls.SetNull(0)
   267  						} else {
   268  							nulls.UnsetNulls()
   269  						}
   270  					}
   271  					for distinctIdx, distinctConstructor := range distinctConstructors {
   272  						numOrderedCols := int(float64(nCols) * orderedColsFraction[distinctIdx])
   273  						b.Run(
   274  							fmt.Sprintf("%s/hasNulls=%v/newTupleProbability=%.3f/rows=%d/cols=%d/ordCols=%d",
   275  								distinctNames[distinctIdx], hasNulls, newTupleProbability,
   276  								nBatches*coldata.BatchSize(), nCols, numOrderedCols,
   277  							),
   278  							func(b *testing.B) {
   279  								b.SetBytes(int64(8 * nBatches * coldata.BatchSize() * nCols))
   280  								b.ResetTimer()
   281  								for n := 0; n < b.N; n++ {
   282  									// Note that the source will be ordered on all nCols so that the
   283  									// number of distinct tuples doesn't vary between different
   284  									// distinct operator variations.
   285  									source := newFiniteChunksSource(batch, typs, nBatches, nCols)
   286  									distinct, err := distinctConstructor(testAllocator, source, distinctCols, numOrderedCols, typs)
   287  									if err != nil {
   288  										b.Fatal(err)
   289  									}
   290  									distinct.Init()
   291  									for b := distinct.Next(ctx); b.Length() > 0; b = distinct.Next(ctx) {
   292  									}
   293  								}
   294  								b.StopTimer()
   295  							})
   296  					}
   297  				}
   298  			}
   299  		}
   300  	}
   301  }