github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/distinct_test.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package colexec 12 13 import ( 14 "context" 15 "fmt" 16 "math" 17 "testing" 18 19 "github.com/cockroachdb/cockroach/pkg/col/coldata" 20 "github.com/cockroachdb/cockroach/pkg/sql/colexecbase" 21 "github.com/cockroachdb/cockroach/pkg/sql/colmem" 22 "github.com/cockroachdb/cockroach/pkg/sql/types" 23 "github.com/cockroachdb/cockroach/pkg/util/leaktest" 24 "github.com/cockroachdb/cockroach/pkg/util/randutil" 25 ) 26 27 func TestDistinct(t *testing.T) { 28 defer leaktest.AfterTest(t)() 29 rng, _ := randutil.NewPseudoRand() 30 tcs := []struct { 31 distinctCols []uint32 32 typs []*types.T 33 tuples []tuple 34 expected []tuple 35 isOrderedOnDistinctCols bool 36 }{ 37 { 38 distinctCols: []uint32{0, 1, 2}, 39 typs: []*types.T{types.Float, types.Int, types.String, types.Int}, 40 tuples: tuples{ 41 {nil, nil, nil, nil}, 42 {nil, nil, nil, nil}, 43 {nil, nil, "30", nil}, 44 {1.0, 2, "30", 4}, 45 {1.0, 2, "30", 4}, 46 {2.0, 2, "30", 4}, 47 {2.0, 3, "30", 4}, 48 {2.0, 3, "40", 4}, 49 {2.0, 3, "40", 4}, 50 }, 51 expected: tuples{ 52 {nil, nil, nil, nil}, 53 {nil, nil, "30", nil}, 54 {1.0, 2, "30", 4}, 55 {2.0, 2, "30", 4}, 56 {2.0, 3, "30", 4}, 57 {2.0, 3, "40", 4}, 58 }, 59 isOrderedOnDistinctCols: true, 60 }, 61 { 62 distinctCols: []uint32{1, 0, 2}, 63 typs: []*types.T{types.Float, types.Int, types.Bytes, types.Int}, 64 tuples: tuples{ 65 {nil, nil, nil, nil}, 66 {nil, nil, nil, nil}, 67 {nil, nil, "30", nil}, 68 {1.0, 2, "30", 4}, 69 {1.0, 2, "30", 4}, 70 {2.0, 2, "30", 4}, 71 {2.0, 3, "30", 4}, 72 {2.0, 3, "40", 4}, 73 {2.0, 3, "40", 4}, 74 }, 75 expected: tuples{ 76 {nil, nil, nil, nil}, 77 {nil, nil, "30", nil}, 78 {1.0, 2, "30", 4}, 79 {2.0, 2, "30", 4}, 80 {2.0, 3, "30", 4}, 81 {2.0, 3, "40", 4}, 82 }, 83 isOrderedOnDistinctCols: true, 84 }, 85 { 86 distinctCols: []uint32{0, 1, 2}, 87 typs: []*types.T{types.Float, types.Int, types.String, types.Int}, 88 tuples: tuples{ 89 {1.0, 2, "30", 4}, 90 {1.0, 2, "30", 4}, 91 {nil, nil, nil, nil}, 92 {nil, nil, nil, nil}, 93 {2.0, 2, "30", 4}, 94 {2.0, 3, "30", 4}, 95 {nil, nil, "30", nil}, 96 {2.0, 3, "40", 4}, 97 {2.0, 3, "40", 4}, 98 }, 99 expected: tuples{ 100 {1.0, 2, "30", 4}, 101 {nil, nil, nil, nil}, 102 {2.0, 2, "30", 4}, 103 {2.0, 3, "30", 4}, 104 {nil, nil, "30", nil}, 105 {2.0, 3, "40", 4}, 106 }, 107 }, 108 { 109 distinctCols: []uint32{0}, 110 typs: []*types.T{types.Int, types.Bytes}, 111 tuples: tuples{ 112 {1, "a"}, 113 {2, "b"}, 114 {3, "c"}, 115 {nil, "d"}, 116 {5, "e"}, 117 {6, "f"}, 118 {1, "1"}, 119 {2, "2"}, 120 {3, "3"}, 121 }, 122 expected: tuples{ 123 {1, "a"}, 124 {2, "b"}, 125 {3, "c"}, 126 {nil, "d"}, 127 {5, "e"}, 128 {6, "f"}, 129 }, 130 }, 131 { 132 // This is to test hashTable deduplication with various batch size 133 // boundaries and ensure it always emits the first tuple it encountered. 134 distinctCols: []uint32{0}, 135 typs: []*types.T{types.Int, types.String}, 136 tuples: tuples{ 137 {1, "1"}, 138 {1, "2"}, 139 {1, "3"}, 140 {1, "4"}, 141 {1, "5"}, 142 {2, "6"}, 143 {2, "7"}, 144 {2, "8"}, 145 {2, "9"}, 146 {2, "10"}, 147 {0, "11"}, 148 {0, "12"}, 149 {0, "13"}, 150 {1, "14"}, 151 {1, "15"}, 152 {1, "16"}, 153 }, 154 expected: tuples{ 155 {1, "1"}, 156 {2, "6"}, 157 {0, "11"}, 158 }, 159 }, 160 { 161 distinctCols: []uint32{0}, 162 typs: []*types.T{types.Jsonb, types.String}, 163 tuples: tuples{ 164 {`{"id": 1}`, "a"}, 165 {`{"id": 2}`, "b"}, 166 {`{"id": 3}`, "c"}, 167 {`{"id": 1}`, "1"}, 168 {`{"id": null}`, "d"}, 169 {`{"id": 2}`, "2"}, 170 {`{"id": 5}`, "e"}, 171 {`{"id": 6}`, "f"}, 172 {`{"id": 3}`, "3"}, 173 }, 174 expected: tuples{ 175 {`{"id": 1}`, "a"}, 176 {`{"id": 2}`, "b"}, 177 {`{"id": 3}`, "c"}, 178 {`{"id": null}`, "d"}, 179 {`{"id": 5}`, "e"}, 180 {`{"id": 6}`, "f"}, 181 }, 182 }, 183 } 184 185 for _, tc := range tcs { 186 for _, numOfBuckets := range []uint64{1, 3, 5, hashTableNumBuckets} { 187 t.Run(fmt.Sprintf("unordered/numOfBuckets=%d", numOfBuckets), func(t *testing.T) { 188 runTestsWithTyps(t, []tuples{tc.tuples}, [][]*types.T{tc.typs}, tc.expected, orderedVerifier, 189 func(input []colexecbase.Operator) (colexecbase.Operator, error) { 190 return NewUnorderedDistinct( 191 testAllocator, input[0], tc.distinctCols, tc.typs, 192 numOfBuckets), nil 193 }) 194 }) 195 } 196 if tc.isOrderedOnDistinctCols { 197 for numOrderedCols := 1; numOrderedCols < len(tc.distinctCols); numOrderedCols++ { 198 t.Run(fmt.Sprintf("partiallyOrdered/ordCols=%d", numOrderedCols), func(t *testing.T) { 199 orderedCols := make([]uint32, numOrderedCols) 200 for i, j := range rng.Perm(len(tc.distinctCols))[:numOrderedCols] { 201 orderedCols[i] = tc.distinctCols[j] 202 } 203 runTestsWithTyps(t, []tuples{tc.tuples}, [][]*types.T{tc.typs}, tc.expected, orderedVerifier, 204 func(input []colexecbase.Operator) (colexecbase.Operator, error) { 205 return newPartiallyOrderedDistinct( 206 testAllocator, input[0], tc.distinctCols, 207 orderedCols, tc.typs, 208 ) 209 }) 210 }) 211 } 212 t.Run("ordered", func(t *testing.T) { 213 runTestsWithTyps(t, []tuples{tc.tuples}, [][]*types.T{tc.typs}, tc.expected, orderedVerifier, 214 func(input []colexecbase.Operator) (colexecbase.Operator, error) { 215 return NewOrderedDistinct(input[0], tc.distinctCols, tc.typs) 216 }) 217 }) 218 } 219 } 220 } 221 222 func BenchmarkDistinct(b *testing.B) { 223 rng, _ := randutil.NewPseudoRand() 224 ctx := context.Background() 225 226 distinctConstructors := []func(*colmem.Allocator, colexecbase.Operator, []uint32, int, []*types.T) (colexecbase.Operator, error){ 227 func(allocator *colmem.Allocator, input colexecbase.Operator, distinctCols []uint32, numOrderedCols int, typs []*types.T) (colexecbase.Operator, error) { 228 return NewUnorderedDistinct(allocator, input, distinctCols, typs, hashTableNumBuckets), nil 229 }, 230 func(allocator *colmem.Allocator, input colexecbase.Operator, distinctCols []uint32, numOrderedCols int, typs []*types.T) (colexecbase.Operator, error) { 231 return newPartiallyOrderedDistinct(allocator, input, distinctCols, distinctCols[:numOrderedCols], typs) 232 }, 233 func(allocator *colmem.Allocator, input colexecbase.Operator, distinctCols []uint32, numOrderedCols int, typs []*types.T) (colexecbase.Operator, error) { 234 return NewOrderedDistinct(input, distinctCols, typs) 235 }, 236 } 237 distinctNames := []string{"Unordered", "PartiallyOrdered", "Ordered"} 238 orderedColsFraction := []float64{0, 0.5, 1.0} 239 for _, hasNulls := range []bool{false, true} { 240 for _, newTupleProbability := range []float64{0.001, 0.01, 0.1} { 241 for _, nBatches := range []int{1 << 2, 1 << 6} { 242 for _, nCols := range []int{2, 4} { 243 typs := make([]*types.T, nCols) 244 for i := range typs { 245 typs[i] = types.Int 246 } 247 batch := testAllocator.NewMemBatch(typs) 248 batch.SetLength(coldata.BatchSize()) 249 distinctCols := []uint32{0, 1, 2, 3}[:nCols] 250 // We have the following equation: 251 // newTupleProbability = 1 - (1 - newValueProbability) ^ nCols, 252 // so applying some manipulations we get: 253 // newValueProbability = 1 - (1 - newTupleProbability) ^ (1 / nCols). 254 newValueProbability := 1.0 - math.Pow(1-newTupleProbability, 1.0/float64(nCols)) 255 for i := range distinctCols { 256 col := batch.ColVec(i).Int64() 257 col[0] = 0 258 for j := 1; j < coldata.BatchSize(); j++ { 259 col[j] = col[j-1] 260 if rng.Float64() < newValueProbability { 261 col[j]++ 262 } 263 } 264 nulls := batch.ColVec(i).Nulls() 265 if hasNulls { 266 nulls.SetNull(0) 267 } else { 268 nulls.UnsetNulls() 269 } 270 } 271 for distinctIdx, distinctConstructor := range distinctConstructors { 272 numOrderedCols := int(float64(nCols) * orderedColsFraction[distinctIdx]) 273 b.Run( 274 fmt.Sprintf("%s/hasNulls=%v/newTupleProbability=%.3f/rows=%d/cols=%d/ordCols=%d", 275 distinctNames[distinctIdx], hasNulls, newTupleProbability, 276 nBatches*coldata.BatchSize(), nCols, numOrderedCols, 277 ), 278 func(b *testing.B) { 279 b.SetBytes(int64(8 * nBatches * coldata.BatchSize() * nCols)) 280 b.ResetTimer() 281 for n := 0; n < b.N; n++ { 282 // Note that the source will be ordered on all nCols so that the 283 // number of distinct tuples doesn't vary between different 284 // distinct operator variations. 285 source := newFiniteChunksSource(batch, typs, nBatches, nCols) 286 distinct, err := distinctConstructor(testAllocator, source, distinctCols, numOrderedCols, typs) 287 if err != nil { 288 b.Fatal(err) 289 } 290 distinct.Init() 291 for b := distinct.Next(ctx); b.Length() > 0; b = distinct.Next(ctx) { 292 } 293 } 294 b.StopTimer() 295 }) 296 } 297 } 298 } 299 } 300 } 301 }