github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowexec/sampler_test.go (about) 1 // Copyright 2016 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package rowexec 12 13 import ( 14 "context" 15 "fmt" 16 "testing" 17 18 "github.com/axiomhq/hyperloglog" 19 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 20 "github.com/cockroachdb/cockroach/pkg/sql/execinfra" 21 "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb" 22 "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" 23 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 24 "github.com/cockroachdb/cockroach/pkg/sql/types" 25 "github.com/cockroachdb/cockroach/pkg/testutils/distsqlutils" 26 "github.com/cockroachdb/cockroach/pkg/util/leaktest" 27 ) 28 29 // runSampler runs the sampler aggregator on numRows and returns numSamples rows. 30 func runSampler( 31 t *testing.T, numRows, numSamples int, memLimitBytes int64, expectOutOfMemory bool, 32 ) []int { 33 rows := make([]sqlbase.EncDatumRow, numRows) 34 for i := range rows { 35 rows[i] = sqlbase.EncDatumRow{sqlbase.IntEncDatum(i)} 36 } 37 in := distsqlutils.NewRowBuffer(sqlbase.OneIntCol, rows, distsqlutils.RowBufferArgs{}) 38 outTypes := []*types.T{ 39 types.Int, // original column 40 types.Int, // rank 41 types.Int, // sketch index 42 types.Int, // num rows 43 types.Int, // null vals 44 types.Bytes, 45 } 46 47 out := distsqlutils.NewRowBuffer(outTypes, nil /* rows */, distsqlutils.RowBufferArgs{}) 48 49 st := cluster.MakeTestingClusterSettings() 50 evalCtx := tree.MakeTestingEvalContext(st) 51 defer evalCtx.Stop(context.Background()) 52 flowCtx := execinfra.FlowCtx{ 53 Cfg: &execinfra.ServerConfig{Settings: st}, 54 EvalCtx: &evalCtx, 55 } 56 // Override the default memory limit. If memLimitBytes is small but 57 // non-zero, the processor will hit this limit and disable sampling. 58 flowCtx.Cfg.TestingKnobs.MemoryLimitBytes = memLimitBytes 59 60 spec := &execinfrapb.SamplerSpec{ 61 Sketches: []execinfrapb.SketchSpec{ 62 { 63 SketchType: execinfrapb.SketchType_HLL_PLUS_PLUS_V1, 64 Columns: []uint32{0}, 65 GenerateHistogram: true, 66 }, 67 }, 68 SampleSize: uint32(numSamples), 69 } 70 p, err := newSamplerProcessor( 71 &flowCtx, 0 /* processorID */, spec, in, &execinfrapb.PostProcessSpec{}, out, 72 ) 73 if err != nil { 74 t.Fatal(err) 75 } 76 p.Run(context.Background()) 77 78 // Verify we have numSamples distinct rows. 79 res := make([]int, 0, numSamples) 80 seen := make(map[tree.DInt]bool) 81 histogramDisabled := false 82 n := 0 83 for { 84 row, meta := out.Next() 85 if meta != nil { 86 if meta.SamplerProgress == nil { 87 t.Fatalf("unexpected metadata: %v", meta) 88 } 89 if meta.SamplerProgress.HistogramDisabled { 90 histogramDisabled = true 91 } 92 continue 93 } else if row == nil { 94 break 95 } 96 if row[0].IsNull() { 97 // This is a sketch row. 98 continue 99 } 100 for i := 2; i < len(outTypes); i++ { 101 if !row[i].IsNull() { 102 t.Fatalf("expected NULL on column %d, got %s", i, row[i].Datum) 103 } 104 } 105 v := *row[0].Datum.(*tree.DInt) 106 if seen[v] { 107 t.Fatalf("duplicate row %d", v) 108 } 109 seen[v] = true 110 res = append(res, int(v)) 111 n++ 112 } 113 if expectOutOfMemory { 114 if !histogramDisabled { 115 t.Fatal("expected processor to disable histogram collection") 116 } 117 } else if n != numSamples { 118 t.Fatalf("expected %d rows, got %d", numSamples, n) 119 } 120 return res 121 } 122 123 func TestSampler(t *testing.T) { 124 defer leaktest.AfterTest(t)() 125 126 // We run many samplings and record the frequencies. 127 numRows := 100 128 numSamples := 20 129 minRuns := 200 130 maxRuns := 5000 131 delta := 0.5 132 133 freq := make([]int, numRows) 134 var err error 135 // Instead of doing maxRuns and checking at the end, we do minRuns at a time 136 // and exit early. This speeds up the test. 137 for r := 0; r < maxRuns; r += minRuns { 138 for i := 0; i < minRuns; i++ { 139 for _, v := range runSampler( 140 t, numRows, numSamples, 0 /* memLimitBytes */, false, /* expectOutOfMemory */ 141 ) { 142 freq[v]++ 143 } 144 } 145 146 // The expected frequency of each row is f = numRuns * (numSamples / numRows). 147 f := float64(r) * float64(numSamples) / float64(numRows) 148 149 // Verify that no frequency is outside of the range (f / (1+delta), f * (1+delta)); 150 // the probability of a given row violating this is subject to the Chernoff 151 // bound which decreases exponentially (with exponent f). 152 err = nil 153 for i := range freq { 154 if float64(freq[i]) < f/(1+delta) || float64(freq[i]) > f*(1+delta) { 155 err = fmt.Errorf("frequency %d out of bound (expected value %f)", freq[i], f) 156 break 157 } 158 } 159 if err == nil { 160 return 161 } 162 } 163 t.Error(err) 164 } 165 166 func TestSamplerMemoryLimit(t *testing.T) { 167 defer leaktest.AfterTest(t)() 168 numRows := 100 169 numSamples := 20 170 171 runSampler(t, numRows, numSamples, 0 /* memLimitBytes */, false /* expectOutOfMemory */) 172 runSampler(t, numRows, numSamples, 1 /* memLimitBytes */, true /* expectOutOfMemory */) 173 runSampler(t, numRows, numSamples, 20 /* memLimitBytes */, true /* expectOutOfMemory */) 174 runSampler(t, numRows, numSamples, 20*1024 /* memLimitBytes */, false /* expectOutOfMemory */) 175 } 176 177 func TestSamplerSketch(t *testing.T) { 178 defer leaktest.AfterTest(t)() 179 180 inputRows := [][]int{ 181 {1, 1}, 182 {2, 2}, 183 {1, 3}, 184 {2, 4}, 185 {1, 5}, 186 {2, 6}, 187 {1, 7}, 188 {2, 8}, 189 {-1, 1}, 190 {-1, 3}, 191 {1, -1}, 192 {2, 8}, 193 {-1, 1}, 194 {-1, -1}, 195 } 196 cardinalities := []int{3, 9, 12} 197 numNulls := []int{4, 2, 1} 198 199 rows := sqlbase.GenEncDatumRowsInt(inputRows) 200 in := distsqlutils.NewRowBuffer(sqlbase.TwoIntCols, rows, distsqlutils.RowBufferArgs{}) 201 outTypes := []*types.T{ 202 types.Int, // original column 203 types.Int, // original column 204 types.Int, // rank 205 types.Int, // sketch index 206 types.Int, // num rows 207 types.Int, // null vals 208 types.Bytes, // sketch data 209 } 210 211 out := distsqlutils.NewRowBuffer(outTypes, nil /* rows */, distsqlutils.RowBufferArgs{}) 212 213 st := cluster.MakeTestingClusterSettings() 214 evalCtx := tree.MakeTestingEvalContext(st) 215 defer evalCtx.Stop(context.Background()) 216 flowCtx := execinfra.FlowCtx{ 217 Cfg: &execinfra.ServerConfig{Settings: st}, 218 EvalCtx: &evalCtx, 219 } 220 221 spec := &execinfrapb.SamplerSpec{ 222 SampleSize: uint32(1), 223 Sketches: []execinfrapb.SketchSpec{ 224 { 225 SketchType: execinfrapb.SketchType_HLL_PLUS_PLUS_V1, 226 Columns: []uint32{0}, 227 }, 228 { 229 SketchType: execinfrapb.SketchType_HLL_PLUS_PLUS_V1, 230 Columns: []uint32{1}, 231 }, 232 { 233 SketchType: execinfrapb.SketchType_HLL_PLUS_PLUS_V1, 234 Columns: []uint32{0, 1}, 235 }, 236 }, 237 } 238 p, err := newSamplerProcessor(&flowCtx, 0 /* processorID */, spec, in, &execinfrapb.PostProcessSpec{}, out) 239 if err != nil { 240 t.Fatal(err) 241 } 242 p.Run(context.Background()) 243 244 // Collect the rows, excluding metadata. 245 rows = rows[:0] 246 for { 247 row, meta := out.Next() 248 if meta != nil { 249 if meta.SamplerProgress == nil { 250 t.Fatalf("unexpected metadata: %v", meta) 251 } 252 continue 253 } else if row == nil { 254 break 255 } 256 rows = append(rows, row) 257 } 258 259 // We expect one sampled row and three sketch rows. 260 if len(rows) != 4 { 261 t.Fatalf("expected 4 rows, got %v\n", rows.String(outTypes)) 262 } 263 rows = rows[1:] 264 265 for sketchIdx, r := range rows { 266 // First three columns are for sampled rows. 267 for i := 0; i < 3; i++ { 268 if !r[i].IsNull() { 269 t.Errorf("expected NULL on column %d, got %s", i, r[i].Datum) 270 } 271 } 272 if v := int(*r[3].Datum.(*tree.DInt)); v != sketchIdx { 273 t.Errorf("expected sketch index %d, got %d", sketchIdx, v) 274 } 275 if v := int(*r[4].Datum.(*tree.DInt)); v != len(inputRows) { 276 t.Errorf("expected numRows %d, got %d", len(inputRows), v) 277 } 278 if v := int(*r[5].Datum.(*tree.DInt)); v != numNulls[sketchIdx] { 279 t.Errorf("expected numNulls %d, got %d", numNulls[sketchIdx], v) 280 } 281 data := []byte(*r[6].Datum.(*tree.DBytes)) 282 var s hyperloglog.Sketch 283 if err := s.UnmarshalBinary(data); err != nil { 284 t.Fatal(err) 285 } 286 // HLL++ should be exact on small datasets. 287 if v := int(s.Estimate()); v != cardinalities[sketchIdx] { 288 t.Errorf("expected cardinality %d, got %d", cardinalities[sketchIdx], v) 289 } 290 } 291 }