github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowexec/sampler_test.go

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package rowexec
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"testing"
    17  
    18  	"github.com/axiomhq/hyperloglog"
    19  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    25  	"github.com/cockroachdb/cockroach/pkg/testutils/distsqlutils"
    26  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    27  )
    28  
    29  // runSampler runs a sampler processor on numRows input rows and returns the values of the (up to numSamples) sampled rows.
    30  func runSampler(
    31  	t *testing.T, numRows, numSamples int, memLimitBytes int64, expectOutOfMemory bool,
    32  ) []int {
    33  	rows := make([]sqlbase.EncDatumRow, numRows)
    34  	for i := range rows {
    35  		rows[i] = sqlbase.EncDatumRow{sqlbase.IntEncDatum(i)}
    36  	}
    37  	in := distsqlutils.NewRowBuffer(sqlbase.OneIntCol, rows, distsqlutils.RowBufferArgs{})
    38  	outTypes := []*types.T{
    39  		types.Int, // original column
    40  		types.Int, // rank
    41  		types.Int, // sketch index
    42  		types.Int, // num rows
    43  		types.Int, // null vals
    44  	types.Bytes, // sketch data
    45  	}
    46  
    47  	out := distsqlutils.NewRowBuffer(outTypes, nil /* rows */, distsqlutils.RowBufferArgs{})
    48  
    49  	st := cluster.MakeTestingClusterSettings()
    50  	evalCtx := tree.MakeTestingEvalContext(st)
    51  	defer evalCtx.Stop(context.Background())
    52  	flowCtx := execinfra.FlowCtx{
    53  		Cfg:     &execinfra.ServerConfig{Settings: st},
    54  		EvalCtx: &evalCtx,
    55  	}
    56  	// Override the default memory limit. If memLimitBytes is small but
    57  	// non-zero, the processor will hit this limit and disable histogram sampling.
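        	// TestSamplerMemoryLimit below exercises this with 1-byte and 20-byte limits.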
    58  	flowCtx.Cfg.TestingKnobs.MemoryLimitBytes = memLimitBytes
    59  
    60  	spec := &execinfrapb.SamplerSpec{
    61  		Sketches: []execinfrapb.SketchSpec{
    62  			{
    63  				SketchType:        execinfrapb.SketchType_HLL_PLUS_PLUS_V1,
    64  				Columns:           []uint32{0},
    65  				GenerateHistogram: true,
    66  			},
    67  		},
    68  		SampleSize: uint32(numSamples),
    69  	}
    70  	p, err := newSamplerProcessor(
    71  		&flowCtx, 0 /* processorID */, spec, in, &execinfrapb.PostProcessSpec{}, out,
    72  	)
    73  	if err != nil {
    74  		t.Fatal(err)
    75  	}
    76  	p.Run(context.Background())
    77  
    78  	// Verify we have numSamples distinct rows.
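        	// Sampled rows carry the original value in column 0 (and a rank in column 1);
        	// sketch rows have a NULL there and are skipped.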
    79  	res := make([]int, 0, numSamples)
    80  	seen := make(map[tree.DInt]bool)
    81  	histogramDisabled := false
    82  	n := 0
    83  	for {
    84  		row, meta := out.Next()
    85  		if meta != nil {
    86  			if meta.SamplerProgress == nil {
    87  				t.Fatalf("unexpected metadata: %v", meta)
    88  			}
    89  			if meta.SamplerProgress.HistogramDisabled {
    90  				histogramDisabled = true
    91  			}
    92  			continue
    93  		} else if row == nil {
    94  			break
    95  		}
    96  		if row[0].IsNull() {
    97  			// This is a sketch row.
    98  			continue
    99  		}
   100  		for i := 2; i < len(outTypes); i++ {
   101  			if !row[i].IsNull() {
   102  				t.Fatalf("expected NULL on column %d, got %s", i, row[i].Datum)
   103  			}
   104  		}
   105  		v := *row[0].Datum.(*tree.DInt)
   106  		if seen[v] {
   107  			t.Fatalf("duplicate row %d", v)
   108  		}
   109  		seen[v] = true
   110  		res = append(res, int(v))
   111  		n++
   112  	}
   113  	if expectOutOfMemory {
   114  		if !histogramDisabled {
   115  			t.Fatal("expected processor to disable histogram collection")
   116  		}
   117  	} else if n != numSamples {
   118  		t.Fatalf("expected %d rows, got %d", numSamples, n)
   119  	}
   120  	return res
   121  }
   122  
   123  func TestSampler(t *testing.T) {
   124  	defer leaktest.AfterTest(t)()
   125  
   126  	// We run many samplings and record the frequencies.
   127  	numRows := 100
   128  	numSamples := 20
   129  	minRuns := 200
   130  	maxRuns := 5000
   131  	delta := 0.5
   132  
   133  	freq := make([]int, numRows)
   134  	var err error
   135  	// Instead of doing maxRuns and checking at the end, we do minRuns at a time
   136  	// and exit early once the observed frequencies are within bounds. This speeds up the test.
   137  	for r := 0; r < maxRuns; r += minRuns {
   138  		for i := 0; i < minRuns; i++ {
   139  			for _, v := range runSampler(
   140  				t, numRows, numSamples, 0 /* memLimitBytes */, false, /* expectOutOfMemory */
   141  			) {
   142  				freq[v]++
   143  			}
   144  		}
   145  
   146  		// The expected frequency of each row is f = numRuns * numSamples / numRows, where numRuns = r + minRuns.
   147  		f := float64(r+minRuns) * float64(numSamples) / float64(numRows)
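        		// For example, after the first batch of 200 runs, f = 200*20/100 = 40.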
   148  
   149  		// Verify that no frequency is outside of the range (f / (1+delta), f * (1+delta));
   150  		// the probability of a given row violating this is subject to the Chernoff
   151  		// bound which decreases exponentially (with exponent f).
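        		// (Roughly exp(-delta^2*f/3) for the upper tail, by the multiplicative Chernoff bound.)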
   152  		err = nil
   153  		for i := range freq {
   154  			if float64(freq[i]) < f/(1+delta) || float64(freq[i]) > f*(1+delta) {
   155  				err = fmt.Errorf("frequency %d out of bound (expected value %f)", freq[i], f)
   156  				break
   157  			}
   158  		}
   159  		if err == nil {
   160  			return
   161  		}
   162  	}
   163  	t.Error(err)
   164  }
   165  
   166  func TestSamplerMemoryLimit(t *testing.T) {
   167  	defer leaktest.AfterTest(t)()
   168  	numRows := 100
   169  	numSamples := 20
   170  
   171  	runSampler(t, numRows, numSamples, 0 /* memLimitBytes */, false /* expectOutOfMemory */)
   172  	runSampler(t, numRows, numSamples, 1 /* memLimitBytes */, true /* expectOutOfMemory */)
   173  	runSampler(t, numRows, numSamples, 20 /* memLimitBytes */, true /* expectOutOfMemory */)
   174  	runSampler(t, numRows, numSamples, 20*1024 /* memLimitBytes */, false /* expectOutOfMemory */)
   175  }
   176  
   177  func TestSamplerSketch(t *testing.T) {
   178  	defer leaktest.AfterTest(t)()
   179  
   180  	inputRows := [][]int{
   181  		{1, 1},
   182  		{2, 2},
   183  		{1, 3},
   184  		{2, 4},
   185  		{1, 5},
   186  		{2, 6},
   187  		{1, 7},
   188  		{2, 8},
   189  		{-1, 1},
   190  		{-1, 3},
   191  		{1, -1},
   192  		{2, 8},
   193  		{-1, 1},
   194  		{-1, -1},
   195  	}
   196  	cardinalities := []int{3, 9, 12}
   197  	numNulls := []int{4, 2, 1}
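        	// A -1 in inputRows becomes a NULL datum, so e.g. column 0 has 3 distinct
        	// values ({1, 2, NULL}) and is NULL in 4 rows; the last numNulls entry (1)
        	// corresponds to the single row where both columns are NULL.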
   198  
   199  	rows := sqlbase.GenEncDatumRowsInt(inputRows)
   200  	in := distsqlutils.NewRowBuffer(sqlbase.TwoIntCols, rows, distsqlutils.RowBufferArgs{})
   201  	outTypes := []*types.T{
   202  		types.Int,   // original column
   203  		types.Int,   // original column
   204  		types.Int,   // rank
   205  		types.Int,   // sketch index
   206  		types.Int,   // num rows
   207  		types.Int,   // null vals
   208  		types.Bytes, // sketch data
   209  	}
   210  
   211  	out := distsqlutils.NewRowBuffer(outTypes, nil /* rows */, distsqlutils.RowBufferArgs{})
   212  
   213  	st := cluster.MakeTestingClusterSettings()
   214  	evalCtx := tree.MakeTestingEvalContext(st)
   215  	defer evalCtx.Stop(context.Background())
   216  	flowCtx := execinfra.FlowCtx{
   217  		Cfg:     &execinfra.ServerConfig{Settings: st},
   218  		EvalCtx: &evalCtx,
   219  	}
   220  
   221  	spec := &execinfrapb.SamplerSpec{
   222  		SampleSize: uint32(1),
   223  		Sketches: []execinfrapb.SketchSpec{
   224  			{
   225  				SketchType: execinfrapb.SketchType_HLL_PLUS_PLUS_V1,
   226  				Columns:    []uint32{0},
   227  			},
   228  			{
   229  				SketchType: execinfrapb.SketchType_HLL_PLUS_PLUS_V1,
   230  				Columns:    []uint32{1},
   231  			},
   232  			{
   233  				SketchType: execinfrapb.SketchType_HLL_PLUS_PLUS_V1,
   234  				Columns:    []uint32{0, 1},
   235  			},
   236  		},
   237  	}
   238  	p, err := newSamplerProcessor(&flowCtx, 0 /* processorID */, spec, in, &execinfrapb.PostProcessSpec{}, out)
   239  	if err != nil {
   240  		t.Fatal(err)
   241  	}
   242  	p.Run(context.Background())
   243  
   244  	// Collect the rows, excluding metadata.
   245  	rows = rows[:0]
   246  	for {
   247  		row, meta := out.Next()
   248  		if meta != nil {
   249  			if meta.SamplerProgress == nil {
   250  				t.Fatalf("unexpected metadata: %v", meta)
   251  			}
   252  			continue
   253  		} else if row == nil {
   254  			break
   255  		}
   256  		rows = append(rows, row)
   257  	}
   258  
   259  	// We expect one sampled row and three sketch rows.
   260  	if len(rows) != 4 {
   261  		t.Fatalf("expected 4 rows, got %v\n", rows.String(outTypes))
   262  	}
   263  	rows = rows[1:]
   264  
   265  	for sketchIdx, r := range rows {
   266  		// First three columns are for sampled rows.
   267  		for i := 0; i < 3; i++ {
   268  			if !r[i].IsNull() {
   269  				t.Errorf("expected NULL on column %d, got %s", i, r[i].Datum)
   270  			}
   271  		}
   272  		if v := int(*r[3].Datum.(*tree.DInt)); v != sketchIdx {
   273  			t.Errorf("expected sketch index %d, got %d", sketchIdx, v)
   274  		}
   275  		if v := int(*r[4].Datum.(*tree.DInt)); v != len(inputRows) {
   276  			t.Errorf("expected numRows %d, got %d", len(inputRows), v)
   277  		}
   278  		if v := int(*r[5].Datum.(*tree.DInt)); v != numNulls[sketchIdx] {
   279  			t.Errorf("expected numNulls %d, got %d", numNulls[sketchIdx], v)
   280  		}
   281  		data := []byte(*r[6].Datum.(*tree.DBytes))
   282  		var s hyperloglog.Sketch
   283  		if err := s.UnmarshalBinary(data); err != nil {
   284  			t.Fatal(err)
   285  		}
   286  		// HLL++ should be exact on small datasets.
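        		// (Here there are at most 12 distinct values across 14 input rows.)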
   287  		if v := int(s.Estimate()); v != cardinalities[sketchIdx] {
   288  			t.Errorf("expected cardinality %d, got %d", cardinalities[sketchIdx], v)
   289  		}
   290  	}
   291  }