github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/distsql_plan_stats.go

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package sql

import (
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/span"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/stats"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/logtags"
)

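// requestedStat describes a single statistic to collect: the columns it
// covers, whether to build a histogram (and with at most how many buckets),
// and the name under which the result is stored.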
type requestedStat struct {
	columns             []sqlbase.ColumnID
	histogram           bool
	histogramMaxBuckets int
	name                string
}

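// histogramSamples is the size of the row sample reservoir kept by each
// sampler when a histogram is requested; histogramBuckets caps the number of
// buckets in a generated histogram.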
const histogramSamples = 10000
const histogramBuckets = 200

// maxTimestampAge is the maximum allowed age of a scan timestamp during table
// stats collection, used when creating statistics AS OF SYSTEM TIME. The
// timestamp is advanced during long operations as needed. See TableReaderSpec.
//
// The lowest TTL we recommend is 10 minutes. This value must be lower than
// that.
var maxTimestampAge = settings.RegisterDurationSetting(
	"sql.stats.max_timestamp_age",
	"maximum age of timestamp during table statistics collection",
	5*time.Minute,
)

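// createStatsPlan builds the physical plan for computing the requested
// statistics: a full scan of the table feeds a Sampler on each node, and the
// samplers' output is merged by a single SampleAggregator, which computes the
// statistics and writes them out. Schematically:
//
//	TableReader -> Sampler \
//	TableReader -> Sampler  -> SampleAggregator
//	TableReader -> Sampler /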
func (dsp *DistSQLPlanner) createStatsPlan(
	planCtx *PlanningCtx,
	desc *sqlbase.ImmutableTableDescriptor,
	reqStats []requestedStat,
	job *jobs.Job,
) (*PhysicalPlan, error) {
	if len(reqStats) == 0 {
		return nil, errors.New("no stats requested")
	}

	details := job.Details().(jobspb.CreateStatsDetails)

	// Calculate the set of columns we need to scan.
	var colCfg scanColumnsConfig
	var tableColSet util.FastIntSet
	for _, s := range reqStats {
		for _, c := range s.columns {
			if !tableColSet.Contains(int(c)) {
				tableColSet.Add(int(c))
				colCfg.wantedColumns = append(colCfg.wantedColumns, tree.ColumnID(c))
			}
		}
	}

	// Create the table readers; for this we initialize a dummy scanNode.
	scan := scanNode{desc: desc}
	err := scan.initDescDefaults(colCfg)
	if err != nil {
		return nil, err
	}
	sb := span.MakeBuilder(planCtx.planner.ExecCfg().Codec, desc.TableDesc(), scan.index)
	scan.spans, err = sb.UnconstrainedSpans()
	if err != nil {
		return nil, err
	}
	scan.isFull = true

	p, err := dsp.createTableReaders(planCtx, &scan)
	if err != nil {
		return nil, err
	}

	if details.AsOf != nil {
		// If the read is historical, set the max timestamp age.
		val := maxTimestampAge.Get(&dsp.st.SV)
		for i := range p.Processors {
			spec := p.Processors[i].Spec.Core.TableReader
			spec.MaxTimestampAgeNanos = uint64(val)
		}
	}

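	// Build one SketchSpec per requested statistic. Each sketch uses HLL++ to
	// estimate the number of distinct values. SketchSpec.Columns refers to
	// stream column indices rather than column IDs, so we also record the
	// reverse mapping from stream index to column ID in sampledColumnIDs for
	// the aggregator's use.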
	sketchSpecs := make([]execinfrapb.SketchSpec, len(reqStats))
	sampledColumnIDs := make([]sqlbase.ColumnID, len(scan.cols))
	for i, s := range reqStats {
		spec := execinfrapb.SketchSpec{
			SketchType:          execinfrapb.SketchType_HLL_PLUS_PLUS_V1,
			GenerateHistogram:   s.histogram,
			HistogramMaxBuckets: uint32(s.histogramMaxBuckets),
			Columns:             make([]uint32, len(s.columns)),
			StatName:            s.name,
		}
		for j, colID := range s.columns {
			colIdx, ok := scan.colIdxMap[colID]
			if !ok {
				panic("necessary column not scanned")
			}
			streamColIdx := p.PlanToStreamColMap[colIdx]
			spec.Columns[j] = uint32(streamColIdx)
			sampledColumnIDs[streamColIdx] = colID
		}

		sketchSpecs[i] = spec
	}

	// Set up the samplers. A sample reservoir is only needed if at least one
	// histogram was requested.
	sampler := &execinfrapb.SamplerSpec{Sketches: sketchSpecs}
	sampler.MaxFractionIdle = details.MaxFractionIdle
	for _, s := range reqStats {
		if s.histogram {
			sampler.SampleSize = histogramSamples
		}
	}

	// The sampler outputs the original columns plus a rank column and four sketch columns.
	outTypes := make([]*types.T, 0, len(p.ResultTypes)+5)
	outTypes = append(outTypes, p.ResultTypes...)
	// An INT column for the rank of each row.
	outTypes = append(outTypes, types.Int)
	// An INT column indicating the sketch index.
	outTypes = append(outTypes, types.Int)
	// An INT column indicating the number of rows processed.
	outTypes = append(outTypes, types.Int)
	// An INT column indicating the number of rows that have a NULL in any sketch
	// column.
	outTypes = append(outTypes, types.Int)
	// A BYTES column with the sketch data.
	outTypes = append(outTypes, types.Bytes)

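	// Attach a Sampler to each table reader's output stream on the same node;
	// no particular output ordering is required.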
	p.AddNoGroupingStage(
		execinfrapb.ProcessorCoreUnion{Sampler: sampler},
		execinfrapb.PostProcessSpec{},
		outTypes,
		execinfrapb.Ordering{},
	)

	// Estimate the expected number of rows based on existing stats in the cache.
	tableStats, err := planCtx.planner.execCfg.TableStatsCache.GetTableStats(planCtx.ctx, desc.ID)
	if err != nil {
		return nil, err
	}

	var rowsExpected uint64
	if len(tableStats) > 0 {
		overhead := stats.AutomaticStatisticsFractionStaleRows.Get(&dsp.st.SV)
		// Convert to a signed integer first to make the linter happy.
		rowsExpected = uint64(int64(
			// The total expected number of rows is the same number that was measured
			// most recently, plus some overhead for possible insertions.
			float64(tableStats[0].RowCount) * (1 + overhead),
		))
	}

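	// The job may not have an ID assigned yet; in that case pass zero to the
	// aggregator.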
	var jobID int64
	if job.ID() != nil {
		jobID = *job.ID()
	}

	// Set up the final SampleAggregator stage.
	agg := &execinfrapb.SampleAggregatorSpec{
		Sketches:         sketchSpecs,
		SampleSize:       sampler.SampleSize,
		SampledColumnIDs: sampledColumnIDs,
		TableID:          desc.ID,
		JobID:            jobID,
		RowsExpected:     rowsExpected,
	}
	// Plan the SampleAggregator on the gateway, unless we have a single Sampler.
	node := dsp.nodeDesc.NodeID
	if len(p.ResultRouters) == 1 {
		node = p.Processors[p.ResultRouters[0]].Node
	}
	p.AddSingleGroupStage(
		node,
		execinfrapb.ProcessorCoreUnion{SampleAggregator: agg},
		execinfrapb.PostProcessSpec{},
		[]*types.T{},
	)

	return p, nil
}

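// createPlanForCreateStats translates the job's CreateStatsDetails into a set
// of requestedStats (disabling histograms when the cluster setting turns
// histogram collection off) and delegates to createStatsPlan.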
func (dsp *DistSQLPlanner) createPlanForCreateStats(
	planCtx *PlanningCtx, job *jobs.Job,
) (*PhysicalPlan, error) {
	details := job.Details().(jobspb.CreateStatsDetails)
	reqStats := make([]requestedStat, len(details.ColumnStats))
	histogramCollectionEnabled := stats.HistogramClusterMode.Get(&dsp.st.SV)
	for i := 0; i < len(reqStats); i++ {
		histogram := details.ColumnStats[i].HasHistogram && histogramCollectionEnabled
		reqStats[i] = requestedStat{
			columns:             details.ColumnStats[i].ColumnIDs,
			histogram:           histogram,
			histogramMaxBuckets: histogramBuckets,
			name:                details.Name,
		}
	}

	tableDesc := sqlbase.NewImmutableTableDescriptor(details.Table)
	return dsp.createStatsPlan(planCtx, tableDesc, reqStats, job)
}

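// planAndRunCreateStats builds, finalizes, and runs the distributed plan for
// a CREATE STATISTICS job, returning any error reported by the result writer.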
func (dsp *DistSQLPlanner) planAndRunCreateStats(
	ctx context.Context,
	evalCtx *extendedEvalContext,
	planCtx *PlanningCtx,
	txn *kv.Txn,
	job *jobs.Job,
	resultRows *RowResultWriter,
) error {
	ctx = logtags.AddTag(ctx, "create-stats-distsql", nil)

	physPlan, err := dsp.createPlanForCreateStats(planCtx, job)
	if err != nil {
		return err
	}

	dsp.FinalizePlan(planCtx, physPlan)

	recv := MakeDistSQLReceiver(
		ctx,
		resultRows,
		tree.DDL,
		evalCtx.ExecCfg.RangeDescriptorCache,
		evalCtx.ExecCfg.LeaseHolderCache,
		txn,
		func(ts hlc.Timestamp) {
			evalCtx.ExecCfg.Clock.Update(ts)
		},
		evalCtx.Tracing,
	)
	defer recv.Release()

	dsp.Run(planCtx, txn, physPlan, recv, evalCtx, nil /* finishedSetupFn */)()
	return resultRows.Err()
}