github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/stats/create_stats_job_test.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package stats_test
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"testing"
    17  	"time"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/base"
    20  	"github.com/cockroachdb/cockroach/pkg/jobs"
    21  	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
    22  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
    23  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    24  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    25  	"github.com/cockroachdb/cockroach/pkg/sql/row"
    26  	"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
    27  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    28  	"github.com/cockroachdb/cockroach/pkg/testutils"
    29  	"github.com/cockroachdb/cockroach/pkg/testutils/jobutils"
    30  	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
    31  	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
    32  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    33  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    34  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    35  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    36  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    37  	"github.com/cockroachdb/errors"
    38  )
    39  
    40  // TestCreateStatsControlJob tests that PAUSE JOB, RESUME JOB, and CANCEL JOB
    41  // work as intended on create statistics jobs.
    42  func TestCreateStatsControlJob(t *testing.T) {
    43  	defer leaktest.AfterTest(t)()
    44  
    45  	defer func(oldInterval time.Duration) {
    46  		jobs.DefaultAdoptInterval = oldInterval
    47  	}(jobs.DefaultAdoptInterval)
    48  	jobs.DefaultAdoptInterval = 100 * time.Millisecond
    49  
    50  	// Test with 3 nodes and rowexec.SamplerProgressInterval=100 to ensure
    51  	// that progress metadata is sent correctly after every 100 input rows.
    52  	const nodes = 3
    53  	defer func(oldSamplerInterval int, oldSampleAgggregatorInterval time.Duration) {
    54  		rowexec.SamplerProgressInterval = oldSamplerInterval
    55  		rowexec.SampleAggregatorProgressInterval = oldSampleAgggregatorInterval
    56  	}(rowexec.SamplerProgressInterval, rowexec.SampleAggregatorProgressInterval)
    57  	rowexec.SamplerProgressInterval = 100
    58  	rowexec.SampleAggregatorProgressInterval = time.Millisecond
    59  
    60  	var allowRequest chan struct{}
    61  
    62  	var serverArgs base.TestServerArgs
    63  	params := base.TestClusterArgs{ServerArgs: serverArgs}
    64  	params.ServerArgs.Knobs.Store = &kvserver.StoreTestingKnobs{
    65  		TestingRequestFilter: createStatsRequestFilter(&allowRequest),
    66  	}
    67  
    68  	ctx := context.Background()
    69  	tc := testcluster.StartTestCluster(t, nodes, params)
    70  	defer tc.Stopper().Stop(ctx)
    71  	sqlDB := sqlutils.MakeSQLRunner(tc.Conns[0])
    72  	sqlDB.Exec(t, `CREATE DATABASE d`)
    73  	sqlDB.Exec(t, `CREATE TABLE d.t (x INT PRIMARY KEY)`)
    74  	sqlDB.Exec(t, `INSERT INTO d.t SELECT generate_series(1,1000)`)
    75  
    76  	t.Run("cancel", func(t *testing.T) {
    77  		// Test that CREATE STATISTICS can be canceled.
    78  		query := fmt.Sprintf(`CREATE STATISTICS s1 FROM d.t`)
    79  
    80  		if _, err := jobutils.RunJob(
    81  			t, sqlDB, &allowRequest, []string{"cancel"}, query,
    82  		); err == nil {
    83  			t.Fatal("expected an error")
    84  		}
    85  
    86  		// There should be no results here.
    87  		sqlDB.CheckQueryResults(t,
    88  			`SELECT statistics_name, column_names, row_count FROM [SHOW STATISTICS FOR TABLE d.t]`,
    89  			[][]string{})
    90  	})
    91  
    92  	t.Run("pause", func(t *testing.T) {
    93  		// Test that CREATE STATISTICS can be paused and resumed.
    94  		query := fmt.Sprintf(`CREATE STATISTICS s2 FROM d.t`)
    95  
    96  		jobID, err := jobutils.RunJob(
    97  			t, sqlDB, &allowRequest, []string{"PAUSE"}, query,
    98  		)
    99  		if !testutils.IsError(err, "pause") && !testutils.IsError(err, "liveness") {
   100  			t.Fatalf("unexpected: %v", err)
   101  		}
   102  
   103  		// There should be no results here.
   104  		sqlDB.CheckQueryResults(t,
   105  			`SELECT statistics_name, column_names, row_count FROM [SHOW STATISTICS FOR TABLE d.t]`,
   106  			[][]string{})
   107  		opts := retry.Options{
   108  			InitialBackoff: 1 * time.Millisecond,
   109  			MaxBackoff:     time.Second,
   110  			Multiplier:     2,
   111  		}
   112  		if err := retry.WithMaxAttempts(context.Background(), opts, 10, func() error {
   113  			_, err := sqlDB.DB.ExecContext(context.Background(), `RESUME JOB $1`, jobID)
   114  			return err
   115  		}); err != nil {
   116  			t.Fatal(err)
   117  		}
   118  
   119  		jobutils.WaitForJob(t, sqlDB, jobID)
   120  
   121  		// Now the job should have succeeded in producing stats.
   122  		sqlDB.CheckQueryResults(t,
   123  			`SELECT statistics_name, column_names, row_count FROM [SHOW STATISTICS FOR TABLE d.t]`,
   124  			[][]string{
   125  				{"s2", "{x}", "1000"},
   126  			})
   127  	})
   128  }
   129  
   130  // TestCreateStatsLivenessWithRestart tests that a node liveness transition
   131  // during CREATE STATISTICS correctly resumes after the node executing the job
   132  // becomes non-live (from the perspective of the jobs registry).
   133  func TestCreateStatsLivenessWithRestart(t *testing.T) {
   134  	defer leaktest.AfterTest(t)()
   135  
   136  	defer func(oldAdoptInterval, oldCancelInterval time.Duration) {
   137  		jobs.DefaultAdoptInterval = oldAdoptInterval
   138  		jobs.DefaultCancelInterval = oldCancelInterval
   139  	}(jobs.DefaultAdoptInterval, jobs.DefaultCancelInterval)
   140  	jobs.DefaultAdoptInterval = 100 * time.Millisecond
   141  	jobs.DefaultCancelInterval = 100 * time.Millisecond
   142  
   143  	const nodes = 1
   144  	nl := jobs.NewFakeNodeLiveness(nodes)
   145  	serverArgs := base.TestServerArgs{
   146  		Knobs: base.TestingKnobs{
   147  			RegistryLiveness: nl,
   148  		},
   149  	}
   150  
   151  	var allowRequest chan struct{}
   152  	params := base.TestClusterArgs{ServerArgs: serverArgs}
   153  	params.ServerArgs.Knobs.Store = &kvserver.StoreTestingKnobs{
   154  		TestingRequestFilter: createStatsRequestFilter(&allowRequest),
   155  	}
   156  
   157  	ctx := context.Background()
   158  	tc := testcluster.StartTestCluster(t, nodes, params)
   159  	defer tc.Stopper().Stop(ctx)
   160  	conn := tc.Conns[0]
   161  	sqlDB := sqlutils.MakeSQLRunner(conn)
   162  
   163  	sqlDB.Exec(t, `CREATE DATABASE liveness`)
   164  	sqlDB.Exec(t, `CREATE TABLE liveness.t (i INT8 PRIMARY KEY)`)
   165  	sqlDB.Exec(t, `INSERT INTO liveness.t SELECT generate_series(1,1000)`)
   166  
   167  	const query = `CREATE STATISTICS s1 FROM liveness.t`
   168  
   169  	// Start a CREATE STATISTICS run and wait until it's done one scan.
   170  	allowRequest = make(chan struct{})
   171  	errCh := make(chan error)
   172  	go func() {
   173  		_, err := conn.Exec(query)
   174  		errCh <- err
   175  	}()
   176  	select {
   177  	case allowRequest <- struct{}{}:
   178  	case err := <-errCh:
   179  		t.Fatal(err)
   180  	}
   181  
   182  	// Fetch the new job ID and lease since we know it's running now.
   183  	var jobID int64
   184  	originalLease := &jobspb.Progress{}
   185  	{
   186  		var expectedLeaseBytes []byte
   187  		sqlDB.QueryRow(
   188  			t, `SELECT id, progress FROM system.jobs ORDER BY created DESC LIMIT 1`,
   189  		).Scan(&jobID, &expectedLeaseBytes)
   190  		if err := protoutil.Unmarshal(expectedLeaseBytes, originalLease); err != nil {
   191  			t.Fatal(err)
   192  		}
   193  	}
   194  
   195  	// Make the node non-live and wait for cancellation.
   196  	nl.FakeSetExpiration(1, hlc.MinTimestamp)
   197  	// Wait for the registry cancel loop to run and cancel the job.
   198  	<-nl.SelfCalledCh
   199  	<-nl.SelfCalledCh
   200  	close(allowRequest)
   201  	err := <-errCh
   202  	if !testutils.IsError(err, "job .*: node liveness error") {
   203  		t.Fatalf("unexpected: %v", err)
   204  	}
   205  
   206  	// Ensure that complete progress has not been recorded.
   207  	partialProgress := jobutils.GetJobProgress(t, sqlDB, jobID)
   208  	if partialProgress.Progress != nil &&
   209  		partialProgress.Progress.(*jobspb.Progress_FractionCompleted).FractionCompleted == 1 {
   210  		t.Fatal("create stats should not have recorded progress")
   211  	}
   212  
   213  	// Make the node live again.
   214  	nl.FakeSetExpiration(1, hlc.MaxTimestamp)
   215  
   216  	// The registry should now adopt the job and resume it.
   217  	jobutils.WaitForJob(t, sqlDB, jobID)
   218  
   219  	// Verify that the job lease was updated.
   220  	rescheduledProgress := jobutils.GetJobProgress(t, sqlDB, jobID)
   221  	if rescheduledProgress.ModifiedMicros <= originalLease.ModifiedMicros {
   222  		t.Fatalf("expecting rescheduled job to have a later modification time: %d vs %d",
   223  			rescheduledProgress.ModifiedMicros, originalLease.ModifiedMicros)
   224  	}
   225  
   226  	// Verify that progress is now recorded.
   227  	if rescheduledProgress.Progress.(*jobspb.Progress_FractionCompleted).FractionCompleted != 1 {
   228  		t.Fatal("create stats should have recorded progress")
   229  	}
   230  
   231  	// Now the job should have succeeded in producing stats.
   232  	sqlDB.CheckQueryResults(t,
   233  		`SELECT statistics_name, column_names, row_count FROM [SHOW STATISTICS FOR TABLE liveness.t]`,
   234  		[][]string{
   235  			{"s1", "{i}", "1000"},
   236  		})
   237  }
   238  
   239  // TestCreateStatsLivenessWithLeniency tests that a temporary node liveness
   240  // transition during CREATE STATISTICS doesn't cancel the job, but allows the
   241  // owning node to continue processing.
   242  func TestCreateStatsLivenessWithLeniency(t *testing.T) {
   243  	defer leaktest.AfterTest(t)()
   244  
   245  	defer func(oldAdoptInterval, oldCancelInterval time.Duration) {
   246  		jobs.DefaultAdoptInterval = oldAdoptInterval
   247  		jobs.DefaultCancelInterval = oldCancelInterval
   248  	}(jobs.DefaultAdoptInterval, jobs.DefaultCancelInterval)
   249  	jobs.DefaultAdoptInterval = 100 * time.Millisecond
   250  	jobs.DefaultCancelInterval = 100 * time.Millisecond
   251  
   252  	const nodes = 1
   253  	nl := jobs.NewFakeNodeLiveness(nodes)
   254  	serverArgs := base.TestServerArgs{
   255  		Knobs: base.TestingKnobs{
   256  			RegistryLiveness: nl,
   257  		},
   258  	}
   259  
   260  	var allowRequest chan struct{}
   261  	params := base.TestClusterArgs{ServerArgs: serverArgs}
   262  	params.ServerArgs.Knobs.Store = &kvserver.StoreTestingKnobs{
   263  		TestingRequestFilter: createStatsRequestFilter(&allowRequest),
   264  	}
   265  
   266  	ctx := context.Background()
   267  	tc := testcluster.StartTestCluster(t, nodes, params)
   268  	defer tc.Stopper().Stop(ctx)
   269  	conn := tc.Conns[0]
   270  	sqlDB := sqlutils.MakeSQLRunner(conn)
   271  
   272  	// We want to know exactly how much leniency is configured.
   273  	sqlDB.Exec(t, `SET CLUSTER SETTING jobs.registry.leniency = '1m'`)
   274  	sqlDB.Exec(t, `CREATE DATABASE liveness`)
   275  	sqlDB.Exec(t, `CREATE TABLE liveness.t (i INT8 PRIMARY KEY)`)
   276  	sqlDB.Exec(t, `INSERT INTO liveness.t SELECT generate_series(1,1000)`)
   277  
   278  	const query = `CREATE STATISTICS s1 FROM liveness.t`
   279  
   280  	// Start a CREATE STATISTICS run and wait until it's done one scan.
   281  	allowRequest = make(chan struct{})
   282  	errCh := make(chan error)
   283  	go func() {
   284  		_, err := conn.Exec(query)
   285  		errCh <- err
   286  	}()
   287  	select {
   288  	case allowRequest <- struct{}{}:
   289  	case err := <-errCh:
   290  		t.Fatal(err)
   291  	}
   292  
   293  	// Fetch the new job ID and lease since we know it's running now.
   294  	var jobID int64
   295  	originalLease := &jobspb.Payload{}
   296  	{
   297  		var expectedLeaseBytes []byte
   298  		sqlDB.QueryRow(
   299  			t, `SELECT id, payload FROM system.jobs ORDER BY created DESC LIMIT 1`,
   300  		).Scan(&jobID, &expectedLeaseBytes)
   301  		if err := protoutil.Unmarshal(expectedLeaseBytes, originalLease); err != nil {
   302  			t.Fatal(err)
   303  		}
   304  	}
   305  
   306  	// Make the node slightly tardy.
   307  	nl.FakeSetExpiration(1, hlc.Timestamp{
   308  		WallTime: hlc.UnixNano() - (15 * time.Second).Nanoseconds(),
   309  	})
   310  
   311  	// Wait for the registry cancel loop to run and not cancel the job.
   312  	<-nl.SelfCalledCh
   313  	<-nl.SelfCalledCh
   314  	close(allowRequest)
   315  
   316  	// Set the node to be fully live again.  This prevents the registry
   317  	// from canceling all of the jobs if the test node is saturated
   318  	// and the create stats runs slowly.
   319  	nl.FakeSetExpiration(1, hlc.MaxTimestamp)
   320  
   321  	// Verify that the client didn't see anything amiss.
   322  	if err := <-errCh; err != nil {
   323  		t.Fatalf("create stats job should have completed: %s", err)
   324  	}
   325  
   326  	// The job should have completed normally.
   327  	jobutils.WaitForJob(t, sqlDB, jobID)
   328  }
   329  
   330  func TestAtMostOneRunningCreateStats(t *testing.T) {
   331  	defer leaktest.AfterTest(t)()
   332  
   333  	defer func(oldAdoptInterval time.Duration) {
   334  		jobs.DefaultAdoptInterval = oldAdoptInterval
   335  	}(jobs.DefaultAdoptInterval)
   336  	jobs.DefaultAdoptInterval = 100 * time.Millisecond
   337  
   338  	var allowRequest chan struct{}
   339  
   340  	var serverArgs base.TestServerArgs
   341  	params := base.TestClusterArgs{ServerArgs: serverArgs}
   342  	params.ServerArgs.Knobs.Store = &kvserver.StoreTestingKnobs{
   343  		TestingRequestFilter: createStatsRequestFilter(&allowRequest),
   344  	}
   345  
   346  	ctx := context.Background()
   347  	const nodes = 1
   348  	tc := testcluster.StartTestCluster(t, nodes, params)
   349  	defer tc.Stopper().Stop(ctx)
   350  	conn := tc.Conns[0]
   351  	sqlDB := sqlutils.MakeSQLRunner(conn)
   352  
   353  	sqlDB.Exec(t, `CREATE DATABASE d`)
   354  	sqlDB.Exec(t, `CREATE TABLE d.t (x INT PRIMARY KEY)`)
   355  	sqlDB.Exec(t, `INSERT INTO d.t SELECT generate_series(1,1000)`)
   356  
   357  	// Start a CREATE STATISTICS run and wait until it's done one scan.
   358  	allowRequest = make(chan struct{})
   359  	errCh := make(chan error)
   360  	go func() {
   361  		_, err := conn.Exec(`CREATE STATISTICS s1 FROM d.t`)
   362  		errCh <- err
   363  	}()
   364  	select {
   365  	case allowRequest <- struct{}{}:
   366  	case err := <-errCh:
   367  		t.Fatal(err)
   368  	}
   369  
   370  	autoStatsRunShouldFail := func() {
   371  		expectErrCh := make(chan error, 1)
   372  		go func() {
   373  			_, err := conn.Exec(`CREATE STATISTICS __auto__ FROM d.t`)
   374  			expectErrCh <- err
   375  		}()
   376  		select {
   377  		case err := <-expectErrCh:
   378  			expected := "another CREATE STATISTICS job is already running"
   379  			if !testutils.IsError(err, expected) {
   380  				t.Fatalf("expected '%s' error, but got %v", expected, err)
   381  			}
   382  		case <-time.After(time.Second):
   383  			panic("CREATE STATISTICS job which was expected to fail, timed out instead")
   384  		}
   385  	}
   386  
   387  	// Attempt to start an automatic stats run. It should fail.
   388  	autoStatsRunShouldFail()
   389  
   390  	// PAUSE JOB does not bloack until the job is paused but only requests it.
   391  	// Wait until the job is set to paused.
   392  	var jobID int64
   393  	sqlDB.QueryRow(t, `SELECT id FROM system.jobs ORDER BY created DESC LIMIT 1`).Scan(&jobID)
   394  	opts := retry.Options{
   395  		InitialBackoff: 1 * time.Millisecond,
   396  		MaxBackoff:     time.Second,
   397  		Multiplier:     2,
   398  	}
   399  	if err := retry.WithMaxAttempts(context.Background(), opts, 10, func() error {
   400  		_, err := sqlDB.DB.ExecContext(context.Background(), `PAUSE JOB $1`, jobID)
   401  		if err != nil {
   402  			t.Fatal(err)
   403  		}
   404  		var status string
   405  		sqlDB.QueryRow(t, `SELECT status FROM system.jobs WHERE id = $1 LIMIT 1`, jobID).Scan(&status)
   406  		if status != "paused" {
   407  			return errors.New("could not pause job")
   408  		}
   409  		return err
   410  	}); err != nil {
   411  		t.Fatal(err)
   412  	}
   413  
   414  	// Starting another automatic stats run should still fail.
   415  	autoStatsRunShouldFail()
   416  
   417  	// Attempt to start a regular stats run. It should succeed.
   418  	errCh2 := make(chan error)
   419  	go func() {
   420  		_, err := conn.Exec(`CREATE STATISTICS s2 FROM d.t`)
   421  		errCh2 <- err
   422  	}()
   423  	select {
   424  	case allowRequest <- struct{}{}:
   425  	case err := <-errCh:
   426  		t.Fatal(err)
   427  	case err := <-errCh2:
   428  		t.Fatal(err)
   429  	}
   430  	close(allowRequest)
   431  
   432  	// Verify that the second job completed successfully.
   433  	if err := <-errCh2; err != nil {
   434  		t.Fatalf("create stats job should have completed: %s", err)
   435  	}
   436  
   437  	// Verify that the first job completed successfully.
   438  	sqlDB.Exec(t, fmt.Sprintf("RESUME JOB %d", jobID))
   439  	jobutils.WaitForJob(t, sqlDB, jobID)
   440  	<-errCh
   441  }
   442  
   443  func TestDeleteFailedJob(t *testing.T) {
   444  	defer leaktest.AfterTest(t)()
   445  
   446  	defer func(oldAdoptInterval time.Duration) {
   447  		jobs.DefaultAdoptInterval = oldAdoptInterval
   448  	}(jobs.DefaultAdoptInterval)
   449  	jobs.DefaultAdoptInterval = 100 * time.Millisecond
   450  
   451  	ctx := context.Background()
   452  	tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{})
   453  	defer tc.Stopper().Stop(ctx)
   454  	conn := tc.Conns[0]
   455  	sqlDB := sqlutils.MakeSQLRunner(conn)
   456  
   457  	sqlDB.Exec(t, `SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false`)
   458  	sqlDB.Exec(t, `CREATE DATABASE d`)
   459  	sqlDB.Exec(t, `CREATE TABLE d.t (x INT PRIMARY KEY)`)
   460  	sqlDB.Exec(t, `CREATE TABLE d.u (x INT PRIMARY KEY)`)
   461  	sqlDB.Exec(t, `INSERT INTO d.t SELECT generate_series(1,1000)`)
   462  	sqlDB.Exec(t, `INSERT INTO d.u SELECT generate_series(1,1000)`)
   463  
   464  	// Start two CREATE STATISTICS runs at once.
   465  	errCh1 := make(chan error)
   466  	go func() {
   467  		_, err := conn.Exec(`CREATE STATISTICS __auto__ FROM d.t`)
   468  		errCh1 <- err
   469  	}()
   470  	errCh2 := make(chan error)
   471  	go func() {
   472  		_, err := conn.Exec(`CREATE STATISTICS __auto__ FROM d.u`)
   473  		errCh2 <- err
   474  	}()
   475  
   476  	err1 := <-errCh1
   477  	err2 := <-errCh2
   478  
   479  	// At least one of the jobs should have succeeded.
   480  	if err1 != nil && err2 != nil {
   481  		t.Fatalf("one job should have succeeded but both failed. err1:%v, err2:%v", err1, err2)
   482  	}
   483  
   484  	// Check that if one of the jobs failed, it was deleted and doesn't show up in
   485  	// SHOW AUTOMATIC JOBS.
   486  	// Note: if this test fails, it will likely show up by using stressrace.
   487  	if res := sqlDB.QueryStr(t,
   488  		`SELECT statement, status, error FROM [SHOW AUTOMATIC JOBS] WHERE status = $1`,
   489  		jobs.StatusFailed,
   490  	); len(res) != 0 {
   491  		t.Fatalf("job should have been deleted but found: %v", res)
   492  	}
   493  }
   494  
   495  // TestCreateStatsProgress tests that progress reporting works correctly
   496  // for the CREATE STATISTICS job.
   497  func TestCreateStatsProgress(t *testing.T) {
   498  	defer leaktest.AfterTest(t)()
   499  
   500  	defer func(oldProgressInterval time.Duration) {
   501  		rowexec.SampleAggregatorProgressInterval = oldProgressInterval
   502  	}(rowexec.SampleAggregatorProgressInterval)
   503  	rowexec.SampleAggregatorProgressInterval = time.Nanosecond
   504  
   505  	defer func(oldProgressInterval int) {
   506  		rowexec.SamplerProgressInterval = oldProgressInterval
   507  	}(rowexec.SamplerProgressInterval)
   508  	rowexec.SamplerProgressInterval = 10
   509  
   510  	resetKVBatchSize := row.TestingSetKVBatchSize(10)
   511  	defer resetKVBatchSize()
   512  
   513  	var allowRequest chan struct{}
   514  	var serverArgs base.TestServerArgs
   515  	params := base.TestClusterArgs{ServerArgs: serverArgs}
   516  	params.ServerArgs.Knobs.Store = &kvserver.StoreTestingKnobs{
   517  		TestingRequestFilter: createStatsRequestFilter(&allowRequest),
   518  	}
   519  
   520  	ctx := context.Background()
   521  	const nodes = 1
   522  	tc := testcluster.StartTestCluster(t, nodes, params)
   523  	defer tc.Stopper().Stop(ctx)
   524  	conn := tc.Conns[0]
   525  	sqlDB := sqlutils.MakeSQLRunner(conn)
   526  
   527  	sqlDB.Exec(t, `SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false`)
   528  	sqlDB.Exec(t, `CREATE DATABASE d`)
   529  	sqlDB.Exec(t, `CREATE TABLE d.t (i INT8 PRIMARY KEY)`)
   530  	sqlDB.Exec(t, `INSERT INTO d.t SELECT generate_series(1,1000)`)
   531  
   532  	getFractionCompleted := func(jobID int64) float32 {
   533  		var progress *jobspb.Progress
   534  		testutils.SucceedsSoon(t, func() error {
   535  			progress = jobutils.GetJobProgress(t, sqlDB, jobID)
   536  			if progress.Progress == nil {
   537  				return errors.Errorf("progress is nil. jobID: %d", jobID)
   538  			}
   539  			return nil
   540  		})
   541  		return progress.Progress.(*jobspb.Progress_FractionCompleted).FractionCompleted
   542  	}
   543  
   544  	const query = `CREATE STATISTICS s1 FROM d.t`
   545  
   546  	// Start a CREATE STATISTICS run and wait until it has scanned part of the
   547  	// table.
   548  	allowRequest = make(chan struct{})
   549  	errCh := make(chan error)
   550  	go func() {
   551  		_, err := conn.Exec(query)
   552  		errCh <- err
   553  	}()
   554  	// Ten iterations here allows us to read some of the rows but not all.
   555  	for i := 0; i < 10; i++ {
   556  		select {
   557  		case allowRequest <- struct{}{}:
   558  		case err := <-errCh:
   559  			t.Fatal(err)
   560  		}
   561  	}
   562  
   563  	// Fetch the new job ID since we know it's running now.
   564  	jobID := jobutils.GetLastJobID(t, sqlDB)
   565  
   566  	// Ensure that 0 progress has been recorded since there are no existing
   567  	// stats available to estimate progress.
   568  	fractionCompleted := getFractionCompleted(jobID)
   569  	if fractionCompleted != 0 {
   570  		t.Fatalf(
   571  			"create stats should not have recorded progress, but progress is %f",
   572  			fractionCompleted,
   573  		)
   574  	}
   575  
   576  	// Allow the job to complete and verify that the client didn't see anything
   577  	// amiss.
   578  	close(allowRequest)
   579  	if err := <-errCh; err != nil {
   580  		t.Fatalf("create stats job should have completed: %s", err)
   581  	}
   582  
   583  	// Verify that full progress is now recorded.
   584  	fractionCompleted = getFractionCompleted(jobID)
   585  	if fractionCompleted != 1 {
   586  		t.Fatalf(
   587  			"create stats should have recorded full progress, but progress is %f",
   588  			fractionCompleted,
   589  		)
   590  	}
   591  
   592  	// Start another CREATE STATISTICS run and wait until it has scanned part of
   593  	// the table.
   594  	allowRequest = make(chan struct{})
   595  	go func() {
   596  		_, err := conn.Exec(query)
   597  		errCh <- err
   598  	}()
   599  	// Ten iterations here allows us to read some of the rows but not all.
   600  	for i := 0; i < 10; i++ {
   601  		select {
   602  		case allowRequest <- struct{}{}:
   603  		case err := <-errCh:
   604  			t.Fatal(err)
   605  		}
   606  	}
   607  
   608  	// Fetch the new job ID since we know it's running now.
   609  	jobID = jobutils.GetLastJobID(t, sqlDB)
   610  
   611  	// Ensure that partial progress has been recorded since there are existing
   612  	// stats available.
   613  	fractionCompleted = getFractionCompleted(jobID)
   614  	if fractionCompleted <= 0 || fractionCompleted > 0.99 {
   615  		t.Fatalf(
   616  			"create stats should have recorded partial progress, but progress is %f",
   617  			fractionCompleted,
   618  		)
   619  	}
   620  
   621  	// Allow the job to complete and verify that the client didn't see anything
   622  	// amiss.
   623  	close(allowRequest)
   624  	if err := <-errCh; err != nil {
   625  		t.Fatalf("create stats job should have completed: %s", err)
   626  	}
   627  
   628  	// Verify that full progress is now recorded.
   629  	fractionCompleted = getFractionCompleted(jobID)
   630  	if fractionCompleted != 1 {
   631  		t.Fatalf(
   632  			"create stats should have recorded full progress, but progress is %f",
   633  			fractionCompleted,
   634  		)
   635  	}
   636  }
   637  
   638  func TestCreateStatsAsOfTime(t *testing.T) {
   639  	defer leaktest.AfterTest(t)()
   640  
   641  	ctx := context.Background()
   642  	tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{})
   643  	defer tc.Stopper().Stop(ctx)
   644  	sqlDB := sqlutils.MakeSQLRunner(tc.Conns[0])
   645  	sqlDB.Exec(t, `CREATE DATABASE d`)
   646  	sqlDB.Exec(t, `CREATE TABLE d.t (x INT PRIMARY KEY)`)
   647  
   648  	var ts1 []uint8
   649  	sqlDB.QueryRow(t, `
   650  			INSERT INTO d.t VALUES (1)
   651  			RETURNING cluster_logical_timestamp();
   652  		`).Scan(&ts1)
   653  
   654  	sqlDB.Exec(t, `INSERT INTO d.t VALUES (2)`)
   655  
   656  	sqlDB.Exec(t, fmt.Sprintf("CREATE STATISTICS s FROM d.t AS OF SYSTEM TIME %s", string(ts1)))
   657  
   658  	// Check that we only see the first row, not the second.
   659  	sqlDB.CheckQueryResults(t,
   660  		`SELECT statistics_name, column_names, row_count FROM [SHOW STATISTICS FOR TABLE d.t]`,
   661  		[][]string{
   662  			{"s", "{x}", "1"},
   663  		})
   664  }
   665  
   666  // Create a blocking request filter for the actions related
   667  // to CREATE STATISTICS, i.e. Scanning a user table. See discussion
   668  // on jobutils.RunJob for where this might be useful.
   669  func createStatsRequestFilter(allowProgressIota *chan struct{}) kvserverbase.ReplicaRequestFilter {
   670  	return func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error {
   671  		if req, ok := ba.GetArg(roachpb.Scan); ok {
   672  			_, tableID, _ := encoding.DecodeUvarintAscending(req.(*roachpb.ScanRequest).Key)
   673  			// Ensure that the tableID is within the expected range for a table,
   674  			// but is not a system table.
   675  			if tableID > 0 && tableID < 100 && !sqlbase.IsReservedID(sqlbase.ID(tableID)) {
   676  				// Read from the channel twice to allow jobutils.RunJob to complete
   677  				// even though there is only one ScanRequest.
   678  				<-*allowProgressIota
   679  				<-*allowProgressIota
   680  			}
   681  		}
   682  		return nil
   683  	}
   684  }