github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/jobs/registry_external_test.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package jobs_test
    12  
    13  import (
    14  	"context"
    15  	"math"
    16  	"reflect"
    17  	"testing"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/base"
    21  	"github.com/cockroachdb/cockroach/pkg/jobs"
    22  	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
    23  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    24  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    25  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    26  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    27  	"github.com/cockroachdb/cockroach/pkg/sql/sqlutil"
    28  	"github.com/cockroachdb/cockroach/pkg/testutils"
    29  	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
    30  	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
    31  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    32  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    33  	"github.com/cockroachdb/cockroach/pkg/util/log"
    34  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    35  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    36  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    37  	"github.com/cockroachdb/errors"
    38  )
    39  
    40  func TestRoundtripJob(t *testing.T) {
    41  	defer leaktest.AfterTest(t)()
    42  
    43  	ctx := context.Background()
    44  	s, _, _ := serverutils.StartServer(t, base.TestServerArgs{})
    45  	registry := s.JobRegistry().(*jobs.Registry)
    46  	defer s.Stopper().Stop(ctx)
    47  
    48  	storedJob := registry.NewJob(jobs.Record{
    49  		Description:   "beep boop",
    50  		Username:      "robot",
    51  		DescriptorIDs: sqlbase.IDs{42},
    52  		Details:       jobspb.RestoreDetails{},
    53  		Progress:      jobspb.RestoreProgress{},
    54  	})
    55  	if err := storedJob.Created(ctx); err != nil {
    56  		t.Fatal(err)
    57  	}
    58  	retrievedJob, err := registry.LoadJob(ctx, *storedJob.ID())
    59  	if err != nil {
    60  		t.Fatal(err)
    61  	}
    62  	if e, a := storedJob, retrievedJob; !reflect.DeepEqual(e, a) {
    63  		//diff := strings.Join(pretty.Diff(e, a), "\n")
    64  		t.Fatalf("stored job did not match retrieved job:\n%+v\n%+v", e, a)
    65  	}
    66  }
    67  
    68  func TestRegistryResumeExpiredLease(t *testing.T) {
    69  	defer leaktest.AfterTest(t)()
    70  	defer jobs.ResetConstructors()()
    71  
    72  	ctx := context.Background()
    73  	s, _, _ := serverutils.StartServer(t, base.TestServerArgs{})
    74  	defer s.Stopper().Stop(ctx)
    75  
    76  	// Disable leniency for instant expiration
    77  	jobs.LeniencySetting.Override(&s.ClusterSettings().SV, 0)
    78  
    79  	db := s.DB()
    80  	clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond)
    81  	nodeLiveness := jobs.NewFakeNodeLiveness(4)
    82  	newRegistry := func(id roachpb.NodeID) *jobs.Registry {
    83  		const cancelInterval = time.Duration(math.MaxInt64)
    84  		const adoptInterval = time.Nanosecond
    85  
    86  		var c base.NodeIDContainer
    87  		c.Set(ctx, id)
    88  		idContainer := base.NewSQLIDContainer(0, &c, true /* exposed */)
    89  		ac := log.AmbientContext{Tracer: tracing.NewTracer()}
    90  		r := jobs.MakeRegistry(
    91  			ac, s.Stopper(), clock, sqlbase.MakeOptionalNodeLiveness(nodeLiveness), db, s.InternalExecutor().(sqlutil.InternalExecutor),
    92  			idContainer, s.ClusterSettings(), base.DefaultHistogramWindowInterval(), jobs.FakePHS, "",
    93  		)
    94  		if err := r.Start(ctx, s.Stopper(), cancelInterval, adoptInterval); err != nil {
    95  			t.Fatal(err)
    96  		}
    97  		return r
    98  	}
    99  
   100  	const jobCount = 3
   101  
   102  	drainAdoptionLoop := func() {
   103  		// Every turn of the registry's adoption loop will generate exactly one call
   104  		// to nodeLiveness.GetLivenesses. Only after we've witnessed one call for
   105  		// each job, plus one more call, can we be sure that all work has been
   106  		// completed.
   107  		//
   108  		// Waiting for only jobCount calls to nodeLiveness.GetLivenesses is racy, as
   109  		// we might perform our assertions just as the last turn of registry loop
   110  		// observes our injected liveness failure, if any.
   111  		for i := 0; i < jobCount+1; i++ {
   112  			<-nodeLiveness.GetLivenessesCalledCh
   113  		}
   114  	}
   115  
   116  	// jobMap maps node IDs to job IDs.
   117  	jobMap := make(map[roachpb.NodeID]int64)
   118  	hookCallCount := 0
   119  	// resumeCounts maps jobs IDs to number of start/resumes.
   120  	resumeCounts := make(map[int64]int)
   121  	// done prevents jobs from finishing.
   122  	done := make(chan struct{})
   123  	// resumeCalled does a locked, blocking send when a job is started/resumed. A
   124  	// receive on it will block until a job is running.
   125  	resumeCalled := make(chan struct{})
   126  	var lock syncutil.Mutex
   127  	jobs.RegisterConstructor(jobspb.TypeBackup, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer {
   128  		lock.Lock()
   129  		hookCallCount++
   130  		lock.Unlock()
   131  		return jobs.FakeResumer{
   132  			OnResume: func(ctx context.Context, _ chan<- tree.Datums) error {
   133  				select {
   134  				case <-ctx.Done():
   135  					return ctx.Err()
   136  				case resumeCalled <- struct{}{}:
   137  				case <-done:
   138  				}
   139  				lock.Lock()
   140  				resumeCounts[*job.ID()]++
   141  				lock.Unlock()
   142  				select {
   143  				case <-ctx.Done():
   144  					return ctx.Err()
   145  				case <-done:
   146  					return nil
   147  				}
   148  			},
   149  		}
   150  	})
   151  
   152  	for i := 0; i < jobCount; i++ {
   153  		nodeid := roachpb.NodeID(i + 1)
   154  		rec := jobs.Record{
   155  			Details:  jobspb.BackupDetails{},
   156  			Progress: jobspb.BackupProgress{},
   157  		}
   158  		job, _, err := newRegistry(nodeid).CreateAndStartJob(ctx, nil, rec)
   159  		if err != nil {
   160  			t.Fatal(err)
   161  		}
   162  		// Wait until the job is running.
   163  		<-resumeCalled
   164  		lock.Lock()
   165  		jobMap[nodeid] = *job.ID()
   166  		lock.Unlock()
   167  	}
   168  
   169  	drainAdoptionLoop()
   170  	if e, a := jobCount, hookCallCount; e != a {
   171  		t.Fatalf("expected hookCallCount to be %d, but got %d", e, a)
   172  	}
   173  
   174  	drainAdoptionLoop()
   175  	if e, a := jobCount, hookCallCount; e != a {
   176  		t.Fatalf("expected hookCallCount to be %d, but got %d", e, a)
   177  	}
   178  
   179  	nodeLiveness.FakeSetExpiration(1, hlc.MinTimestamp)
   180  	drainAdoptionLoop()
   181  	<-resumeCalled
   182  	testutils.SucceedsSoon(t, func() error {
   183  		lock.Lock()
   184  		defer lock.Unlock()
   185  		if hookCallCount <= jobCount {
   186  			return errors.Errorf("expected hookCallCount to be > %d, but got %d", jobCount, hookCallCount)
   187  		}
   188  		return nil
   189  	})
   190  
   191  	testutils.SucceedsSoon(t, func() error {
   192  		lock.Lock()
   193  		defer lock.Unlock()
   194  		if e, a := 2, resumeCounts[jobMap[1]]; e != a {
   195  			return errors.Errorf("expected resumeCount to be %d, but got %d", e, a)
   196  		}
   197  		return nil
   198  	})
   199  
   200  	// We want to verify that simply incrementing the epoch does not
   201  	// result in the job being rescheduled.
   202  	nodeLiveness.FakeIncrementEpoch(3)
   203  	drainAdoptionLoop()
   204  	select {
   205  	case <-resumeCalled:
   206  		t.Fatal("Incrementing an epoch should not reschedule a job")
   207  	default:
   208  	}
   209  
   210  	// When we reset the liveness of the node, though, we should get
   211  	// a reschedule.
   212  	nodeLiveness.FakeSetExpiration(3, hlc.MinTimestamp)
   213  	drainAdoptionLoop()
   214  	<-resumeCalled
   215  	close(done)
   216  
   217  	testutils.SucceedsSoon(t, func() error {
   218  		lock.Lock()
   219  		defer lock.Unlock()
   220  		if e, a := 1, resumeCounts[jobMap[3]]; e > a {
   221  			return errors.Errorf("expected resumeCount to be > %d, but got %d", e, a)
   222  		}
   223  		if e, a := 1, resumeCounts[jobMap[2]]; e > a {
   224  			return errors.Errorf("expected resumeCount to be > %d, but got %d", e, a)
   225  		}
   226  		count := 0
   227  		for _, ct := range resumeCounts {
   228  			count += ct
   229  		}
   230  
   231  		if e, a := 4, count; e > a {
   232  			return errors.Errorf("expected total jobs to be > %d, but got %d", e, a)
   233  		}
   234  		return nil
   235  	})
   236  }
   237  
   238  func TestRegistryResumeActiveLease(t *testing.T) {
   239  	defer leaktest.AfterTest(t)()
   240  
   241  	defer func(oldInterval time.Duration) {
   242  		jobs.DefaultAdoptInterval = oldInterval
   243  	}(jobs.DefaultAdoptInterval)
   244  	jobs.DefaultAdoptInterval = 100 * time.Millisecond
   245  
   246  	resumeCh := make(chan int64)
   247  	defer jobs.ResetConstructors()()
   248  	jobs.RegisterConstructor(jobspb.TypeBackup, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer {
   249  		return jobs.FakeResumer{
   250  			OnResume: func(ctx context.Context, _ chan<- tree.Datums) error {
   251  				select {
   252  				case <-ctx.Done():
   253  					return ctx.Err()
   254  				case resumeCh <- *job.ID():
   255  					return nil
   256  				}
   257  			},
   258  		}
   259  	})
   260  
   261  	ctx := context.Background()
   262  	s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{})
   263  	defer s.Stopper().Stop(ctx)
   264  
   265  	payload, err := protoutil.Marshal(&jobspb.Payload{
   266  		Lease:   &jobspb.Lease{NodeID: 1, Epoch: 1},
   267  		Details: jobspb.WrapPayloadDetails(jobspb.BackupDetails{}),
   268  	})
   269  	if err != nil {
   270  		t.Fatal(err)
   271  	}
   272  
   273  	progress, err := protoutil.Marshal(&jobspb.Progress{
   274  		Details: jobspb.WrapProgressDetails(jobspb.BackupProgress{}),
   275  	})
   276  	if err != nil {
   277  		t.Fatal(err)
   278  	}
   279  
   280  	var id int64
   281  	sqlutils.MakeSQLRunner(sqlDB).QueryRow(t,
   282  		`INSERT INTO system.jobs (status, payload, progress) VALUES ($1, $2, $3) RETURNING id`,
   283  		jobs.StatusRunning, payload, progress).Scan(&id)
   284  
   285  	if e, a := id, <-resumeCh; e != a {
   286  		t.Fatalf("expected job %d to be resumed, but got %d", e, a)
   287  	}
   288  }