github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/jobs/registry_external_test.go (about) 1 // Copyright 2017 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package jobs_test 12 13 import ( 14 "context" 15 "math" 16 "reflect" 17 "testing" 18 "time" 19 20 "github.com/cockroachdb/cockroach/pkg/base" 21 "github.com/cockroachdb/cockroach/pkg/jobs" 22 "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" 23 "github.com/cockroachdb/cockroach/pkg/roachpb" 24 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 25 "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" 26 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 27 "github.com/cockroachdb/cockroach/pkg/sql/sqlutil" 28 "github.com/cockroachdb/cockroach/pkg/testutils" 29 "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" 30 "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" 31 "github.com/cockroachdb/cockroach/pkg/util/hlc" 32 "github.com/cockroachdb/cockroach/pkg/util/leaktest" 33 "github.com/cockroachdb/cockroach/pkg/util/log" 34 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 35 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 36 "github.com/cockroachdb/cockroach/pkg/util/tracing" 37 "github.com/cockroachdb/errors" 38 ) 39 40 func TestRoundtripJob(t *testing.T) { 41 defer leaktest.AfterTest(t)() 42 43 ctx := context.Background() 44 s, _, _ := serverutils.StartServer(t, base.TestServerArgs{}) 45 registry := s.JobRegistry().(*jobs.Registry) 46 defer s.Stopper().Stop(ctx) 47 48 storedJob := registry.NewJob(jobs.Record{ 49 Description: "beep boop", 50 Username: "robot", 51 DescriptorIDs: sqlbase.IDs{42}, 52 Details: jobspb.RestoreDetails{}, 53 Progress: jobspb.RestoreProgress{}, 54 }) 55 if err := storedJob.Created(ctx); err != nil { 56 t.Fatal(err) 57 } 58 retrievedJob, err := registry.LoadJob(ctx, *storedJob.ID()) 59 if err != nil { 60 t.Fatal(err) 61 } 62 if e, a := storedJob, retrievedJob; !reflect.DeepEqual(e, a) { 63 //diff := strings.Join(pretty.Diff(e, a), "\n") 64 t.Fatalf("stored job did not match retrieved job:\n%+v\n%+v", e, a) 65 } 66 } 67 68 func TestRegistryResumeExpiredLease(t *testing.T) { 69 defer leaktest.AfterTest(t)() 70 defer jobs.ResetConstructors()() 71 72 ctx := context.Background() 73 s, _, _ := serverutils.StartServer(t, base.TestServerArgs{}) 74 defer s.Stopper().Stop(ctx) 75 76 // Disable leniency for instant expiration 77 jobs.LeniencySetting.Override(&s.ClusterSettings().SV, 0) 78 79 db := s.DB() 80 clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond) 81 nodeLiveness := jobs.NewFakeNodeLiveness(4) 82 newRegistry := func(id roachpb.NodeID) *jobs.Registry { 83 const cancelInterval = time.Duration(math.MaxInt64) 84 const adoptInterval = time.Nanosecond 85 86 var c base.NodeIDContainer 87 c.Set(ctx, id) 88 idContainer := base.NewSQLIDContainer(0, &c, true /* exposed */) 89 ac := log.AmbientContext{Tracer: tracing.NewTracer()} 90 r := jobs.MakeRegistry( 91 ac, s.Stopper(), clock, sqlbase.MakeOptionalNodeLiveness(nodeLiveness), db, s.InternalExecutor().(sqlutil.InternalExecutor), 92 idContainer, s.ClusterSettings(), base.DefaultHistogramWindowInterval(), jobs.FakePHS, "", 93 ) 94 if err := r.Start(ctx, s.Stopper(), cancelInterval, adoptInterval); err != nil { 95 t.Fatal(err) 96 } 97 return r 98 } 99 100 const jobCount = 3 101 102 drainAdoptionLoop := func() { 103 // Every turn of the registry's adoption loop will generate exactly one call 104 // to nodeLiveness.GetLivenesses. Only after we've witnessed one call for 105 // each job, plus one more call, can we be sure that all work has been 106 // completed. 107 // 108 // Waiting for only jobCount calls to nodeLiveness.GetLivenesses is racy, as 109 // we might perform our assertions just as the last turn of registry loop 110 // observes our injected liveness failure, if any. 111 for i := 0; i < jobCount+1; i++ { 112 <-nodeLiveness.GetLivenessesCalledCh 113 } 114 } 115 116 // jobMap maps node IDs to job IDs. 117 jobMap := make(map[roachpb.NodeID]int64) 118 hookCallCount := 0 119 // resumeCounts maps jobs IDs to number of start/resumes. 120 resumeCounts := make(map[int64]int) 121 // done prevents jobs from finishing. 122 done := make(chan struct{}) 123 // resumeCalled does a locked, blocking send when a job is started/resumed. A 124 // receive on it will block until a job is running. 125 resumeCalled := make(chan struct{}) 126 var lock syncutil.Mutex 127 jobs.RegisterConstructor(jobspb.TypeBackup, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer { 128 lock.Lock() 129 hookCallCount++ 130 lock.Unlock() 131 return jobs.FakeResumer{ 132 OnResume: func(ctx context.Context, _ chan<- tree.Datums) error { 133 select { 134 case <-ctx.Done(): 135 return ctx.Err() 136 case resumeCalled <- struct{}{}: 137 case <-done: 138 } 139 lock.Lock() 140 resumeCounts[*job.ID()]++ 141 lock.Unlock() 142 select { 143 case <-ctx.Done(): 144 return ctx.Err() 145 case <-done: 146 return nil 147 } 148 }, 149 } 150 }) 151 152 for i := 0; i < jobCount; i++ { 153 nodeid := roachpb.NodeID(i + 1) 154 rec := jobs.Record{ 155 Details: jobspb.BackupDetails{}, 156 Progress: jobspb.BackupProgress{}, 157 } 158 job, _, err := newRegistry(nodeid).CreateAndStartJob(ctx, nil, rec) 159 if err != nil { 160 t.Fatal(err) 161 } 162 // Wait until the job is running. 163 <-resumeCalled 164 lock.Lock() 165 jobMap[nodeid] = *job.ID() 166 lock.Unlock() 167 } 168 169 drainAdoptionLoop() 170 if e, a := jobCount, hookCallCount; e != a { 171 t.Fatalf("expected hookCallCount to be %d, but got %d", e, a) 172 } 173 174 drainAdoptionLoop() 175 if e, a := jobCount, hookCallCount; e != a { 176 t.Fatalf("expected hookCallCount to be %d, but got %d", e, a) 177 } 178 179 nodeLiveness.FakeSetExpiration(1, hlc.MinTimestamp) 180 drainAdoptionLoop() 181 <-resumeCalled 182 testutils.SucceedsSoon(t, func() error { 183 lock.Lock() 184 defer lock.Unlock() 185 if hookCallCount <= jobCount { 186 return errors.Errorf("expected hookCallCount to be > %d, but got %d", jobCount, hookCallCount) 187 } 188 return nil 189 }) 190 191 testutils.SucceedsSoon(t, func() error { 192 lock.Lock() 193 defer lock.Unlock() 194 if e, a := 2, resumeCounts[jobMap[1]]; e != a { 195 return errors.Errorf("expected resumeCount to be %d, but got %d", e, a) 196 } 197 return nil 198 }) 199 200 // We want to verify that simply incrementing the epoch does not 201 // result in the job being rescheduled. 202 nodeLiveness.FakeIncrementEpoch(3) 203 drainAdoptionLoop() 204 select { 205 case <-resumeCalled: 206 t.Fatal("Incrementing an epoch should not reschedule a job") 207 default: 208 } 209 210 // When we reset the liveness of the node, though, we should get 211 // a reschedule. 212 nodeLiveness.FakeSetExpiration(3, hlc.MinTimestamp) 213 drainAdoptionLoop() 214 <-resumeCalled 215 close(done) 216 217 testutils.SucceedsSoon(t, func() error { 218 lock.Lock() 219 defer lock.Unlock() 220 if e, a := 1, resumeCounts[jobMap[3]]; e > a { 221 return errors.Errorf("expected resumeCount to be > %d, but got %d", e, a) 222 } 223 if e, a := 1, resumeCounts[jobMap[2]]; e > a { 224 return errors.Errorf("expected resumeCount to be > %d, but got %d", e, a) 225 } 226 count := 0 227 for _, ct := range resumeCounts { 228 count += ct 229 } 230 231 if e, a := 4, count; e > a { 232 return errors.Errorf("expected total jobs to be > %d, but got %d", e, a) 233 } 234 return nil 235 }) 236 } 237 238 func TestRegistryResumeActiveLease(t *testing.T) { 239 defer leaktest.AfterTest(t)() 240 241 defer func(oldInterval time.Duration) { 242 jobs.DefaultAdoptInterval = oldInterval 243 }(jobs.DefaultAdoptInterval) 244 jobs.DefaultAdoptInterval = 100 * time.Millisecond 245 246 resumeCh := make(chan int64) 247 defer jobs.ResetConstructors()() 248 jobs.RegisterConstructor(jobspb.TypeBackup, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer { 249 return jobs.FakeResumer{ 250 OnResume: func(ctx context.Context, _ chan<- tree.Datums) error { 251 select { 252 case <-ctx.Done(): 253 return ctx.Err() 254 case resumeCh <- *job.ID(): 255 return nil 256 } 257 }, 258 } 259 }) 260 261 ctx := context.Background() 262 s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{}) 263 defer s.Stopper().Stop(ctx) 264 265 payload, err := protoutil.Marshal(&jobspb.Payload{ 266 Lease: &jobspb.Lease{NodeID: 1, Epoch: 1}, 267 Details: jobspb.WrapPayloadDetails(jobspb.BackupDetails{}), 268 }) 269 if err != nil { 270 t.Fatal(err) 271 } 272 273 progress, err := protoutil.Marshal(&jobspb.Progress{ 274 Details: jobspb.WrapProgressDetails(jobspb.BackupProgress{}), 275 }) 276 if err != nil { 277 t.Fatal(err) 278 } 279 280 var id int64 281 sqlutils.MakeSQLRunner(sqlDB).QueryRow(t, 282 `INSERT INTO system.jobs (status, payload, progress) VALUES ($1, $2, $3) RETURNING id`, 283 jobs.StatusRunning, payload, progress).Scan(&id) 284 285 if e, a := id, <-resumeCh; e != a { 286 t.Fatalf("expected job %d to be resumed, but got %d", e, a) 287 } 288 }