github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/jobs/registry_test.go

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package jobs

import (
	"context"
	"math"
	"strconv"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

func FakePHS(opName, user string) (interface{}, func()) {
	return nil, func() {}
}

func TestRegistryCancelation(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx, stopper := context.Background(), stop.NewStopper()
	defer stopper.Stop(ctx)

	// Not using the server.DefaultHistogramWindowInterval constant because
	// of a dep cycle.
	const histogramWindowInterval = 60 * time.Second

	const nodeCount = 1
	nodeLiveness := NewFakeNodeLiveness(nodeCount)

	var db *kv.DB
	// Insulate this test from wall time.
	mClock := hlc.NewManualClock(hlc.UnixNano())
	clock := hlc.NewClock(mClock.UnixNano, time.Nanosecond)
	registry := MakeRegistry(
		log.AmbientContext{},
		stopper,
		clock,
		sqlbase.MakeOptionalNodeLiveness(nodeLiveness),
		db,
		nil, /* ex */
		base.TestingIDContainer,
		cluster.NoSettings,
		histogramWindowInterval,
		FakePHS,
		"",
	)
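	// A nanosecond cancel interval keeps the registry's liveness poll loop
	// spinning, while an effectively infinite adopt interval keeps the
	// adoption loop out of the picture for this test.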
	const cancelInterval = time.Nanosecond
	const adoptInterval = time.Duration(math.MaxInt64)
	if err := registry.Start(ctx, stopper, cancelInterval, adoptInterval); err != nil {
		t.Fatal(err)
	}

	wait := func() {
		// Every turn of the registry's liveness poll loop will generate exactly one
		// call to nodeLiveness.Self. Only after we've witnessed two calls can we be
		// sure that the first turn of the registry's loop has completed.
		//
		// Waiting for only the first call to nodeLiveness.Self is racy, as we'd
		// perform our assertions concurrently with the registry loop's observation
		// of our injected liveness failure, if any.
		<-nodeLiveness.SelfCalledCh
		<-nodeLiveness.SelfCalledCh
	}

	cancelCount := 0
	didRegister := false
	jobID := int64(1)
	const nodeID = roachpb.NodeID(1)

	register := func() {
		didRegister = true
		jobID++
		if err := registry.register(jobID, func() { cancelCount++ }); err != nil {
			t.Fatal(err)
		}
	}
	unregister := func() {
		registry.unregister(jobID)
		didRegister = false
	}
	expectCancel := func(expect bool) {
		t.Helper()

		wait()
		var e int
		if expect {
			e = 1
		}
		if a := cancelCount; e != a {
			t.Errorf("expected cancelCount of %d, but got %d", e, a)
		}
	}
	check := func(fn func()) {
		fn()
		if didRegister {
			unregister()
			wait()
		}
		cancelCount = 0
	}
	// inWindow slews the node's liveness expiration so that it falls either
	// inside (in == true) or outside the leniency window.
	inWindow := func(in bool) {
		nanos := -defaultLeniencySetting.Nanoseconds()
		if in {
			nanos = nanos / 2
		} else {
			nanos = nanos * 2
		}
		nodeLiveness.FakeSetExpiration(nodeID, clock.Now().Add(nanos, 0))
	}

	// Jobs that complete while the node is live should be canceled once.
	check(func() {
		register()
		expectCancel(false)
		unregister()
		expectCancel(true)
	})

	// Jobs that are in-progress when the liveness epoch is incremented
	// should not be canceled.
	check(func() {
		register()
		nodeLiveness.FakeIncrementEpoch(nodeID)
		expectCancel(false)
		unregister()
		expectCancel(true)
	})

	// Jobs started in the new epoch that complete while the new epoch is live
	// should be canceled once.
	check(func() {
		register()
		expectCancel(false)
		unregister()
		expectCancel(true)
	})

	// Jobs **alive** within the leniency period should not be canceled.
	check(func() {
		register()
		inWindow(true)
		expectCancel(false)
		unregister()
		expectCancel(true)
	})

	// Jobs **started** within the leniency period should not be canceled.
	check(func() {
		inWindow(true)
		register()
		expectCancel(false)
	})

	// Jobs **alive** outside of the leniency period should be canceled.
	check(func() {
		register()
		inWindow(false)
		expectCancel(true)
	})

	// Jobs **started** outside of the leniency period should be canceled.
	check(func() {
		inWindow(false)
		register()
		expectCancel(true)
	})
}

func TestRegistryGC(t *testing.T) {
	defer leaktest.AfterTest(t)()
	t.Skip("")
	// TODO (lucy): This test probably shouldn't continue to exist in its current
	// form if GCMutations will cease to be used. Refactor or get rid of it.

	ctx := context.Background()
	s, sqlDB, kvDB := serverutils.StartServer(t, base.TestServerArgs{})
	defer s.Stopper().Stop(ctx)

	db := sqlutils.MakeSQLRunner(sqlDB)

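	// mutationOptions controls which markers are attached to the test table's
	// descriptor before each job is written below.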
	type mutationOptions struct {
		// Set if the desc should have any mutations of any sort.
		hasMutation bool
		// Set if the mutation being inserted is a GCMutation.
		hasGCMutation bool
		// Set if the desc should have a job that is dropping it.
		hasDropJob bool
	}

	ts := timeutil.Now()
	earlier := ts.Add(-1 * time.Hour)
	muchEarlier := ts.Add(-2 * time.Hour)

	setMutations := func(mutations []sqlbase.DescriptorMutation) sqlbase.ID {
		desc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "to_be_mutated")
		desc.Mutations = mutations
		if err := kvDB.Put(
			context.Background(),
			sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, desc.GetID()),
			sqlbase.WrapDescriptor(desc),
		); err != nil {
			t.Fatal(err)
		}
		return desc.GetID()
	}

	setGCMutations := func(gcMutations []sqlbase.TableDescriptor_GCDescriptorMutation) sqlbase.ID {
		desc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "to_be_mutated")
		desc.GCMutations = gcMutations
		if err := kvDB.Put(
			context.Background(),
			sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, desc.GetID()),
			sqlbase.WrapDescriptor(desc),
		); err != nil {
			t.Fatal(err)
		}
		return desc.GetID()
	}

	setDropJob := func(shouldDrop bool) sqlbase.ID {
		desc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "to_be_mutated")
		if shouldDrop {
			desc.DropJobID = 123
		} else {
			// Set it back to the default val.
			desc.DropJobID = 0
		}
		if err := kvDB.Put(
			context.Background(),
			sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, desc.GetID()),
			sqlbase.WrapDescriptor(desc),
		); err != nil {
			t.Fatal(err)
		}
		return desc.GetID()
	}

	writeJob := func(name string, created, finished time.Time, status Status, mutOptions mutationOptions) string {
		if _, err := sqlDB.Exec(`
CREATE DATABASE IF NOT EXISTS t; CREATE TABLE IF NOT EXISTS t.to_be_mutated AS SELECT 1`); err != nil {
			t.Fatal(err)
		}
		descriptorID := setDropJob(mutOptions.hasDropJob)
		if mutOptions.hasMutation {
			descriptorID = setMutations([]sqlbase.DescriptorMutation{{}})
		}
		if mutOptions.hasGCMutation {
			descriptorID = setGCMutations([]sqlbase.TableDescriptor_GCDescriptorMutation{{}})
		}

		payload, err := protoutil.Marshal(&jobspb.Payload{
			Description: name,
			Lease:       &jobspb.Lease{NodeID: 1, Epoch: 1},
			// Register a mutation on the table so that jobs that reference
			// the table are not considered orphaned.
			DescriptorIDs: []sqlbase.ID{
				descriptorID,
				sqlbase.InvalidID, // invalid id to test handling of missing descriptors.
			},
			Details:        jobspb.WrapPayloadDetails(jobspb.SchemaChangeDetails{}),
			StartedMicros:  timeutil.ToUnixMicros(created),
			FinishedMicros: timeutil.ToUnixMicros(finished),
		})
		if err != nil {
			t.Fatal(err)
		}
		progress, err := protoutil.Marshal(&jobspb.Progress{
			Details: jobspb.WrapProgressDetails(jobspb.SchemaChangeProgress{}),
		})
		if err != nil {
			t.Fatal(err)
		}

		var id int64
		db.QueryRow(t,
			`INSERT INTO system.jobs (status, payload, progress, created) VALUES ($1, $2, $3, $4) RETURNING id`,
			status, payload, progress, created).Scan(&id)
		return strconv.Itoa(int(id))
	}

	// Test the descriptor when any of the following are set:
	// 1. Mutations
	// 2. GC Mutations
	// 3. A drop job
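	//
	// The innermost loop body skips the one combination where all three flags
	// are false, so every generated job references a descriptor that still
	// carries at least one of the markers above.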
	for _, hasMutation := range []bool{true, false} {
		for _, hasGCMutation := range []bool{true, false} {
			for _, hasDropJob := range []bool{true, false} {
				if !hasMutation && !hasGCMutation && !hasDropJob {
					continue
				}
				mutOptions := mutationOptions{
					hasMutation:   hasMutation,
					hasGCMutation: hasGCMutation,
					hasDropJob:    hasDropJob,
				}
				oldRunningJob := writeJob("old_running", muchEarlier, time.Time{}, StatusRunning, mutOptions)
				oldSucceededJob := writeJob("old_succeeded", muchEarlier, muchEarlier.Add(time.Minute), StatusSucceeded, mutOptions)
				oldSucceededJob2 := writeJob("old_succeeded2", muchEarlier, muchEarlier.Add(time.Minute), StatusSucceeded, mutOptions)
				newRunningJob := writeJob("new_running", earlier, time.Time{}, StatusRunning, mutOptions)
				newSucceededJob := writeJob("new_succeeded", earlier, earlier.Add(time.Minute), StatusSucceeded, mutOptions)

				db.CheckQueryResults(t, `SELECT id FROM system.jobs ORDER BY id`, [][]string{
					{oldRunningJob}, {oldSucceededJob}, {oldSucceededJob2}, {newRunningJob}, {newSucceededJob}})

				if err := s.JobRegistry().(*Registry).cleanupOldJobs(ctx, earlier); err != nil {
					t.Fatal(err)
				}
				db.CheckQueryResults(t, `SELECT id FROM system.jobs ORDER BY id`, [][]string{
					{oldRunningJob}, {newRunningJob}, {newSucceededJob}})

				if err := s.JobRegistry().(*Registry).cleanupOldJobs(ctx, earlier); err != nil {
					t.Fatal(err)
				}
				db.CheckQueryResults(t, `SELECT id FROM system.jobs ORDER BY id`, [][]string{
					{oldRunningJob}, {newRunningJob}, {newSucceededJob}})

				if err := s.JobRegistry().(*Registry).cleanupOldJobs(ctx, ts.Add(time.Minute*-10)); err != nil {
					t.Fatal(err)
				}
				db.CheckQueryResults(t, `SELECT id FROM system.jobs ORDER BY id`, [][]string{
					{oldRunningJob}, {newRunningJob}})

				// force the running jobs to become orphaned
				_ = setMutations(nil)
				_ = setGCMutations(nil)
				_ = setDropJob(false)
				if err := s.JobRegistry().(*Registry).cleanupOldJobs(ctx, ts.Add(time.Minute*-10)); err != nil {
					t.Fatal(err)
				}
				db.CheckQueryResults(t, `SELECT id FROM system.jobs ORDER BY id`, [][]string{})
			}
		}
	}
}