github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/jobs/registry_test.go

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package jobs

import (
	"context"
	"math"
	"strconv"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

// FakePHS is a no-op placeholder for the planner-constructor ("plan hook
// state") argument that MakeRegistry expects; these tests never invoke it.
func FakePHS(opName, user string) (interface{}, func()) {
	return nil, func() {}
}

func TestRegistryCancelation(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx, stopper := context.Background(), stop.NewStopper()
	defer stopper.Stop(ctx)

	// Not using the server.DefaultHistogramWindowInterval constant because
	// of a dep cycle.
	const histogramWindowInterval = 60 * time.Second

	const nodeCount = 1
	nodeLiveness := NewFakeNodeLiveness(nodeCount)

	var db *kv.DB
	// Insulate this test from wall time.
	mClock := hlc.NewManualClock(hlc.UnixNano())
	clock := hlc.NewClock(mClock.UnixNano, time.Nanosecond)
	registry := MakeRegistry(
		log.AmbientContext{},
		stopper,
		clock,
		sqlbase.MakeOptionalNodeLiveness(nodeLiveness),
		db,
		nil, /* ex */
		base.TestingIDContainer,
		cluster.NoSettings,
		histogramWindowInterval,
		FakePHS,
		"",
	)

	const cancelInterval = time.Nanosecond
	const adoptInterval = time.Duration(math.MaxInt64)
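	// A one-nanosecond cancel interval keeps the registry's liveness poll
	// loop spinning continuously, while an effectively infinite adopt
	// interval keeps job adoption entirely out of this test.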
	if err := registry.Start(ctx, stopper, cancelInterval, adoptInterval); err != nil {
		t.Fatal(err)
	}

	wait := func() {
		// Every turn of the registry's liveness poll loop will generate exactly one
		// call to nodeLiveness.Self. Only after we've witnessed two calls can we be
		// sure that the first turn of the registry's loop has completed.
		//
		// Waiting for only the first call to nodeLiveness.Self is racy, as we'd
		// perform our assertions concurrently with the registry loop's observation
		// of our injected liveness failure, if any.
		<-nodeLiveness.SelfCalledCh
		<-nodeLiveness.SelfCalledCh
	}

	cancelCount := 0
	didRegister := false
	jobID := int64(1)
	const nodeID = roachpb.NodeID(1)

	register := func() {
		didRegister = true
		jobID++
		if err := registry.register(jobID, func() { cancelCount++ }); err != nil {
			t.Fatal(err)
		}
	}
	unregister := func() {
		registry.unregister(jobID)
		didRegister = false
	}
	expectCancel := func(expect bool) {
		t.Helper()

		wait()
		var e int
		if expect {
			e = 1
		}
		if a := cancelCount; e != a {
			t.Errorf("expected cancelCount of %d, but got %d", e, a)
		}
	}
	check := func(fn func()) {
		fn()
		if didRegister {
			unregister()
			wait()
		}
		cancelCount = 0
	}
	// inWindow moves the node's liveness expiration inside or outside the
	// leniency window.
	inWindow := func(in bool) {
		nanos := -defaultLeniencySetting.Nanoseconds()
		if in {
			nanos = nanos / 2
		} else {
			nanos = nanos * 2
		}
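		// For example, with a hypothetical leniency of 60s, in=true puts the
		// expiration 30s in the past (still within the leniency window),
		// while in=false puts it 120s in the past (well outside it).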
		nodeLiveness.FakeSetExpiration(nodeID, clock.Now().Add(nanos, 0))
	}

	// Jobs that complete while the node is live should be canceled once.
	check(func() {
		register()
		expectCancel(false)
		unregister()
		expectCancel(true)
	})

	// Jobs that are in-progress when the liveness epoch is incremented
	// should not be canceled.
	check(func() {
		register()
		nodeLiveness.FakeIncrementEpoch(nodeID)
		expectCancel(false)
		unregister()
		expectCancel(true)
	})

	// Jobs started in the new epoch that complete while the new epoch is live
	// should be canceled once.
	check(func() {
		register()
		expectCancel(false)
		unregister()
		expectCancel(true)
	})

	// Jobs **alive** within the leniency period should not be canceled.
	check(func() {
		register()
		inWindow(true)
		expectCancel(false)
		unregister()
		expectCancel(true)
	})

	// Jobs **started** within the leniency period should not be canceled.
	check(func() {
		inWindow(true)
		register()
		expectCancel(false)
	})

	// Jobs **alive** outside of the leniency period should be canceled.
	check(func() {
		register()
		inWindow(false)
		expectCancel(true)
	})

	// Jobs **started** outside of the leniency period should be canceled.
	check(func() {
		inWindow(false)
		register()
		expectCancel(true)
	})
}

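// exampleManualClock is a hypothetical helper, a minimal sketch of the
// manual-clock pattern TestRegistryCancelation uses above: the HLC is pinned
// to a fixed wall time, so the test is insulated from real-time flakiness and
// time advances only when incremented explicitly.
func exampleManualClock() *hlc.Clock {
	mClock := hlc.NewManualClock(hlc.UnixNano())
	clock := hlc.NewClock(mClock.UnixNano, time.Nanosecond)
	// Advance the wall clock by exactly one second; nothing else moves time.
	mClock.Increment(time.Second.Nanoseconds())
	return clock
}
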
func TestRegistryGC(t *testing.T) {
	defer leaktest.AfterTest(t)()
	t.Skip("")
	// TODO(lucy): This test probably shouldn't continue to exist in its current
	// form if GCMutations will cease to be used. Refactor or get rid of it.
   200  
   201  	ctx := context.Background()
   202  	s, sqlDB, kvDB := serverutils.StartServer(t, base.TestServerArgs{})
   203  	defer s.Stopper().Stop(ctx)
   204  
   205  	db := sqlutils.MakeSQLRunner(sqlDB)
   206  
   207  	type mutationOptions struct {
   208  		// Set if the desc should have any mutations of any sort.
   209  		hasMutation bool
   210  		// Set if the mutation being inserted is a GCMutation.
   211  		hasGCMutation bool
   212  		// Set if the desc should have a job that is dropping it.
   213  		hasDropJob bool
   214  	}

	ts := timeutil.Now()
	earlier := ts.Add(-1 * time.Hour)
	muchEarlier := ts.Add(-2 * time.Hour)

	setMutations := func(mutations []sqlbase.DescriptorMutation) sqlbase.ID {
		desc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "to_be_mutated")
		desc.Mutations = mutations
		if err := kvDB.Put(
			context.Background(),
			sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, desc.GetID()),
			sqlbase.WrapDescriptor(desc),
		); err != nil {
			t.Fatal(err)
		}
		return desc.GetID()
	}

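	// Like setMutations above, the helpers below each re-read the descriptor
	// from KV and write back one modified field, so a change made by one
	// helper is not clobbered by the next.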
	setGCMutations := func(gcMutations []sqlbase.TableDescriptor_GCDescriptorMutation) sqlbase.ID {
		desc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "to_be_mutated")
		desc.GCMutations = gcMutations
		if err := kvDB.Put(
			context.Background(),
			sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, desc.GetID()),
			sqlbase.WrapDescriptor(desc),
		); err != nil {
			t.Fatal(err)
		}
		return desc.GetID()
	}

	setDropJob := func(shouldDrop bool) sqlbase.ID {
		desc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "to_be_mutated")
		if shouldDrop {
			desc.DropJobID = 123
		} else {
			// Set it back to the default value.
			desc.DropJobID = 0
		}
		if err := kvDB.Put(
			context.Background(),
			sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, desc.GetID()),
			sqlbase.WrapDescriptor(desc),
		); err != nil {
			t.Fatal(err)
		}
		return desc.GetID()
	}

	writeJob := func(name string, created, finished time.Time, status Status, mutOptions mutationOptions) string {
		if _, err := sqlDB.Exec(`
CREATE DATABASE IF NOT EXISTS t; CREATE TABLE IF NOT EXISTS t.to_be_mutated AS SELECT 1`); err != nil {
			t.Fatal(err)
		}
		descriptorID := setDropJob(mutOptions.hasDropJob)
		if mutOptions.hasMutation {
			descriptorID = setMutations([]sqlbase.DescriptorMutation{{}})
		}
		if mutOptions.hasGCMutation {
			descriptorID = setGCMutations([]sqlbase.TableDescriptor_GCDescriptorMutation{{}})
		}

		payload, err := protoutil.Marshal(&jobspb.Payload{
			Description: name,
			Lease:       &jobspb.Lease{NodeID: 1, Epoch: 1},
			// Register a mutation on the table so that jobs that reference
			// the table are not considered orphaned.
			DescriptorIDs: []sqlbase.ID{
				descriptorID,
				sqlbase.InvalidID, // invalid id to test handling of missing descriptors.
			},
			Details:        jobspb.WrapPayloadDetails(jobspb.SchemaChangeDetails{}),
			StartedMicros:  timeutil.ToUnixMicros(created),
			FinishedMicros: timeutil.ToUnixMicros(finished),
		})
		if err != nil {
			t.Fatal(err)
		}
		progress, err := protoutil.Marshal(&jobspb.Progress{
			Details: jobspb.WrapProgressDetails(jobspb.SchemaChangeProgress{}),
		})
		if err != nil {
			t.Fatal(err)
		}

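		// Insert the row directly into system.jobs rather than going through
		// the registry, so the test controls the created timestamp exactly.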
		var id int64
		db.QueryRow(t,
			`INSERT INTO system.jobs (status, payload, progress, created) VALUES ($1, $2, $3, $4) RETURNING id`,
			status, payload, progress, created).Scan(&id)
		return strconv.Itoa(int(id))
	}

	// Test the descriptor when any of the following are set:
	// 1. Mutations
	// 2. GC Mutations
	// 3. A drop job
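	// That is 2^3 - 1 = 7 combinations; the all-false case is skipped below.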
	for _, hasMutation := range []bool{true, false} {
		for _, hasGCMutation := range []bool{true, false} {
			for _, hasDropJob := range []bool{true, false} {
				if !hasMutation && !hasGCMutation && !hasDropJob {
					continue
				}
				mutOptions := mutationOptions{
					hasMutation:   hasMutation,
					hasGCMutation: hasGCMutation,
					hasDropJob:    hasDropJob,
				}
				oldRunningJob := writeJob("old_running", muchEarlier, time.Time{}, StatusRunning, mutOptions)
				oldSucceededJob := writeJob("old_succeeded", muchEarlier, muchEarlier.Add(time.Minute), StatusSucceeded, mutOptions)
				oldSucceededJob2 := writeJob("old_succeeded2", muchEarlier, muchEarlier.Add(time.Minute), StatusSucceeded, mutOptions)
				newRunningJob := writeJob("new_running", earlier, time.Time{}, StatusRunning, mutOptions)
				newSucceededJob := writeJob("new_succeeded", earlier, earlier.Add(time.Minute), StatusSucceeded, mutOptions)

				db.CheckQueryResults(t, `SELECT id FROM system.jobs ORDER BY id`, [][]string{
					{oldRunningJob}, {oldSucceededJob}, {oldSucceededJob2}, {newRunningJob}, {newSucceededJob}})

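				// The first cleanup pass at the `earlier` cutoff deletes the old
				// succeeded jobs; the old running job and all newer jobs survive.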
				if err := s.JobRegistry().(*Registry).cleanupOldJobs(ctx, earlier); err != nil {
					t.Fatal(err)
				}
				db.CheckQueryResults(t, `SELECT id FROM system.jobs ORDER BY id`, [][]string{
					{oldRunningJob}, {newRunningJob}, {newSucceededJob}})

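				// A second pass at the same cutoff must be a no-op.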
				if err := s.JobRegistry().(*Registry).cleanupOldJobs(ctx, earlier); err != nil {
					t.Fatal(err)
				}
				db.CheckQueryResults(t, `SELECT id FROM system.jobs ORDER BY id`, [][]string{
					{oldRunningJob}, {newRunningJob}, {newSucceededJob}})

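				// Tightening the cutoff to ten minutes ago prunes the remaining
				// succeeded job, but the running jobs still survive.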
				if err := s.JobRegistry().(*Registry).cleanupOldJobs(ctx, ts.Add(time.Minute*-10)); err != nil {
					t.Fatal(err)
				}
				db.CheckQueryResults(t, `SELECT id FROM system.jobs ORDER BY id`, [][]string{
					{oldRunningJob}, {newRunningJob}})

				// Force the running jobs to become orphaned.
				_ = setMutations(nil)
				_ = setGCMutations(nil)
				_ = setDropJob(false)
				if err := s.JobRegistry().(*Registry).cleanupOldJobs(ctx, ts.Add(time.Minute*-10)); err != nil {
					t.Fatal(err)
				}
				db.CheckQueryResults(t, `SELECT id FROM system.jobs ORDER BY id`, [][]string{})
			}
		}
	}
}