github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/work_pool.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"context"
	"fmt"
	"math"

	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
)

// workPool keeps track of what tests still need to run and facilitates
// selecting the next test to run.
type workPool struct {
	// count is the total number of times each test has to run. It is constant.
	// Not to be confused with the count inside mu.tests, which tracks remaining
	// runs.
	count int
	mu    struct {
		syncutil.Mutex
		// tests with remaining run count.
		tests []testWithCount
	}
}
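
// newWorkPool creates a workPool in which each of the given tests has to be
// run count times.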
func newWorkPool(tests []testSpec, count int) *workPool {
	p := &workPool{count: count}
	for _, spec := range tests {
		p.mu.tests = append(p.mu.tests, testWithCount{spec: spec, count: count})
	}
	return p
}

// testToRunRes represents the return value of getTestToRun. It provides
// information about what test to run (if any) and what cluster to use for it.
type testToRunRes struct {
	// noWork is set if the work pool was empty and thus no test was selected.
	// No other fields are set.
	noWork bool
	// spec is the selected test.
	spec testSpec
	// runNum is the test's run number (1 if --count was not used).
	runNum int

	// canReuseCluster is true if the selected test can reuse the cluster passed
	// to getTestToRun(). It will be false if noWork is set.
	canReuseCluster bool
	// alloc is set if canReuseCluster is false (and noWork is not set). It
	// represents the resources to use for creating a new cluster (matching spec).
	// The alloc needs to be transferred to the cluster that is created, or
	// otherwise Release()d.
	alloc *quotapool.IntAlloc
}
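
// workRemaining returns a snapshot of the tests that still have runs left,
// together with their remaining run counts.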
func (p *workPool) workRemaining() []testWithCount {
	p.mu.Lock()
	defer p.mu.Unlock()
	res := make([]testWithCount, len(p.mu.tests))
	copy(res, p.mu.tests)
	return res
}

// getTestToRun selects a test. It optionally takes a cluster and will try to
// select a test that can reuse that cluster. If it succeeds, then
// testToRunRes.canReuseCluster will be set. Otherwise, the cluster is destroyed
// so its resources are released, and the result will contain a quota alloc to
// be used by the caller for creating a new cluster.
//
// If a new cluster needs to be created, the call blocks until enough resources
// are taken out of qp.
//
// If there are no more tests to run, c will be destroyed and the result will
// have noWork set.
//
// (A sketch of a typical caller loop follows this function.)
func (p *workPool) getTestToRun(
	ctx context.Context,
	c *cluster,
	qp *quotapool.IntPool,
	cr *clusterRegistry,
	onDestroy func(),
	l *logger,
) (testToRunRes, error) {
	// If we've been given a cluster, see if we can reuse it.
	if c != nil {
		ttr := p.selectTestForCluster(ctx, c.spec, cr)
		if ttr.noWork {
			// We failed to find a test that can take advantage of this cluster. So
			// we're going to release it, which will deallocate its resources, and
			// then we'll look for a test below.
			l.PrintfCtx(ctx,
				"No tests that can reuse cluster %s found (or there are no further tests to run). "+
					"Destroying.", c)
			c.Destroy(ctx, closeLogger, l)
			onDestroy()
		} else {
			return ttr, nil
		}
	}

	return p.selectTest(ctx, qp)
}
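
// A minimal sketch of how a caller might drive getTestToRun; makeCluster and
// runTest are hypothetical stand-ins for the real runner logic, which is more
// involved:
//
//	var c *cluster // nil on the first iteration
//	for {
//		ttr, err := p.getTestToRun(ctx, c, qp, cr, onDestroy, l)
//		if err != nil || ttr.noWork {
//			break
//		}
//		if !ttr.canReuseCluster {
//			// ttr.alloc is handed off to the new cluster (or Release()d on failure).
//			c = makeCluster(ctx, ttr.spec.Cluster, ttr.alloc)
//		}
//		runTest(ctx, ttr.spec, ttr.runNum, c)
//	}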

// selectTestForCluster selects a test to run on a cluster with a given spec.
//
// Among tests that match the spec, we do the following:
//   - If the cluster is already tagged, we only look at tests with the same
//     tag.
//   - Otherwise, we'll choose in the following order of preference:
//     1) tests that leave the cluster usable by anybody afterwards
//     2) tests that leave the cluster usable by some other tests
//        2.1) within this OnlyTagged<foo> category, we'll prefer the tag with
//             the fewest existing clusters
//     3) tests that leave the cluster unusable by anybody
//
// Within each of these categories, we give preference to tests that have run
// fewer times (i.e., that have more runs left).
//
// cr is used for its information about how many clusters with a given tag
// currently exist.
func (p *workPool) selectTestForCluster(
	ctx context.Context, spec clusterSpec, cr *clusterRegistry,
) testToRunRes {
	p.mu.Lock()
	defer p.mu.Unlock()
	testsWithCounts := p.findCompatibleTestsLocked(spec)

	if len(testsWithCounts) == 0 {
		return testToRunRes{noWork: true}
	}

	tag := ""
	if p, ok := spec.ReusePolicy.(reusePolicyTagged); ok {
		tag = p.tag
	}
	// Find the best test to run.
	candidateScore := 0
	var candidate testWithCount
	for _, tc := range testsWithCounts {
		score := scoreTestAgainstCluster(tc, tag, cr)
		if score > candidateScore {
			candidateScore = score
			candidate = tc
		}
	}

	p.decTestLocked(ctx, candidate.spec.Name)
	runNum := p.count - candidate.count + 1
	return testToRunRes{
		spec:            candidate.spec,
		runNum:          runNum,
		canReuseCluster: true,
	}
}

// selectTest selects a test to run based on the available resources. If there
// are not enough resources to run any test, it blocks until enough resources
// become available.
//
// If multiple tests are eligible to run, the one with the most runs left is
// chosen.
//
// TODO(andrei): We could be smarter in guessing what kind of cluster is best
// to allocate.
func (p *workPool) selectTest(ctx context.Context, qp *quotapool.IntPool) (testToRunRes, error) {
	var ttr testToRunRes
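	// The quota acquisition and the test selection happen in a single step:
	// the closure passed to AcquireFunc inspects the currently available
	// quota, picks a test that fits, and returns the amount of CPU quota to
	// charge for it. Returning quotapool.ErrNotEnoughQuota makes the pool wait
	// for quota to be released and then re-invoke the closure.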
	alloc, err := qp.AcquireFunc(ctx, func(ctx context.Context, pi quotapool.PoolInfo) (uint64, error) {
		p.mu.Lock()
		defer p.mu.Unlock()

		if len(p.mu.tests) == 0 {
			ttr = testToRunRes{
				noWork: true,
			}
			return 0, nil
		}

		candidateIdx := -1
		candidateCount := 0
		// smallestTest tracks the CPU requirement of the cheapest remaining
		// test, so we can distinguish "not enough quota available right now"
		// from "no amount of waiting will ever be enough".
		smallestTest := math.MaxInt64
		for i, t := range p.mu.tests {
			cpu := t.spec.Cluster.NodeCount * t.spec.Cluster.CPUs
			if cpu < smallestTest {
				smallestTest = cpu
			}
			if uint64(cpu) > pi.Available {
				continue
			}
			if t.count > candidateCount {
				candidateIdx = i
				candidateCount = t.count
			}
		}

		if candidateIdx == -1 {
			if uint64(smallestTest) > pi.Capacity {
				// Even a completely idle pool could not run any remaining test.
				return 0, fmt.Errorf("not enough CPU quota to run any of the remaining tests")
			}

			// Some test could run once quota is released; wait and retry.
			return 0, quotapool.ErrNotEnoughQuota
		}

		tc := p.mu.tests[candidateIdx]
		runNum := p.count - tc.count + 1
		p.decTestLocked(ctx, tc.spec.Name)
		ttr = testToRunRes{
			spec:            tc.spec,
			runNum:          runNum,
			canReuseCluster: false,
		}
		// Acquire as much quota as the selected test's cluster will use.
		cpu := tc.spec.Cluster.NodeCount * tc.spec.Cluster.CPUs
		return uint64(cpu), nil
	})
	if err != nil {
		return testToRunRes{}, err
	}
	ttr.alloc = alloc
	return ttr, nil
}

// scoreTestAgainstCluster scores the suitability of running a test against a
// cluster currently tagged with tag (empty if cluster is not tagged).
//
// cr is used for its information about how many clusters with a given tag
// currently exist.
func scoreTestAgainstCluster(tc testWithCount, tag string, cr *clusterRegistry) int {
	t := tc.spec
	testPolicy := t.Cluster.ReusePolicy
	if tag != "" && testPolicy != (reusePolicyTagged{tag: tag}) {
		log.Fatalf(context.TODO(),
			"incompatible test and cluster. Cluster tag: %s. Test policy: %+v",
			tag, t.Cluster.ReusePolicy)
	}
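	// Scores are tiered so that the reuse policy dominates: tests that leave
	// the cluster reusable by anybody start at 1000000, tagged-reuse tests at
	// 500000, and no-reuse tests at 0. The remaining run count is added on top
	// as a tiebreaker within a tier.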
	score := 0
	if _, ok := testPolicy.(reusePolicyAny); ok {
		score = 1000000
	} else if tp, ok := testPolicy.(reusePolicyTagged); ok {
		score = 500000
		if tag == "" {
			// We have an untagged cluster and a tagged test. Within this category of
			// tests, we prefer the tags with the fewest existing clusters, so count
			// the clusters already carrying the test's tag.
			score -= 1000 * cr.countForTag(tp.tag)
		}
	} else { // NoReuse policy
		score = 0
	}

	// We prefer tests that have run fewer times (so, that have more runs left).
	score += tc.count

	return score
}

// findCompatibleTestsLocked returns a list of tests compatible with a cluster spec.
func (p *workPool) findCompatibleTestsLocked(clusterSpec clusterSpec) []testWithCount {
	if _, ok := clusterSpec.ReusePolicy.(reusePolicyNone); ok {
		panic("can't search for tests compatible with a ReuseNone policy")
	}
	var tests []testWithCount
	for _, tc := range p.mu.tests {
		if clustersCompatible(clusterSpec, tc.spec.Cluster) {
			tests = append(tests, tc)
		}
	}
	return tests
}

// decTestLocked decrements a test's remaining count and removes it
// from the workPool if it was exhausted.
func (p *workPool) decTestLocked(ctx context.Context, name string) {
	idx := -1
	for i := range p.mu.tests {
		if p.mu.tests[i].spec.Name == name {
			idx = i
			break
		}
	}
	if idx == -1 {
		log.Fatalf(ctx, "failed to find test: %s", name)
	}
	tc := &p.mu.tests[idx]
	tc.count--
	if tc.count == 0 {
		// We've selected the last run for a test. Take that test out of the pool.
		p.mu.tests = append(p.mu.tests[:idx], p.mu.tests[idx+1:]...)
	}
}