github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/work_pool.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"context"
	"fmt"
	"math"

	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
)

// workPool keeps track of what tests still need to run and facilitates
// selecting the next test to run.
type workPool struct {
	// count is the total number of times each test has to run. It is constant.
	// Not to be confused with the count inside mu.tests, which tracks remaining
	// runs.
	count int
	mu    struct {
		syncutil.Mutex
		// tests with remaining run count.
		tests []testWithCount
	}
}

func newWorkPool(tests []testSpec, count int) *workPool {
	p := &workPool{count: count}
	for _, spec := range tests {
		p.mu.tests = append(p.mu.tests, testWithCount{spec: spec, count: count})
	}
	return p
}

// testToRunRes represents the return value of getTestToRun. It provides
// information about what test to run (if any) and what cluster to use for it.
type testToRunRes struct {
	// noWork is set if the work pool was empty and thus no test was selected. No
	// other fields are set.
	noWork bool
	// spec is the selected test.
	spec testSpec
	// runNum is the run number. 1 if --count was not used.
	runNum int

	// canReuseCluster is true if the selected test can reuse the cluster passed
	// to testToRun(). Will be false if noWork is set.
	canReuseCluster bool
	// alloc is set if canReuseCluster is false (and noWork is not set). It
	// represents the resources to use for creating a new cluster (matching spec).
	// The alloc needs to be transferred to the cluster that is created, or
	// otherwise Release()d.
	alloc *quotapool.IntAlloc
}

func (p *workPool) workRemaining() []testWithCount {
	p.mu.Lock()
	defer p.mu.Unlock()
	res := make([]testWithCount, len(p.mu.tests))
	copy(res, p.mu.tests)
	return res
}

// getTestToRun selects a test. It optionally takes a cluster and will try to
// select a test that can reuse that cluster. If it succeeds, then
// testToRunRes.canReuseCluster will be set. Otherwise, the cluster is destroyed
// so its resources are released, and the result will contain a quota alloc to
// be used by the caller for creating a new cluster.
//
// If a new cluster needs to be created, the call blocks until enough resources
// are taken out of qp.
//
// If there are no more tests to run, c will be destroyed and the result will
// have noWork set.
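//
// A caller typically handles the result along these lines (a sketch; the
// error, noWork, and canReuseCluster outcomes are mutually exclusive):
//
//	ttr, err := p.getTestToRun(ctx, c, qp, cr, onDestroy, l)
//	switch {
//	case err != nil:
//		// Quota acquisition failed (e.g. the context was canceled).
//	case ttr.noWork:
//		// Nothing left to run; shut this worker down.
//	case ttr.canReuseCluster:
//		// Run ttr.spec on the existing cluster c.
//	default:
//		// Create a new cluster matching ttr.spec, transferring ttr.alloc to it
//		// (or Release()ing the alloc on failure), then run ttr.spec.
//	}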
func (p *workPool) getTestToRun(
	ctx context.Context,
	c *cluster,
	qp *quotapool.IntPool,
	cr *clusterRegistry,
	onDestroy func(),
	l *logger,
) (testToRunRes, error) {
	// If we've been given a cluster, see if we can reuse it.
	if c != nil {
		ttr := p.selectTestForCluster(ctx, c.spec, cr)
		if ttr.noWork {
			// We failed to find a test that can take advantage of this cluster. So
			// we're going to release it, which will deallocate its resources, and
			// then we'll look for a test below.
			l.PrintfCtx(ctx,
				"No tests that can reuse cluster %s found (or there are no further tests to run). "+
					"Destroying.", c)
			c.Destroy(ctx, closeLogger, l)
			onDestroy()
		} else {
			return ttr, nil
		}
	}

	return p.selectTest(ctx, qp)
}

// selectTestForCluster selects a test to run on a cluster with a given spec.
//
// Among tests that match the spec, we do the following:
//   - If the cluster is already tagged, we only look at tests with the same tag.
//   - Otherwise, we'll choose in the following order of preference:
//     1) tests that leave the cluster usable by anybody afterwards
//     2) tests that leave the cluster usable by some other tests
//     2.1) within this OnlyTagged<foo> category, we'll prefer the tag with the
//     fewest existing clusters.
//     3) tests that leave the cluster unusable by anybody
//
// Within each of the categories, we'll give preference to tests with fewer
// runs.
//
// cr is used for its information about how many clusters with a given tag
// currently exist.
func (p *workPool) selectTestForCluster(
	ctx context.Context, spec clusterSpec, cr *clusterRegistry,
) testToRunRes {
	p.mu.Lock()
	defer p.mu.Unlock()
	testsWithCounts := p.findCompatibleTestsLocked(spec)

	if len(testsWithCounts) == 0 {
		return testToRunRes{noWork: true}
	}

	tag := ""
	if p, ok := spec.ReusePolicy.(reusePolicyTagged); ok {
		tag = p.tag
	}
	// Find the best test to run.
	candidateScore := 0
	var candidate testWithCount
	for _, tc := range testsWithCounts {
		score := scoreTestAgainstCluster(tc, tag, cr)
		if score > candidateScore {
			candidateScore = score
			candidate = tc
		}
	}

	p.decTestLocked(ctx, candidate.spec.Name)
	runNum := p.count - candidate.count + 1
	return testToRunRes{
		spec:            candidate.spec,
		runNum:          runNum,
		canReuseCluster: true,
	}
}

// selectTest selects a test to run based on the available resources. If there
// are no resources available to run any test, it blocks until enough resources
// become available.
//
// If multiple tests are eligible to run, one with the most runs left is chosen.
// TODO(andrei): We could be smarter in guessing what kind of cluster is best to
// allocate.
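//
// Internally, the selection runs inside qp.AcquireFunc: the closure returns the
// number of CPUs (NodeCount * CPUs) to take for the chosen test; it returns
// quotapool.ErrNotEnoughQuota when nothing currently fits, which makes the
// acquisition block until more quota is released; and it returns a plain error
// (aborting the acquisition) if even the smallest remaining test exceeds the
// pool's total capacity.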
func (p *workPool) selectTest(ctx context.Context, qp *quotapool.IntPool) (testToRunRes, error) {
	var ttr testToRunRes
	alloc, err := qp.AcquireFunc(ctx, func(ctx context.Context, pi quotapool.PoolInfo) (uint64, error) {
		p.mu.Lock()
		defer p.mu.Unlock()

		if len(p.mu.tests) == 0 {
			ttr = testToRunRes{
				noWork: true,
			}
			return 0, nil
		}

		candidateIdx := -1
		candidateCount := 0
		smallestTest := math.MaxInt64
		for i, t := range p.mu.tests {
			cpu := t.spec.Cluster.NodeCount * t.spec.Cluster.CPUs
			if cpu < smallestTest {
				smallestTest = cpu
			}
			if uint64(cpu) > pi.Available {
				continue
			}
			if t.count > candidateCount {
				candidateIdx = i
				candidateCount = t.count
			}
		}

		if candidateIdx == -1 {
			if uint64(smallestTest) > pi.Capacity {
				return 0, fmt.Errorf("not enough CPU quota to run any of the remaining tests")
			}

			return 0, quotapool.ErrNotEnoughQuota
		}

		tc := p.mu.tests[candidateIdx]
		runNum := p.count - tc.count + 1
		p.decTestLocked(ctx, tc.spec.Name)
		ttr = testToRunRes{
			spec:            tc.spec,
			runNum:          runNum,
			canReuseCluster: false,
		}
		cpu := tc.spec.Cluster.NodeCount * tc.spec.Cluster.CPUs
		return uint64(cpu), nil
	})
	if err != nil {
		return testToRunRes{}, err
	}
	ttr.alloc = alloc
	return ttr, nil
}

// scoreTestAgainstCluster scores the suitability of running a test against a
// cluster currently tagged with tag (empty if cluster is not tagged).
//
// cr is used for its information about how many clusters with a given tag
// currently exist.
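//
// Scores are tiered: tests whose cluster can be reused by anybody score around
// 1,000,000, tagged-reuse tests around 500,000, and no-reuse tests near 0. The
// test's remaining run count is added on top, so within a tier the test with
// the most runs left wins.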
func scoreTestAgainstCluster(tc testWithCount, tag string, cr *clusterRegistry) int {
	t := tc.spec
	testPolicy := t.Cluster.ReusePolicy
	if tag != "" && testPolicy != (reusePolicyTagged{tag: tag}) {
		log.Fatalf(context.TODO(),
			"incompatible test and cluster. Cluster tag: %s. Test policy: %+v",
			tag, t.Cluster.ReusePolicy)
	}
	score := 0
	if _, ok := testPolicy.(reusePolicyAny); ok {
		score = 1000000
	} else if _, ok := testPolicy.(reusePolicyTagged); ok {
		score = 500000
		if tag == "" {
			// We have an untagged cluster and a tagged test. Within this category of
			// tests, we prefer the tags with the fewest existing clusters.
			score -= 1000 * cr.countForTag(tag)
		}
	} else { // NoReuse policy
		score = 0
	}

	// We prefer tests that have run fewer times (so, that have more runs left).
	score += tc.count

	return score
}

// findCompatibleTestsLocked returns a list of tests compatible with a cluster spec.
func (p *workPool) findCompatibleTestsLocked(clusterSpec clusterSpec) []testWithCount {
	if _, ok := clusterSpec.ReusePolicy.(reusePolicyNone); ok {
		panic("can't search for tests compatible with a ReuseNone policy")
	}
	var tests []testWithCount
	for _, tc := range p.mu.tests {
		if clustersCompatible(clusterSpec, tc.spec.Cluster) {
			tests = append(tests, tc)
		}
	}
	return tests
}

// decTestLocked decrements a test's remaining run count and removes the test
// from the workPool if the count is exhausted.
func (p *workPool) decTestLocked(ctx context.Context, name string) {
	idx := -1
	for i := range p.mu.tests {
		if p.mu.tests[i].spec.Name == name {
			idx = i
			break
		}
	}
	if idx == -1 {
		log.Fatalf(ctx, "failed to find test: %s", name)
	}
	tc := &p.mu.tests[idx]
	tc.count--
	if tc.count == 0 {
		// We've selected the last run for a test. Take that test out of the pool.
		p.mu.tests = append(p.mu.tests[:idx], p.mu.tests[idx+1:]...)
	}
}