github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/exec/slicemachine_test.go

// Copyright 2018 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.

package exec

import (
	"context"
	"errors"
	"fmt"
	"math/rand"
	"sync"
	"testing"
	"time"

	"github.com/grailbio/bigmachine"
	"github.com/grailbio/bigmachine/testsystem"
)

func TestSlicemachineLoad(t *testing.T) {
	for _, maxLoad := range []float64{0.5, 0.90, 1.5} {
		t.Run(fmt.Sprint("maxLoad=", maxLoad), func(t *testing.T) {
			const (
				Nproc = 100
				Nmach = 10
			)
			ntask := int(maxLoad * Nproc * Nmach)
			system, _, mgr, cancel := startTestSystem(
				Nproc,
				ntask,
				maxLoad,
			)
			defer cancel()

			if got, want := system.N(), 0; got != want {
				t.Errorf("got %v, want %v", got, want)
			}
			ctx := context.Background()
			ms := getMachines(ctx, mgr, 1)
			if got, want := system.Wait(1), 1; got != want {
				t.Errorf("got %v, want %v", got, want)
			}
			ms = append(ms, getMachines(ctx, mgr, ntask-1)...)
			if got, want := system.Wait(Nmach), Nmach; got != want {
				t.Errorf("got %v, want %v", got, want)
			}
			mustUnavailable(t, mgr)
			if got, want := system.Wait(Nmach), Nmach; got != want {
				t.Errorf("got %v, want %v", got, want)
			}
			// Machines should be balanced, with each machine loaded to
			// exactly maxLoad.
			loads := make(map[*sliceMachine]int)
			for i := range ms {
				if ms[i] != nil {
					loads[ms[i]]++
				}
			}
			if got, want := len(loads), Nmach; got != want {
				t.Errorf("got %v, want %v", got, want)
			}
			for m, v := range loads {
				if got, want := v, int(Nproc*maxLoad); got != want {
					t.Errorf("%s: got %v, want %v", m, got, want)
				}
			}
		})
	}
}

func TestSlicemachineExclusive(t *testing.T) {
	var (
		system, _, mgr, cancel = startTestSystem(32, 64, 0)
		ctx                    = context.Background()
	)
	getMachines(ctx, mgr, 1)
	if got, want := system.Wait(1), 1; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
	getMachines(ctx, mgr, 1)
	mustUnavailable(t, mgr)
	if got, want := system.Wait(2), 2; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
	cancel()
	if got, want := system.N(), 2; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
}

func TestSlicemachineProbation(t *testing.T) {
	system, _, mgr, cancel := startTestSystem(2, 4, 1.0)
	defer cancel()

	ctx := context.Background()
	ms := getMachines(ctx, mgr, 4)
	if got, want := system.N(), 2; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
	ms[0].Done(1, errors.New("some error"))
	mustUnavailable(t, mgr)
	if got, want := ms[0].health, machineProbation; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
	ms[1].Done(1, nil)
	ns := getMachines(ctx, mgr, 2)
	if got, want := ns[0], ms[0]; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
	if got, want := ns[1], ms[1]; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
	if got, want := ms[0].health, machineOk; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
}
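// probationRoundTrip is an illustrative sketch, not part of the original
// file, of the health transitions that TestSlicemachineProbation drives
// through Done. It assumes only the Done semantics implied by this file's
// tests: a task error demotes a machine to machineProbation, and a later
// successful completion restores machineOk (see the "Bring two machines
// back from probation" comment in TestSlicemachineProbationTimeout below).
func probationRoundTrip(m *sliceMachine) {
	m.Done(1, errors.New("some error")) // machineOk -> machineProbation
	m.Done(1, nil)                      // machineProbation -> machineOk
}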
// TestSlicemachineProbationTimeout verifies that machines that have been put
// on probation and do not experience further errors are taken off probation
// once the probation timeout elapses.
func TestSlicemachineProbationTimeout(t *testing.T) {
	const machinep = 2
	const maxp = 16
	if maxp < machinep*4 {
		panic("maxp not big enough")
	}
	// This test takes way too long to recover with the default probation
	// timeout.
	save := ProbationTimeout
	ProbationTimeout = time.Second
	defer func() {
		ProbationTimeout = save
	}()
	_, _, mgr, cancel := startTestSystem(machinep, maxp, 1.0)
	defer cancel()
	ctx := context.Background()
	ms := getMachines(ctx, mgr, maxp)
	for i := range ms {
		if i%machinep != 0 {
			continue
		}
		ms[i].Done(1, errors.New("some error"))
	}
	// Bring two machines back from probation with successful completions to
	// make sure there's no surprising interaction with timeouts.
	ms[0*machinep].Done(1, nil)
	ms[2*machinep].Done(1, nil)
	ctx, ctxcancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer ctxcancel()
	for {
		select {
		case <-ctx.Done():
			t.Fatal("took too long")
		default:
		}
		<-time.After(100 * time.Millisecond)
		var healthyCount int
		for i := range ms {
			if i%machinep != 0 {
				continue
			}
			if ms[i].health == machineOk {
				healthyCount++
			}
		}
		if healthyCount == maxp/machinep {
			break
		}
	}
}

func TestSlicemachineLost(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping test in short mode")
	}
	system, _, mgr, cancel := startTestSystem(2, 4, 1.0)
	defer cancel()

	ctx := context.Background()
	ms := getMachines(ctx, mgr, 4)
	system.Kill(ms[0].Machine)
	for ms[0].health != machineLost {
		<-time.After(10 * time.Millisecond)
	}
	if got, want := system.Wait(2), 2; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
}

// TestSlicemachinePriority verifies that higher-priority (lower-valued)
// requests are serviced before lower-priority requests.
func TestSlicemachinePriority(t *testing.T) {
	const maxp = 16
	_, _, mgr, cancel := startTestSystem(2, maxp, 1.0)
	defer cancel()

	ctx, ctxcancel := context.WithCancel(context.Background())
	defer ctxcancel()
	// Get machines up to our maximum parallelism. Any requests made afterwards
	// will need to be queued until these offers are returned.
	ms := getMachines(ctx, mgr, maxp)
	sema := make(chan struct{})
	c := make(chan int)
	// Queue up many offer requests with distinct priorities in [0, maxp*4).
	// We expect that the offer requests with priorities in [0, maxp) will be
	// serviced first. Queue in descending priority value in case requests are
	// serviced in FIFO order.
	for i := (maxp * 4) - 1; i >= 0; i-- {
		i := i
		go func() {
			offerc, _ := mgr.Offer(i, 1)
			sema <- struct{}{}
			select {
			case <-offerc:
			case <-ctx.Done():
				return
			}
			c <- i
		}()
		// Wait for the goroutine's offer request to be queued.
		<-sema
	}
	// Return the original machines/procs to allow the machines to be offered
	// to our blocked requests.
	for _, m := range ms {
		m.Done(1, nil)
	}
	for j := 0; j < maxp; j++ {
		i := <-c
		if i >= maxp {
			t.Error("did not respect priority")
		}
	}
}
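// offerOne is an illustrative helper, hypothetical and not part of the
// original file, showing the offer-request pattern used throughout these
// tests. It assumes only the semantics implied by the call sites in this
// file: Offer(priority, procs) returns a channel that delivers a
// *sliceMachine once procs processors have been reserved, plus a func that
// withdraws a still-pending request (used this way in mustUnavailable).
// Lower priority values are serviced first, as TestSlicemachinePriority
// verifies above.
func offerOne(ctx context.Context, mgr *machineManager, priority int) (*sliceMachine, error) {
	offerc, offerCancel := mgr.Offer(priority, 1)
	select {
	case m := <-offerc:
		// The caller is responsible for returning the proc with m.Done.
		return m, nil
	case <-ctx.Done():
		offerCancel() // withdraw the pending request
		return nil, ctx.Err()
	}
}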
// TestSlicemachineNonblockingExclusive verifies that the scheduling
// algorithm does not allow an exclusive task to block progress on
// non-exclusive tasks while we wait to schedule it.
func TestSlicemachineNonblockingExclusive(t *testing.T) {
	const (
		maxp      = 128
		machprocs = maxp / 2
	)
	_, _, mgr, cancel := startTestSystem(machprocs, maxp, 1.0)
	defer cancel()

	ctx, ctxcancel := context.WithCancel(context.Background())
	defer ctxcancel()

	ms := getMachines(ctx, mgr, maxp)
	// Return about half of the machines/procs back to the pool immediately.
	// Occupy the other half indefinitely, making it impossible to successfully
	// schedule an "exclusive" task.
	r := rand.New(rand.NewSource(0))
	for _, m := range ms {
		if r.Float64() < 0.5 {
			m.Done(1, nil)
		}
	}
	var wg sync.WaitGroup
	// Attempt to schedule an exclusive task. We expect this to be impossible
	// to schedule.
	wg.Add(1)
	go func() {
		offerc, _ := mgr.Offer(1, machprocs)
		wg.Done()
		select {
		case <-offerc:
			// This means that we were able to schedule the exclusive task. We
			// shouldn't get here, as we should have been able to use one of
			// our half-loaded machines to schedule all of our lower-priority
			// requests first.
			panic("impossible scheduling")
		case <-ctx.Done():
			return
		}
	}()
	wg.Wait()
	// Schedule a bunch of lower-priority (2) tasks. These should all be
	// successfully scheduled to run on one of our machines while the other is
	// reserved for the exclusive task.
	wg.Add(maxp)
	for i := 0; i < maxp; i++ {
		go func() {
			defer wg.Done()
			offerc, _ := mgr.Offer(2, 1)
			select {
			case m := <-offerc:
				m.Done(1, nil)
			case <-ctx.Done():
				return
			}
		}()
	}
	wg.Wait()
	// Returning means that the test passes. If we're blocked on scheduling the
	// exclusive task, we'll never return, and the test will time out.
}

func startTestSystem(machinep, maxp int, maxLoad float64) (system *testsystem.System, b *bigmachine.B, m *machineManager, cancel func()) {
	system = testsystem.New()
	system.Machineprocs = machinep
	// Customize timeouts so that tests run faster.
	system.KeepalivePeriod = time.Second
	system.KeepaliveTimeout = 5 * time.Second
	system.KeepaliveRpcTimeout = time.Second
	b = bigmachine.Start(system)
	ctx, ctxcancel := context.WithCancel(context.Background())
	m = newMachineManager(b, nil, nil, maxp, maxLoad, &worker{MachineCombiners: false})
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		m.Do(ctx)
		wg.Done()
	}()
	cancel = func() {
		ctxcancel()
		b.Shutdown()
		wg.Wait()
	}
	return
}

// getMachines gets n machines from mgr and returns them, blocking until all
// n offers have been filled.
func getMachines(ctx context.Context, mgr *machineManager, n int) []*sliceMachine {
	ms := make([]*sliceMachine, n)
	for i := range ms {
		offerc, _ := mgr.Offer(0, 1)
		ms[i] = <-offerc
	}
	return ms
}

// mustUnavailable asserts that no machine is immediately available from mgr.
func mustUnavailable(t *testing.T, mgr *machineManager) {
	t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
	defer cancel()
	offerc, offerCancel := mgr.Offer(0, 1)
	select {
	case <-offerc:
		t.Fatal("unexpected machine available")
	case <-ctx.Done():
		offerCancel()
	}
}
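// waitForHealth is an illustrative helper, not part of the original file,
// that factors out the polling pattern used by TestSlicemachineProbationTimeout
// and TestSlicemachineLost: poll the (unexported) health field until it
// reaches the wanted state or the context expires. The machineHealth type
// name is an assumption inferred from the machineOk/machineProbation/
// machineLost values used above.
func waitForHealth(ctx context.Context, m *sliceMachine, want machineHealth) error {
	for m.health != want {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(10 * time.Millisecond):
			// Poll again; health is updated asynchronously by the manager.
		}
	}
	return nil
}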