github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/exec/slicemachine_test.go

// Copyright 2018 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.

package exec

import (
	"context"
	"errors"
	"fmt"
	"math/rand"
	"sync"
	"testing"
	"time"

	"github.com/grailbio/bigmachine"
	"github.com/grailbio/bigmachine/testsystem"
)

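// TestSlicemachineLoad verifies that the machine manager starts machines on
// demand, loads each machine with at most maxLoad*Nproc procs, and balances
// the offered procs evenly across machines.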
func TestSlicemachineLoad(t *testing.T) {
	for _, maxLoad := range []float64{0.5, 0.90, 1.5} {
		t.Run(fmt.Sprint("maxLoad=", maxLoad), func(t *testing.T) {
			const (
				Nproc = 100
				Nmach = 10
			)
			ntask := int(maxLoad * Nproc * Nmach)
			system, _, mgr, cancel := startTestSystem(
				Nproc,
				ntask,
				maxLoad,
			)
			defer cancel()

			if got, want := system.N(), 0; got != want {
				t.Errorf("got %v, want %v", got, want)
			}
			ctx := context.Background()
			ms := getMachines(ctx, mgr, 1)
			if got, want := system.Wait(1), 1; got != want {
				t.Errorf("got %v, want %v", got, want)
			}
			ms = append(ms, getMachines(ctx, mgr, ntask-1)...)
			if got, want := system.Wait(Nmach), Nmach; got != want {
				t.Errorf("got %v, want %v", got, want)
			}
			mustUnavailable(t, mgr)
			if got, want := system.Wait(Nmach), Nmach; got != want {
				t.Errorf("got %v, want %v", got, want)
			}
			// Machines should be balanced, and allow maxLoad load.
			loads := make(map[*sliceMachine]int)
			for i := range ms {
				if ms[i] != nil {
					loads[ms[i]]++
				}
			}
			if got, want := len(loads), Nmach; got != want {
				t.Errorf("got %v, want %v", got, want)
			}
			for m, v := range loads {
				if got, want := v, int(Nproc*maxLoad); got != want {
					t.Errorf("%s: got %v, want %v", m, got, want)
				}
			}
		})
	}
}

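// TestSlicemachineExclusive verifies that a maxLoad of 0 gives each request
// exclusive use of a machine: each single-proc offer starts a fresh machine,
// and no additional capacity is offered on machines already in use.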
func TestSlicemachineExclusive(t *testing.T) {
	var (
		system, _, mgr, cancel = startTestSystem(32, 64, 0)
		ctx                    = context.Background()
	)
	getMachines(ctx, mgr, 1)
	if got, want := system.Wait(1), 1; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
	getMachines(ctx, mgr, 1)
	mustUnavailable(t, mgr)
	if got, want := system.Wait(2), 2; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
	cancel()
	if got, want := system.N(), 2; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
}

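// TestSlicemachineProbation verifies that a machine whose task completes with
// an error is placed on probation, that no new procs are offered while it is
// in that state, and that it returns to good health once tasks complete
// successfully.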
func TestSlicemachineProbation(t *testing.T) {
	system, _, mgr, cancel := startTestSystem(2, 4, 1.0)
	defer cancel()

	ctx := context.Background()
	ms := getMachines(ctx, mgr, 4)
	if got, want := system.N(), 2; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
	ms[0].Done(1, errors.New("some error"))
	mustUnavailable(t, mgr)
	if got, want := ms[0].health, machineProbation; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
	ms[1].Done(1, nil)
	ns := getMachines(ctx, mgr, 2)
	if got, want := ns[0], ms[0]; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
	if got, want := ns[1], ms[1]; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
	if got, want := ms[0].health, machineOk; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
}

// TestSlicemachineProbationTimeout verifies that machines that have been put on
// probation and do not experience further errors are removed from probation.
func TestSlicemachineProbationTimeout(t *testing.T) {
	const machinep = 2
	const maxp = 16
	if maxp < machinep*4 {
		panic("maxp not big enough")
	}
	// This test takes way too long to recover with the default probation
	// timeout.
	save := ProbationTimeout
	ProbationTimeout = time.Second
	defer func() {
		ProbationTimeout = save
	}()
	_, _, mgr, cancel := startTestSystem(machinep, maxp, 1.0)
	defer cancel()
	ctx := context.Background()
	ms := getMachines(ctx, mgr, maxp)
	for i := range ms {
		if i%machinep != 0 {
			continue
		}
		ms[i].Done(1, errors.New("some error"))
	}
	// Bring two machines back from probation with successful completions to
	// make sure there's no surprising interaction with timeouts.
	ms[0*machinep].Done(1, nil)
	ms[2*machinep].Done(1, nil)
	ctx, ctxcancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer ctxcancel()
	for {
		select {
		case <-ctx.Done():
			t.Fatal("took too long")
		default:
		}
		<-time.After(100 * time.Millisecond)
		var healthyCount int
		for i := range ms {
			if i%machinep != 0 {
				continue
			}
			if ms[i].health == machineOk {
				healthyCount++
			}
		}
		if healthyCount == maxp/machinep {
			break
		}
	}
}

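// TestSlicemachineLost verifies that a machine that dies is marked lost and
// that the manager replaces it so that the expected number of machines are
// again live.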
func TestSlicemachineLost(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping test in short mode")
	}
	system, _, mgr, cancel := startTestSystem(2, 4, 1.0)
	defer cancel()

	ctx := context.Background()
	ms := getMachines(ctx, mgr, 4)
	system.Kill(ms[0].Machine)
	for ms[0].health != machineLost {
		<-time.After(10 * time.Millisecond)
	}
	if got, want := system.Wait(2), 2; got != want {
		t.Errorf("got %v, want %v", got, want)
	}
}

// TestSlicemachinePriority verifies that higher-priority requests are serviced
// before lower-priority requests.
func TestSlicemachinePriority(t *testing.T) {
	const maxp = 16
	_, _, mgr, cancel := startTestSystem(2, maxp, 1.0)
	defer cancel()

	ctx, ctxcancel := context.WithCancel(context.Background())
	defer ctxcancel()
	// Get machines up to our maximum parallelism. Any requests made afterwards
	// will need to be queued until these offers are returned.
	ms := getMachines(ctx, mgr, maxp)
	sema := make(chan struct{})
	c := make(chan int)
	// Queue up many offer requests with distinct priorities in [0, maxp*4).
	// We'll expect that the offer requests with priorities in [0, maxp) will be
	// serviced first. Queue in descending priority value in case requests are
	// serviced in FIFO order.
	for i := (maxp * 4) - 1; i >= 0; i-- {
		i := i
		go func() {
			offerc, _ := mgr.Offer(i, 1)
			sema <- struct{}{}
			select {
			case <-offerc:
			case <-ctx.Done():
				return
			}
			c <- i
		}()
		// Wait for the goroutine's offer request to be queued.
		<-sema
	}
	// Return the original machines/procs to allow the machines to be offered to
	// our blocked requests.
	for _, m := range ms {
		m.Done(1, nil)
	}
	for j := 0; j < maxp; j++ {
		i := <-c
		if i >= maxp {
			t.Error("did not respect priority")
		}
	}
}

// TestSlicemachineNonblockingExclusive verifies that the scheduling
// algorithm does not allow an exclusive task to block progress on
// non-exclusive tasks while we wait to schedule it.
func TestSlicemachineNonblockingExclusive(t *testing.T) {
	const (
		maxp      = 128
		machprocs = maxp / 2
	)
	_, _, mgr, cancel := startTestSystem(machprocs, maxp, 1.0)
	defer cancel()

	ctx, ctxcancel := context.WithCancel(context.Background())
	defer ctxcancel()

	ms := getMachines(ctx, mgr, maxp)
	// Return about half of the machines/procs back to the pool immediately.
	// Occupy the other half indefinitely, making it impossible to successfully
	// schedule an "exclusive" task.
	r := rand.New(rand.NewSource(0))
	for _, m := range ms {
		if r.Float64() < 0.5 {
			m.Done(1, nil)
			continue
		}
	}
	var wg sync.WaitGroup
	// Attempt to schedule an exclusive task. We expect this to be impossible
	// to schedule.
	wg.Add(1)
	go func() {
		offerc, _ := mgr.Offer(1, machprocs)
		wg.Done()
		select {
		case <-offerc:
			// This means that we were able to schedule the exclusive task. We
			// shouldn't get here, as we should have been able to use one of
			// our half-loaded machines to schedule all of our lower priority
			// requests first.
			panic("impossible scheduling")
		case <-ctx.Done():
			return
		}
	}()
	wg.Wait()
	// Schedule a bunch of lower priority (2) tasks. These should all be
	// successfully scheduled to run on one of our machines while the other is
	// reserved for the exclusive task.
	wg.Add(maxp)
	for i := 0; i < maxp; i++ {
		go func() {
			defer wg.Done()
			offerc, _ := mgr.Offer(2, 1)
			select {
			case m := <-offerc:
				m.Done(1, nil)
			case <-ctx.Done():
				return
			}
		}()
	}
	wg.Wait()
	// Returning means that the test passes. If we're blocked on scheduling the
	// exclusive task, we'll never return, and the test will time out.
}

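// startTestSystem starts a bigmachine test system with machinep procs per
// machine and a machine manager configured with a parallelism limit of maxp
// and the given maxLoad. The returned cancel func stops the manager and shuts
// the system down.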
func startTestSystem(machinep, maxp int, maxLoad float64) (system *testsystem.System, b *bigmachine.B, m *machineManager, cancel func()) {
	system = testsystem.New()
	system.Machineprocs = machinep
	// Customize timeouts so that tests run faster.
	system.KeepalivePeriod = time.Second
	system.KeepaliveTimeout = 5 * time.Second
	system.KeepaliveRpcTimeout = time.Second
	b = bigmachine.Start(system)
	ctx, ctxcancel := context.WithCancel(context.Background())
	m = newMachineManager(b, nil, nil, maxp, maxLoad, &worker{MachineCombiners: false})
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		m.Do(ctx)
		wg.Done()
	}()
	cancel = func() {
		ctxcancel()
		b.Shutdown()
		wg.Wait()
	}
	return
}

// getMachines gets n machines from mgr and returns them.
func getMachines(ctx context.Context, mgr *machineManager, n int) []*sliceMachine {
	ms := make([]*sliceMachine, n)
	for i := range ms {
		offerc, _ := mgr.Offer(0, 1)
		ms[i] = <-offerc
	}
	return ms
}

// mustUnavailable asserts that no machine is immediately available from mgr.
func mustUnavailable(t *testing.T, mgr *machineManager) {
	t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
	defer cancel()
	offerc, offerCancel := mgr.Offer(0, 1)
	select {
	case <-offerc:
		t.Fatal("unexpected machine available")
	case <-ctx.Done():
		// No machine was offered before the timeout; cancel the pending offer.
		offerCancel()
	}
}