github.com/square/finch@v0.0.0-20240412205204-6530c03e2b96/stage/stage.go (about)

     1  // Copyright 2024 Block, Inc.
     2  
     3  package stage
     4  
     5  import (
     6  	"context"
     7  	"fmt"
     8  	"log"
     9  	"runtime/pprof"
    10  	"time"
    11  
    12  	"github.com/square/finch"
    13  	"github.com/square/finch/client"
    14  	"github.com/square/finch/config"
    15  	"github.com/square/finch/data"
    16  	"github.com/square/finch/dbconn"
    17  	"github.com/square/finch/limit"
    18  	"github.com/square/finch/stats"
    19  	"github.com/square/finch/trx"
    20  	"github.com/square/finch/workload"
    21  )
    22  
// Stage allocates and runs a workload. It handles stats for the workload,
// including reporting. A stage has a two-phase execute: Prepare to set up
// everything, then Run to execute clients (which execute queries). Run is
// optional; it's not run when --test is specified on the command line.
type Stage struct {
	cfg   config.Stage     // stage config: trx files, workload, runtime, QPS/TPS limits
	gds   *data.Scope      // global data scope passed to trx.Load for data generators
	stats *stats.Collector // nil when stats are disabled (guarded by nil checks in Prepare/Run)
	// --
	doneChan   chan *client.Client      // <-Client.Run()
	execGroups [][]workload.ClientGroup // [n][Client]
}
    35  
    36  func New(cfg config.Stage, gds *data.Scope, stats *stats.Collector) *Stage {
    37  	return &Stage{
    38  		cfg:   cfg,
    39  		gds:   gds,
    40  		stats: stats,
    41  		// --
    42  		doneChan: make(chan *client.Client, 1),
    43  	}
    44  }
    45  
    46  func (s *Stage) Prepare(ctxFinch context.Context) error {
    47  	if len(s.cfg.Trx) == 0 {
    48  		panic("Stage.Prepare called with zero trx")
    49  	}
    50  
    51  	// Test connection to MySQL
    52  	dbconn.SetConfig(s.cfg.MySQL)
    53  	db, dsnRedacted, err := dbconn.Make()
    54  	if err != nil {
    55  		return err
    56  	}
    57  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
    58  	defer cancel()
    59  	if err := db.PingContext(ctx); err != nil {
    60  		return fmt.Errorf("test connection to MySQL failed: %s: %s", dsnRedacted, err)
    61  	}
    62  	db.Close() // test conn
    63  	log.Printf("Connected to %s", dsnRedacted)
    64  
    65  	// Load and validate all config.stage.trx files. This makes and validates all
    66  	// data generators, too. Being valid means only that the Finch config/setup is
    67  	// valid, not the SQL statements because those aren't run yet, so MySQL might
    68  	// still return errors on Run.
    69  	finch.Debug("load trx")
    70  	trxSet, err := trx.Load(s.cfg.Trx, s.gds, s.cfg.Params)
    71  	if err != nil {
    72  		return err
    73  	}
    74  
    75  	// Allocate the workload (config.stage.workload): execution groups, client groups,
    76  	// clients, and trx assigned to clients. This is done in two steps. First, Groups
    77  	// returns the execution groups. Second, Clients returns the ready-to-run clients
    78  	// for each exec group. Both steps are required but separated for testing because
    79  	// the second is complex.
    80  	finch.Debug("alloc clients")
    81  	a := workload.Allocator{
    82  		Stage:     s.cfg.N,
    83  		StageName: s.cfg.Name,
    84  		TrxSet:    trxSet,
    85  		Workload:  s.cfg.Workload,
    86  		StageQPS:  limit.NewRate(finch.Uint(s.cfg.QPS)), // nil if config.stage.qps == 0
    87  		StageTPS:  limit.NewRate(finch.Uint(s.cfg.TPS)), // nil if config.stage.tps == 0
    88  		DoneChan:  s.doneChan,
    89  	}
    90  	groups, err := a.Groups()
    91  	if err != nil {
    92  		return err
    93  	}
    94  	s.execGroups, err = a.Clients(groups, s.stats != nil)
    95  	if err != nil {
    96  		return err
    97  	}
    98  
    99  	// Initialize all clients in all exec groups, and register their stats with
   100  	// the Collector
   101  	finch.Debug("init clients")
   102  	for egNo := range s.execGroups {
   103  		for cgNo := range s.execGroups[egNo] {
   104  			for _, c := range s.execGroups[egNo][cgNo].Clients {
   105  				if err := c.Init(); err != nil {
   106  					return err
   107  				}
   108  				if s.stats != nil {
   109  					s.stats.Watch(c.Stats)
   110  				}
   111  			}
   112  		}
   113  	}
   114  
   115  	return nil
   116  }
   117  
   118  func (s *Stage) Run(ctxFinch context.Context) {
   119  	// There are 3 levels of contexts:
   120  	//
   121  	//   ctxFinch			from startup.Finch, catches CTRL-C
   122  	//   └──ctxStage		config.stage.runtime, stage runtime
   123  	//      └──ctxClients	config.stage.workload.runtime, client group runtime
   124  	//
   125  	// The ctxClients can end before the ctxStage if, for example, a client group
   126  	// is conifgured to run for less than the full stage runtime. Different client
   127  	// groups can also have different runtimes.
   128  	var ctxStage context.Context
   129  	var cancelStage context.CancelFunc
   130  	if s.cfg.Runtime != "" {
   131  		d, _ := time.ParseDuration(s.cfg.Runtime) // already validated
   132  		ctxStage, cancelStage = context.WithDeadline(ctxFinch, time.Now().Add(d))
   133  		defer cancelStage() // stage and all clients
   134  		log.Printf("[%s] Running for %s", s.cfg.Name, s.cfg.Runtime)
   135  	} else {
   136  		ctxStage = ctxFinch
   137  		log.Printf("[%s] Running (no runtime limit)", s.cfg.Name)
   138  	}
   139  
   140  	if s.stats != nil {
   141  		s.stats.Start()
   142  	}
   143  
   144  	if finch.CPUProfile != nil {
   145  		pprof.StartCPUProfile(finch.CPUProfile)
   146  	}
   147  
   148  	for egNo := range s.execGroups { // ------------------------------------- execution groups
   149  		if ctxFinch.Err() != nil {
   150  			break
   151  		}
   152  		nClients := 0
   153  		for cgNo := range s.execGroups[egNo] { // --------------------------- client groups
   154  			log.Printf("[%s] Execution group %d, client group %d, runnning %d clients", s.cfg.Name, egNo+1, cgNo+1, len(s.execGroups[egNo][cgNo].Clients))
   155  			nClients += len(s.execGroups[egNo][cgNo].Clients)
   156  			var ctxClients context.Context
   157  			var cancelClients context.CancelFunc
   158  			if s.execGroups[egNo][cgNo].Runtime > 0 {
   159  				// Client group runtime (plus stage runtime, if any)
   160  				finch.Debug("eg %d/%d exec %s", s.execGroups[egNo][cgNo].Runtime)
   161  				ctxClients, cancelClients = context.WithDeadline(ctxStage, time.Now().Add(s.execGroups[egNo][cgNo].Runtime))
   162  				defer cancelClients()
   163  			} else {
   164  				// Stage runtime limit, if any
   165  				finch.Debug("%d/%d no limit", egNo, cgNo)
   166  				ctxClients = ctxStage
   167  			}
   168  			for _, c := range s.execGroups[egNo][cgNo].Clients { // --------- clients
   169  				go c.Run(ctxClients)
   170  			}
   171  		} // start all clients, then...
   172  
   173  		clientErrors := make([]*client.Client, 0, nClients)
   174  	CLIENTS:
   175  		for nClients > 0 { // wait for clients
   176  			select {
   177  			case c := <-s.doneChan:
   178  				finch.Debug("%s done: %v", c.RunLevel, c.Error)
   179  				nClients -= 1
   180  				if c.Error.Err != nil {
   181  					clientErrors = append(clientErrors, c)
   182  				}
   183  			case <-ctxStage.Done():
   184  				finch.Debug("stage runtime elapsed")
   185  				break CLIENTS
   186  			case <-ctxFinch.Done():
   187  				finch.Debug("finch terminated")
   188  				break CLIENTS
   189  			}
   190  		}
   191  		if nClients > 0 {
   192  			// spinWaitMs gives clients a _little_ time to finish when either
   193  			// context is cancelled. This must be done to avoid a data race in
   194  			// stats reporting: the CLIENTS loop finishes and stats are reported
   195  			// below while a client is still writing to those stats. (This is
   196  			// also due to fact that stats are lock-free.) So when a context is
   197  			// cancelled, start sleeping 1ms and decrementing spinWaitMs which
   198  			// lets this for loop continue (spin) but also timeout quickly.
   199  			finch.Debug("spin wait for %d clients", nClients)
   200  			spinWaitMs := 10
   201  			for spinWaitMs > 0 && nClients > 0 {
   202  				select {
   203  				case c := <-s.doneChan:
   204  					finch.Debug("%s done: %v", c.RunLevel, c.Error)
   205  					nClients -= 1
   206  					if c.Error.Err != nil {
   207  						clientErrors = append(clientErrors, c)
   208  					}
   209  				default:
   210  					time.Sleep(1 * time.Millisecond)
   211  					spinWaitMs -= 1
   212  				}
   213  			}
   214  		}
   215  		if nClients > 0 {
   216  			log.Printf("[%s] WARNING: %d clients did not stop, statistics are not accurate", s.cfg.Name, nClients)
   217  		}
   218  		if len(clientErrors) > 0 {
   219  			log.Printf("%d client errors:\n", len(clientErrors))
   220  			for _, c := range clientErrors {
   221  				log.Printf("  %s: %s (%s)", c.RunLevel.ClientId(), c.Error.Err, c.Statements[c.Error.StatementNo].Query)
   222  			}
   223  		}
   224  	}
   225  
   226  	if finch.CPUProfile != nil {
   227  		pprof.StopCPUProfile()
   228  	}
   229  
   230  	if s.stats != nil {
   231  		if !s.stats.Stop(3*time.Second, ctxFinch.Err() != nil) {
   232  			log.Printf("\n[%s] Timeout waiting for final statistics, reported values are incomplete", s.cfg.Name)
   233  		}
   234  	}
   235  }