github.com/square/finch@v0.0.0-20240412205204-6530c03e2b96/stage/stage.go (about) 1 // Copyright 2024 Block, Inc. 2 3 package stage 4 5 import ( 6 "context" 7 "fmt" 8 "log" 9 "runtime/pprof" 10 "time" 11 12 "github.com/square/finch" 13 "github.com/square/finch/client" 14 "github.com/square/finch/config" 15 "github.com/square/finch/data" 16 "github.com/square/finch/dbconn" 17 "github.com/square/finch/limit" 18 "github.com/square/finch/stats" 19 "github.com/square/finch/trx" 20 "github.com/square/finch/workload" 21 ) 22 23 // Stage allocates and runs a workload. It handles stats for the workload, 24 // including reporting. A stage has a two-phase execute: Prepare to set up 25 // everything, then Run to execute clients (which execute queries). Run is 26 // optional; it's not run when --test is specified on the command line. 27 type Stage struct { 28 cfg config.Stage 29 gds *data.Scope 30 stats *stats.Collector 31 // -- 32 doneChan chan *client.Client // <-Client.Run() 33 execGroups [][]workload.ClientGroup // [n][Client] 34 } 35 36 func New(cfg config.Stage, gds *data.Scope, stats *stats.Collector) *Stage { 37 return &Stage{ 38 cfg: cfg, 39 gds: gds, 40 stats: stats, 41 // -- 42 doneChan: make(chan *client.Client, 1), 43 } 44 } 45 46 func (s *Stage) Prepare(ctxFinch context.Context) error { 47 if len(s.cfg.Trx) == 0 { 48 panic("Stage.Prepare called with zero trx") 49 } 50 51 // Test connection to MySQL 52 dbconn.SetConfig(s.cfg.MySQL) 53 db, dsnRedacted, err := dbconn.Make() 54 if err != nil { 55 return err 56 } 57 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 58 defer cancel() 59 if err := db.PingContext(ctx); err != nil { 60 return fmt.Errorf("test connection to MySQL failed: %s: %s", dsnRedacted, err) 61 } 62 db.Close() // test conn 63 log.Printf("Connected to %s", dsnRedacted) 64 65 // Load and validate all config.stage.trx files. This makes and validates all 66 // data generators, too. Being valid means only that the Finch config/setup is 67 // valid, not the SQL statements because those aren't run yet, so MySQL might 68 // still return errors on Run. 69 finch.Debug("load trx") 70 trxSet, err := trx.Load(s.cfg.Trx, s.gds, s.cfg.Params) 71 if err != nil { 72 return err 73 } 74 75 // Allocate the workload (config.stage.workload): execution groups, client groups, 76 // clients, and trx assigned to clients. This is done in two steps. First, Groups 77 // returns the execution groups. Second, Clients returns the ready-to-run clients 78 // for each exec group. Both steps are required but separated for testing because 79 // the second is complex. 80 finch.Debug("alloc clients") 81 a := workload.Allocator{ 82 Stage: s.cfg.N, 83 StageName: s.cfg.Name, 84 TrxSet: trxSet, 85 Workload: s.cfg.Workload, 86 StageQPS: limit.NewRate(finch.Uint(s.cfg.QPS)), // nil if config.stage.qps == 0 87 StageTPS: limit.NewRate(finch.Uint(s.cfg.TPS)), // nil if config.stage.tps == 0 88 DoneChan: s.doneChan, 89 } 90 groups, err := a.Groups() 91 if err != nil { 92 return err 93 } 94 s.execGroups, err = a.Clients(groups, s.stats != nil) 95 if err != nil { 96 return err 97 } 98 99 // Initialize all clients in all exec groups, and register their stats with 100 // the Collector 101 finch.Debug("init clients") 102 for egNo := range s.execGroups { 103 for cgNo := range s.execGroups[egNo] { 104 for _, c := range s.execGroups[egNo][cgNo].Clients { 105 if err := c.Init(); err != nil { 106 return err 107 } 108 if s.stats != nil { 109 s.stats.Watch(c.Stats) 110 } 111 } 112 } 113 } 114 115 return nil 116 } 117 118 func (s *Stage) Run(ctxFinch context.Context) { 119 // There are 3 levels of contexts: 120 // 121 // ctxFinch from startup.Finch, catches CTRL-C 122 // └──ctxStage config.stage.runtime, stage runtime 123 // └──ctxClients config.stage.workload.runtime, client group runtime 124 // 125 // The ctxClients can end before the ctxStage if, for example, a client group 126 // is conifgured to run for less than the full stage runtime. Different client 127 // groups can also have different runtimes. 128 var ctxStage context.Context 129 var cancelStage context.CancelFunc 130 if s.cfg.Runtime != "" { 131 d, _ := time.ParseDuration(s.cfg.Runtime) // already validated 132 ctxStage, cancelStage = context.WithDeadline(ctxFinch, time.Now().Add(d)) 133 defer cancelStage() // stage and all clients 134 log.Printf("[%s] Running for %s", s.cfg.Name, s.cfg.Runtime) 135 } else { 136 ctxStage = ctxFinch 137 log.Printf("[%s] Running (no runtime limit)", s.cfg.Name) 138 } 139 140 if s.stats != nil { 141 s.stats.Start() 142 } 143 144 if finch.CPUProfile != nil { 145 pprof.StartCPUProfile(finch.CPUProfile) 146 } 147 148 for egNo := range s.execGroups { // ------------------------------------- execution groups 149 if ctxFinch.Err() != nil { 150 break 151 } 152 nClients := 0 153 for cgNo := range s.execGroups[egNo] { // --------------------------- client groups 154 log.Printf("[%s] Execution group %d, client group %d, runnning %d clients", s.cfg.Name, egNo+1, cgNo+1, len(s.execGroups[egNo][cgNo].Clients)) 155 nClients += len(s.execGroups[egNo][cgNo].Clients) 156 var ctxClients context.Context 157 var cancelClients context.CancelFunc 158 if s.execGroups[egNo][cgNo].Runtime > 0 { 159 // Client group runtime (plus stage runtime, if any) 160 finch.Debug("eg %d/%d exec %s", s.execGroups[egNo][cgNo].Runtime) 161 ctxClients, cancelClients = context.WithDeadline(ctxStage, time.Now().Add(s.execGroups[egNo][cgNo].Runtime)) 162 defer cancelClients() 163 } else { 164 // Stage runtime limit, if any 165 finch.Debug("%d/%d no limit", egNo, cgNo) 166 ctxClients = ctxStage 167 } 168 for _, c := range s.execGroups[egNo][cgNo].Clients { // --------- clients 169 go c.Run(ctxClients) 170 } 171 } // start all clients, then... 172 173 clientErrors := make([]*client.Client, 0, nClients) 174 CLIENTS: 175 for nClients > 0 { // wait for clients 176 select { 177 case c := <-s.doneChan: 178 finch.Debug("%s done: %v", c.RunLevel, c.Error) 179 nClients -= 1 180 if c.Error.Err != nil { 181 clientErrors = append(clientErrors, c) 182 } 183 case <-ctxStage.Done(): 184 finch.Debug("stage runtime elapsed") 185 break CLIENTS 186 case <-ctxFinch.Done(): 187 finch.Debug("finch terminated") 188 break CLIENTS 189 } 190 } 191 if nClients > 0 { 192 // spinWaitMs gives clients a _little_ time to finish when either 193 // context is cancelled. This must be done to avoid a data race in 194 // stats reporting: the CLIENTS loop finishes and stats are reported 195 // below while a client is still writing to those stats. (This is 196 // also due to fact that stats are lock-free.) So when a context is 197 // cancelled, start sleeping 1ms and decrementing spinWaitMs which 198 // lets this for loop continue (spin) but also timeout quickly. 199 finch.Debug("spin wait for %d clients", nClients) 200 spinWaitMs := 10 201 for spinWaitMs > 0 && nClients > 0 { 202 select { 203 case c := <-s.doneChan: 204 finch.Debug("%s done: %v", c.RunLevel, c.Error) 205 nClients -= 1 206 if c.Error.Err != nil { 207 clientErrors = append(clientErrors, c) 208 } 209 default: 210 time.Sleep(1 * time.Millisecond) 211 spinWaitMs -= 1 212 } 213 } 214 } 215 if nClients > 0 { 216 log.Printf("[%s] WARNING: %d clients did not stop, statistics are not accurate", s.cfg.Name, nClients) 217 } 218 if len(clientErrors) > 0 { 219 log.Printf("%d client errors:\n", len(clientErrors)) 220 for _, c := range clientErrors { 221 log.Printf(" %s: %s (%s)", c.RunLevel.ClientId(), c.Error.Err, c.Statements[c.Error.StatementNo].Query) 222 } 223 } 224 } 225 226 if finch.CPUProfile != nil { 227 pprof.StopCPUProfile() 228 } 229 230 if s.stats != nil { 231 if !s.stats.Stop(3*time.Second, ctxFinch.Err() != nil) { 232 log.Printf("\n[%s] Timeout waiting for final statistics, reported values are incomplete", s.cfg.Name) 233 } 234 } 235 }