vitess.io/vitess@v0.16.2/go/test/endtoend/tabletgateway/buffer/buffer_test_helpers.go (about) 1 /* 2 Copyright 2020 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 /* 18 Test the vtgate buffering. 19 20 During a failover, vtgate should automatically buffer (stall) requests 21 for a configured time and retry them after the failover is over. 22 23 The test reproduces such a scenario as follows: 24 - run two threads, the first thread continuously executes a critical read and the second executes a write (UPDATE) 25 - vtctl PlannedReparentShard runs a failover 26 - both threads should not see any error during the failover 27 */ 28 29 package buffer 30 31 import ( 32 "context" 33 "encoding/json" 34 "fmt" 35 "io" 36 "math/rand" 37 "net/http" 38 "sync" 39 "testing" 40 "time" 41 42 "vitess.io/vitess/go/test/endtoend/utils" 43 44 "vitess.io/vitess/go/vt/log" 45 46 "github.com/stretchr/testify/assert" 47 "github.com/stretchr/testify/require" 48 49 "vitess.io/vitess/go/mysql" 50 "vitess.io/vitess/go/test/endtoend/cluster" 51 ) 52 53 const ( 54 keyspaceUnshardedName = "ks1" 55 cell = "zone1" 56 hostname = "localhost" 57 sqlSchema = ` 58 create table buffer( 59 id BIGINT NOT NULL, 60 msg VARCHAR(64) NOT NULL, 61 PRIMARY KEY (id) 62 ) Engine=InnoDB;` 63 ) 64 65 const ( 66 criticalReadRowID = 1 67 updateRowID = 2 68 ) 69 70 // threadParams is set of params passed into read and write threads 71 type threadParams struct { 72 quit bool 73 rpcs int // Number of queries successfully executed. 74 errors int // Number of failed queries. 75 waitForNotification chan bool // Channel used to notify the main thread that this thread executed 76 notifyLock sync.Mutex // notifyLock guards the two fields notifyAfterNSuccessfulRpcs/rpcsSoFar. 77 notifyAfterNSuccessfulRpcs int // If 0, notifications are disabled 78 rpcsSoFar int // Number of RPCs at the time a notification was requested 79 index int // 80 internalErrs int 81 executeFunction func(c *threadParams, conn *mysql.Conn) error // Implement the method for read/update. 82 typ string 83 reservedConn bool 84 slowQueries bool 85 } 86 87 // Thread which constantly executes a query on vtgate. 88 func (c *threadParams) threadRun(wg *sync.WaitGroup, vtParams *mysql.ConnParams) { 89 defer wg.Done() 90 91 conn, err := mysql.Connect(context.Background(), vtParams) 92 if err != nil { 93 log.Errorf("error connecting to mysql with params %v: %v", vtParams, err) 94 } 95 defer conn.Close() 96 if c.reservedConn { 97 _, err = conn.ExecuteFetch("set default_week_format = 1", 1000, true) 98 if err != nil { 99 c.errors++ 100 log.Errorf("error setting default_week_format: %v", err) 101 } 102 } 103 for !c.quit { 104 err = c.executeFunction(c, conn) 105 if err != nil { 106 c.errors++ 107 log.Errorf("error executing function %s: %v", c.typ, err) 108 } 109 c.rpcs++ 110 // If notifications are requested, check if we already executed the 111 // required number of successful RPCs. 112 // Use >= instead of == because we can miss the exact point due to 113 // slow thread scheduling. 114 c.notifyLock.Lock() 115 if c.notifyAfterNSuccessfulRpcs != 0 && c.rpcs >= (c.notifyAfterNSuccessfulRpcs+c.rpcsSoFar) { 116 c.waitForNotification <- true 117 c.notifyAfterNSuccessfulRpcs = 0 118 } 119 c.notifyLock.Unlock() 120 // Wait 10ms seconds between two attempts. 121 time.Sleep(10 * time.Millisecond) 122 } 123 } 124 125 func (c *threadParams) ExpectQueries(n int) { 126 c.notifyLock.Lock() 127 c.notifyAfterNSuccessfulRpcs = n 128 c.rpcsSoFar = c.rpcs 129 c.notifyLock.Unlock() 130 } 131 132 func (c *threadParams) stop() { 133 c.quit = true 134 } 135 136 func readExecute(c *threadParams, conn *mysql.Conn) error { 137 attempt := c.index 138 c.index++ 139 140 sel := "*" 141 if c.slowQueries { 142 sel = "*, SLEEP(1)" 143 } 144 qr, err := conn.ExecuteFetch(fmt.Sprintf("SELECT %s FROM buffer WHERE id = %d", sel, criticalReadRowID), 1000, true) 145 146 if err != nil { 147 log.Errorf("select attempt #%d, failed with err: %v", attempt, err) 148 // For a reserved connection, read query can fail as it does not go through the gateway and 149 // goes to tablet directly and later is directed to use Gateway if the error is caused due to cluster failover operation. 150 if c.reservedConn { 151 c.internalErrs++ 152 if c.internalErrs > 1 { 153 log.Errorf("More Read Errors: %d", c.internalErrs) 154 return err 155 } 156 log.Error("This is okay once because we do not support buffering it.") 157 return nil 158 } 159 return err 160 } 161 162 log.Infof("select attempt #%d, rows: %d", attempt, len(qr.Rows)) 163 return nil 164 } 165 166 func updateExecute(c *threadParams, conn *mysql.Conn) error { 167 attempt := c.index 168 // Value used in next UPDATE query. Increased after every query. 169 c.index++ 170 conn.ExecuteFetch("begin", 1000, true) 171 172 result, err := conn.ExecuteFetch(fmt.Sprintf("UPDATE buffer SET msg='update %d' WHERE id = %d", attempt, updateRowID), 1000, true) 173 174 // Sleep between [0, 1] seconds to prolong the time the transaction is in 175 // flight. This is more realistic because applications are going to keep 176 // their transactions open for longer as well. 177 dur := time.Duration(rand.Int31n(1000)) * time.Millisecond 178 if c.slowQueries { 179 dur = dur + 1*time.Second 180 } 181 time.Sleep(dur) 182 183 if err == nil { 184 log.Infof("update attempt #%d affected %v rows", attempt, result.RowsAffected) 185 _, err = conn.ExecuteFetch("commit", 1000, true) 186 if err != nil { 187 log.Errorf("UPDATE #%d failed during COMMIT, err: %v", attempt, err) 188 _, errRollback := conn.ExecuteFetch("rollback", 1000, true) 189 if errRollback != nil { 190 log.Errorf("Error in rollback #%d: %v", attempt, errRollback) 191 } 192 c.internalErrs++ 193 if c.internalErrs > 1 { 194 log.Errorf("More Commit Errors: %d", c.internalErrs) 195 return err 196 } 197 log.Error("This is okay once because we do not support buffering it.") 198 } 199 return nil 200 } 201 log.Errorf("UPDATE #%d failed with err: %v", attempt, err) 202 _, errRollback := conn.ExecuteFetch("rollback", 1000, true) 203 if errRollback != nil { 204 log.Errorf("Error in rollback #%d: %v", attempt, errRollback) 205 } 206 c.internalErrs++ 207 if c.internalErrs > 1 { 208 log.Errorf("More Rollback Errors: %d", c.internalErrs) 209 return err 210 } 211 log.Error("This is okay once because we do not support buffering it.") 212 return nil 213 } 214 215 func (bt *BufferingTest) createCluster() (*cluster.LocalProcessCluster, int) { 216 clusterInstance := cluster.NewCluster(cell, hostname) 217 218 // Start topo server 219 clusterInstance.VtctldExtraArgs = []string{"--remote_operation_timeout", "30s", "--topo_etcd_lease_ttl", "40"} 220 if err := clusterInstance.StartTopo(); err != nil { 221 return nil, 1 222 } 223 224 // Start keyspace 225 keyspace := &cluster.Keyspace{ 226 Name: keyspaceUnshardedName, 227 SchemaSQL: sqlSchema, 228 VSchema: bt.VSchema, 229 } 230 clusterInstance.VtTabletExtraArgs = []string{ 231 "--health_check_interval", "1s", 232 "--queryserver-config-transaction-timeout", "20", 233 } 234 if err := clusterInstance.StartUnshardedKeyspace(*keyspace, 1, false); err != nil { 235 return nil, 1 236 } 237 238 clusterInstance.VtGateExtraArgs = []string{ 239 "--enable_buffer", 240 // Long timeout in case failover is slow. 241 "--buffer_window", "10m", 242 "--buffer_max_failover_duration", "10m", 243 "--buffer_min_time_between_failovers", "20m", 244 "--buffer_implementation", "keyspace_events", 245 "--tablet_refresh_interval", "1s", 246 } 247 clusterInstance.VtGateExtraArgs = append(clusterInstance.VtGateExtraArgs, bt.VtGateExtraArgs...) 248 249 // Start vtgate 250 clusterInstance.VtGatePlannerVersion = 0 251 if err := clusterInstance.StartVtgate(); err != nil { 252 return nil, 1 253 } 254 rand.Seed(time.Now().UnixNano()) 255 return clusterInstance, 0 256 } 257 258 type QueryEngine interface { 259 ExpectQueries(count int) 260 } 261 262 type BufferingTest struct { 263 Assert func(t *testing.T, shard string, stats *VTGateBufferingStats) 264 Failover func(t *testing.T, cluster *cluster.LocalProcessCluster, keyspace string, reads, writes QueryEngine) 265 266 ReserveConn bool 267 SlowQueries bool 268 269 VSchema string 270 VtGateExtraArgs []string 271 272 wg sync.WaitGroup 273 } 274 275 func (bt *BufferingTest) Test(t *testing.T) { 276 defer cluster.PanicHandler(t) 277 clusterInstance, exitCode := bt.createCluster() 278 if exitCode != 0 { 279 t.Fatal("failed to start cluster") 280 } 281 defer clusterInstance.Teardown() 282 283 vtParams := mysql.ConnParams{ 284 Host: clusterInstance.Hostname, 285 Port: clusterInstance.VtgateMySQLPort, 286 } 287 288 // Healthcheck interval on tablet is set to 1s, so sleep for 2s 289 time.Sleep(2 * time.Second) 290 conn, err := mysql.Connect(context.Background(), &vtParams) 291 require.NoError(t, err) 292 defer conn.Close() 293 294 // Insert two rows for the later threads (critical read, update). 295 utils.Exec(t, conn, fmt.Sprintf("INSERT INTO buffer (id, msg) VALUES (%d, %s)", criticalReadRowID, "'critical read'")) 296 utils.Exec(t, conn, fmt.Sprintf("INSERT INTO buffer (id, msg) VALUES (%d, %s)", updateRowID, "'update'")) 297 298 // Start both threads. 299 readThreadInstance := &threadParams{ 300 index: 1, 301 typ: "read", 302 executeFunction: readExecute, 303 waitForNotification: make(chan bool), 304 reservedConn: bt.ReserveConn, 305 slowQueries: bt.SlowQueries, 306 } 307 bt.wg.Add(1) 308 go readThreadInstance.threadRun(&bt.wg, &vtParams) 309 updateThreadInstance := &threadParams{ 310 index: 1, 311 typ: "write", 312 executeFunction: updateExecute, 313 waitForNotification: make(chan bool), 314 reservedConn: bt.ReserveConn, 315 slowQueries: bt.SlowQueries, 316 } 317 bt.wg.Add(1) 318 go updateThreadInstance.threadRun(&bt.wg, &vtParams) 319 320 // Verify they got at least 2 RPCs through. 321 readThreadInstance.ExpectQueries(2) 322 updateThreadInstance.ExpectQueries(2) 323 324 <-readThreadInstance.waitForNotification 325 <-updateThreadInstance.waitForNotification 326 327 bt.Failover(t, clusterInstance, keyspaceUnshardedName, readThreadInstance, updateThreadInstance) 328 329 timeout := time.After(120 * time.Second) 330 select { 331 case <-readThreadInstance.waitForNotification: 332 case <-timeout: 333 timeout = time.After(100 * time.Millisecond) 334 log.Error("failed to get read thread notification") 335 } 336 select { 337 case <-updateThreadInstance.waitForNotification: 338 case <-timeout: 339 log.Error("failed to get update thread notification") 340 } 341 342 // Stop threads 343 readThreadInstance.stop() 344 updateThreadInstance.stop() 345 346 // Both threads must not see any error 347 assert.Zero(t, readThreadInstance.errors, "found errors in read queries") 348 assert.Zero(t, updateThreadInstance.errors, "found errors in tx queries") 349 350 //At least one thread should have been buffered. 351 //This may fail if a failover is too fast. Add retries then. 352 resp, err := http.Get(clusterInstance.VtgateProcess.VerifyURL) 353 require.NoError(t, err) 354 defer resp.Body.Close() 355 356 require.Equal(t, 200, resp.StatusCode) 357 358 var metadata VTGateBufferingStats 359 respByte, err := io.ReadAll(resp.Body) 360 require.NoError(t, err) 361 err = json.Unmarshal(respByte, &metadata) 362 require.NoError(t, err) 363 364 label := fmt.Sprintf("%s.%s", keyspaceUnshardedName, "0") 365 if metadata.BufferLastRequestsInFlightMax[label] == 0 { 366 // Missed buffering is okay when we observed the failover during the 367 // COMMIT (which cannot trigger the buffering). 368 assert.Greater(t, updateThreadInstance.internalErrs, 0, "No buffering took place and the update thread saw no error during COMMIT. But one of it must happen.") 369 } else { 370 bt.Assert(t, label, &metadata) 371 } 372 373 bt.wg.Wait() 374 } 375 376 type VTGateBufferingStats struct { 377 BufferLastRequestsInFlightMax map[string]int 378 HealthcheckPrimaryPromoted map[string]int 379 BufferFailoverDurationSumMs map[string]int 380 BufferRequestsBuffered map[string]int 381 BufferStops map[string]int 382 }