vitess.io/vitess@v0.16.2/go/test/endtoend/onlineddl/vrepl_stress/onlineddl_vrepl_mini_stress_test.go

/*
Copyright 2019 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package vreplstress

import (
	"context"
	"flag"
	"fmt"
	"math/rand"
	"os"
	"path"
	"strings"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"vitess.io/vitess/go/mysql"
	"vitess.io/vitess/go/vt/log"
	"vitess.io/vitess/go/vt/schema"

	"vitess.io/vitess/go/test/endtoend/cluster"
	"vitess.io/vitess/go/test/endtoend/onlineddl"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

type WriteMetrics struct {
	mu                                                      sync.Mutex
	insertsAttempts, insertsFailures, insertsNoops, inserts int64
	updatesAttempts, updatesFailures, updatesNoops, updates int64
	deletesAttempts, deletesFailures, deletesNoops, deletes int64
}

func (w *WriteMetrics) Clear() {
	w.mu.Lock()
	defer w.mu.Unlock()

	w.inserts = 0
	w.updates = 0
	w.deletes = 0

	w.insertsAttempts = 0
	w.insertsFailures = 0
	w.insertsNoops = 0

	w.updatesAttempts = 0
	w.updatesFailures = 0
	w.updatesNoops = 0

	w.deletesAttempts = 0
	w.deletesFailures = 0
	w.deletesNoops = 0
}

func (w *WriteMetrics) String() string {
	return fmt.Sprintf(`WriteMetrics: inserts-deletes=%d, updates-deletes=%d,
insertsAttempts=%d, insertsFailures=%d, insertsNoops=%d, inserts=%d,
updatesAttempts=%d, updatesFailures=%d, updatesNoops=%d, updates=%d,
deletesAttempts=%d, deletesFailures=%d, deletesNoops=%d, deletes=%d,
`,
		w.inserts-w.deletes, w.updates-w.deletes,
		w.insertsAttempts, w.insertsFailures, w.insertsNoops, w.inserts,
		w.updatesAttempts, w.updatesFailures, w.updatesNoops, w.updates,
		w.deletesAttempts, w.deletesFailures, w.deletesNoops, w.deletes,
	)
}

var (
	clusterInstance *cluster.LocalProcessCluster
	shards          []cluster.Shard
	vtParams        mysql.ConnParams

	opOrder               int64
	opOrderMutex          sync.Mutex
	onlineDDLStrategy     = "vitess"
	hostname              = "localhost"
	keyspaceName          = "ks"
	cell                  = "zone1"
	schemaChangeDirectory = ""
	tableName             = `stress_test`
	cleanupStatements     = []string{
		`DROP TABLE IF EXISTS stress_test`,
	}
	createStatement = `
		CREATE TABLE stress_test (
			id bigint(20) not null,
			rand_val varchar(32) null default '',
			op_order bigint unsigned not null default 0,
			hint_col varchar(64) not null default '',
			created_timestamp timestamp not null default current_timestamp,
			updates int unsigned not null default 0,
			PRIMARY KEY (id),
			key created_idx(created_timestamp),
			key updates_idx(updates)
		) ENGINE=InnoDB
	`
	alterHintStatement = `
		ALTER TABLE stress_test modify hint_col varchar(64) not null default '%s'
	`
	insertRowStatement = `
		INSERT IGNORE INTO stress_test (id, rand_val, op_order) VALUES (%d, left(md5(rand()), 8), %d)
	`
	updateRowStatement = `
		UPDATE stress_test SET op_order=%d, updates=updates+1 WHERE id=%d
	`
	deleteRowStatement = `
		DELETE FROM stress_test WHERE id=%d AND updates=1
	`
	selectMaxOpOrder = `
		SELECT MAX(op_order) as m FROM stress_test
	`
	// We use CAST(SUM(updates) AS SIGNED) because SUM() returns a DECIMAL datatype, and we want to read a SIGNED INTEGER type
	selectCountRowsStatement = `
		SELECT COUNT(*) AS num_rows, CAST(SUM(updates) AS SIGNED) AS sum_updates FROM stress_test
	`
	truncateStatement = `
		TRUNCATE TABLE stress_test
	`
	writeMetrics WriteMetrics
)

const (
	maxTableRows                  = 4096
	maxConcurrency                = 20
	singleConnectionSleepInterval = 2 * time.Millisecond
	countIterations               = 5
	migrationWaitTimeout          = 60 * time.Second
)

func resetOpOrder() {
	opOrderMutex.Lock()
	defer opOrderMutex.Unlock()
	opOrder = 0
}

func nextOpOrder() int64 {
	opOrderMutex.Lock()
	defer opOrderMutex.Unlock()
	opOrder++
	return opOrder
}

func TestMain(m *testing.M) {
	defer cluster.PanicHandler(nil)
	flag.Parse()

	exitcode, err := func() (int, error) {
		clusterInstance = cluster.NewCluster(cell, hostname)
		schemaChangeDirectory = path.Join("/tmp", fmt.Sprintf("schema_change_dir_%d", clusterInstance.GetAndReserveTabletUID()))
		defer os.RemoveAll(schemaChangeDirectory)
		defer clusterInstance.Teardown()

		if _, err := os.Stat(schemaChangeDirectory); os.IsNotExist(err) {
			_ = os.Mkdir(schemaChangeDirectory, 0700)
		}

		clusterInstance.VtctldExtraArgs = []string{
			"--schema_change_dir", schemaChangeDirectory,
			"--schema_change_controller", "local",
			"--schema_change_check_interval", "1",
		}

		clusterInstance.VtTabletExtraArgs = []string{
			"--enable-lag-throttler",
			"--throttle_threshold", "1s",
			"--heartbeat_enable",
			"--heartbeat_interval", "250ms",
			"--heartbeat_on_demand_duration", "5s",
			"--migration_check_interval", "5s",
			"--watch_replication_stream",
		}
		clusterInstance.VtGateExtraArgs = []string{
			"--ddl_strategy", "online",
		}

		if err := clusterInstance.StartTopo(); err != nil {
			return 1, err
		}

		// Start keyspace
		keyspace := &cluster.Keyspace{
			Name: keyspaceName,
		}

		// No need for replicas in this stress test
		if err := clusterInstance.StartKeyspace(*keyspace, []string{"1"}, 0, false); err != nil {
			return 1, err
		}

		vtgateInstance := clusterInstance.NewVtgateInstance()
		// Start vtgate
		if err := vtgateInstance.Setup(); err != nil {
			return 1, err
		}
		// ensure it is torn down during cluster TearDown
		clusterInstance.VtgateProcess = *vtgateInstance
		vtParams = mysql.ConnParams{
			Host: clusterInstance.Hostname,
			Port: clusterInstance.VtgateMySQLPort,
		}

		return m.Run(), nil
	}()
	if err != nil {
		fmt.Printf("%v\n", err)
		os.Exit(1)
	} else {
		os.Exit(exitcode)
	}
}

func TestSchemaChange(t *testing.T) {
	defer cluster.PanicHandler(t)

	shards = clusterInstance.Keyspaces[0].Shards
	require.Equal(t, 1, len(shards))

	t.Run("create schema", func(t *testing.T) {
		assert.Equal(t, 1, len(clusterInstance.Keyspaces[0].Shards))
		testWithInitialSchema(t)
	})
	for i := 0; i < countIterations; i++ {
		// This first tests the general functionality of initializing the table with data,
		// no concurrency involved. Just counting.
		testName := fmt.Sprintf("init table %d/%d", (i + 1), countIterations)
		t.Run(testName, func(t *testing.T) {
			initTable(t)
			testSelectTableMetrics(t)
		})
	}
	for i := 0; i < countIterations; i++ {
		// This tests running a workload on the table, then comparing expected metrics with
		// actual table metrics. All this without any ALTER TABLE: this is to validate
		// that our testing/metrics logic is sound in the first place.
		testName := fmt.Sprintf("workload without ALTER TABLE %d/%d", (i + 1), countIterations)
		t.Run(testName, func(t *testing.T) {
			ctx, cancel := context.WithCancel(context.Background())
			initTable(t)
			var wg sync.WaitGroup
			wg.Add(1)
			go func() {
				defer wg.Done()
				runMultipleConnections(ctx, t)
			}()
			time.Sleep(5 * time.Second)
			cancel() // will cause runMultipleConnections() to terminate
			wg.Wait()
			testSelectTableMetrics(t)
		})
	}
	t.Run("ALTER TABLE without workload", func(t *testing.T) {
		// A single ALTER TABLE. Generally this is covered in endtoend/onlineddl_vrepl,
		// but we wish to verify the ALTER statement used in these tests is sound
		testWithInitialSchema(t)
		initTable(t)
		hint := "hint-alter-without-workload"
		uuid := testOnlineDDLStatement(t, fmt.Sprintf(alterHintStatement, hint), onlineDDLStrategy, "vtgate", hint)
		onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusComplete)
		testSelectTableMetrics(t)
	})

	for i := 0; i < countIterations; i++ {
		// Finally, this is the real test:
		// We populate a table, and begin a concurrent workload (this is the "mini stress")
		// We then ALTER TABLE via vreplication.
		// Once convinced ALTER TABLE is complete, we stop the workload.
		// We then compare expected metrics with table metrics. If they agree, then
		// the vreplication/ALTER TABLE did not corrupt our data and we are happy.
		testName := fmt.Sprintf("ALTER TABLE with workload %d/%d", (i + 1), countIterations)
		t.Run(testName, func(t *testing.T) {
			ctx, cancel := context.WithCancel(context.Background())
			t.Run("create schema", func(t *testing.T) {
				testWithInitialSchema(t)
			})
			t.Run("init table", func(t *testing.T) {
				initTable(t)
			})
			t.Run("migrate", func(t *testing.T) {
				var wg sync.WaitGroup
				wg.Add(1)
				go func() {
					defer wg.Done()
					runMultipleConnections(ctx, t)
				}()
				hint := fmt.Sprintf("hint-alter-with-workload-%d", i)
				uuid := testOnlineDDLStatement(t, fmt.Sprintf(alterHintStatement, hint), onlineDDLStrategy, "vtgate", hint)
				onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusComplete)
				cancel() // will cause runMultipleConnections() to terminate
				wg.Wait()
			})
			t.Run("validate metrics", func(t *testing.T) {
				testSelectTableMetrics(t)
			})
		})
	}

	t.Run("summary: validate sequential migration IDs", func(t *testing.T) {
		onlineddl.ValidateSequentialMigrationIDs(t, &vtParams, shards)
	})
}

func testWithInitialSchema(t *testing.T) {
	for _, statement := range cleanupStatements {
		err := clusterInstance.VtctlclientProcess.ApplySchema(keyspaceName, statement)
		require.Nil(t, err)
	}
	// Create the stress table
	err := clusterInstance.VtctlclientProcess.ApplySchema(keyspaceName, createStatement)
	require.Nil(t, err)

	// Check if table is created
	checkTable(t, tableName)
}

// testOnlineDDLStatement runs an online DDL ALTER statement and returns the migration's UUID
func testOnlineDDLStatement(t *testing.T, alterStatement string, ddlStrategy string, executeStrategy string, expectHint string) (uuid string) {
	if executeStrategy == "vtgate" {
		row := onlineddl.VtgateExecDDL(t, &vtParams, ddlStrategy, alterStatement, "").Named().Row()
		if row != nil {
			uuid = row.AsString("uuid", "")
		}
	} else {
		var err error
		uuid, err = clusterInstance.VtctlclientProcess.ApplySchemaWithOutput(keyspaceName, alterStatement, cluster.VtctlClientParams{DDLStrategy: ddlStrategy})
		assert.NoError(t, err)
	}
	uuid = strings.TrimSpace(uuid)
	fmt.Println("# Generated UUID (for debug purposes):")
	fmt.Printf("<%s>\n", uuid)

	strategySetting, err := schema.ParseDDLStrategy(ddlStrategy)
	assert.NoError(t, err)

	if !strategySetting.Strategy.IsDirect() {
		status := onlineddl.WaitForMigrationStatus(t, &vtParams, shards, uuid, migrationWaitTimeout, schema.OnlineDDLStatusComplete, schema.OnlineDDLStatusFailed)
		fmt.Printf("# Migration status (for debug purposes): <%s>\n", status)
	}

	if expectHint != "" {
		checkMigratedTable(t, tableName, expectHint)
	}
	return uuid
}

// checkTable checks that the given table exists on the first tablet of every shard
func checkTable(t *testing.T, showTableName string) {
	for i := range clusterInstance.Keyspaces[0].Shards {
		checkTablesCount(t, clusterInstance.Keyspaces[0].Shards[i].Vttablets[0], showTableName, 1)
	}
}

// checkTablesCount checks the number of tables in the given tablet
func checkTablesCount(t *testing.T, tablet *cluster.Vttablet, showTableName string, expectCount int) {
	query := fmt.Sprintf(`show tables like '%%%s%%';`, showTableName)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	rowcount := 0
	for {
		queryResult, err := tablet.VttabletProcess.QueryTablet(query, keyspaceName, true)
		require.Nil(t, err)
		rowcount = len(queryResult.Rows)
		if rowcount > 0 {
			break
		}

		select {
		case <-time.After(time.Second):
			continue // retry the query
		case <-ctx.Done():
			// timed out waiting for the table to appear; fall through to the assertion below
		}
		break
	}
	assert.Equal(t, expectCount, rowcount)
}

// checkMigratedTable checks the CREATE STATEMENT of a table after migration
func checkMigratedTable(t *testing.T, tableName, expectHint string) {
	for i := range clusterInstance.Keyspaces[0].Shards {
		createStatement := getCreateTableStatement(t, clusterInstance.Keyspaces[0].Shards[i].Vttablets[0], tableName)
		assert.Contains(t, createStatement, expectHint)
	}
}

// getCreateTableStatement returns the CREATE TABLE statement for a given table
func getCreateTableStatement(t *testing.T, tablet *cluster.Vttablet, tableName string) (statement string) {
	queryResult, err := tablet.VttabletProcess.QueryTablet(fmt.Sprintf("show create table %s;", tableName), keyspaceName, true)
	require.Nil(t, err)

	assert.Equal(t, len(queryResult.Rows), 1)
	assert.Equal(t, len(queryResult.Rows[0]), 2) // table name, create statement
	statement = queryResult.Rows[0][1].ToString()
	return statement
}

// generateInsert attempts to INSERT IGNORE a row with a random id and records the outcome in writeMetrics
func generateInsert(t *testing.T, conn *mysql.Conn) error {
	id := rand.Int31n(int32(maxTableRows))
	query := fmt.Sprintf(insertRowStatement, id, nextOpOrder())
	qr, err := conn.ExecuteFetch(query, 1000, true)

	func() {
		writeMetrics.mu.Lock()
		defer writeMetrics.mu.Unlock()

		writeMetrics.insertsAttempts++
		if err != nil {
			writeMetrics.insertsFailures++
			return
		}
		assert.Less(t, qr.RowsAffected, uint64(2))
		if qr.RowsAffected == 0 {
			writeMetrics.insertsNoops++
			return
		}
		writeMetrics.inserts++
	}()
	return err
}

// generateUpdate updates a row with a random id, if it exists, and records the outcome in writeMetrics
func generateUpdate(t *testing.T, conn *mysql.Conn) error {
	id := rand.Int31n(int32(maxTableRows))
	query := fmt.Sprintf(updateRowStatement, nextOpOrder(), id)
	qr, err := conn.ExecuteFetch(query, 1000, true)

	func() {
		writeMetrics.mu.Lock()
		defer writeMetrics.mu.Unlock()

		writeMetrics.updatesAttempts++
		if err != nil {
			writeMetrics.updatesFailures++
			return
		}
		assert.Less(t, qr.RowsAffected, uint64(2))
		if qr.RowsAffected == 0 {
			writeMetrics.updatesNoops++
			return
		}
		writeMetrics.updates++
	}()
	return err
}

// generateDelete deletes a row with a random id, but only if it has exactly one update, and records the outcome in writeMetrics
func generateDelete(t *testing.T, conn *mysql.Conn) error {
	id := rand.Int31n(int32(maxTableRows))
	query := fmt.Sprintf(deleteRowStatement, id)
	qr, err := conn.ExecuteFetch(query, 1000, true)

	func() {
		writeMetrics.mu.Lock()
		defer writeMetrics.mu.Unlock()

		writeMetrics.deletesAttempts++
		if err != nil {
			writeMetrics.deletesFailures++
			return
		}
		assert.Less(t, qr.RowsAffected, uint64(2))
		if qr.RowsAffected == 0 {
			writeMetrics.deletesNoops++
			return
		}
		writeMetrics.deletes++
	}()
	return err
}

// runSingleConnection keeps issuing random INSERT/UPDATE/DELETE statements over a single connection until *done is set to 1
func runSingleConnection(ctx context.Context, t *testing.T, done *int64) {
	log.Infof("Running single connection")
	conn, err := mysql.Connect(ctx, &vtParams)
	require.Nil(t, err)
	defer conn.Close()

	_, err = conn.ExecuteFetch("set autocommit=1", 1000, true)
	require.Nil(t, err)
	_, err = conn.ExecuteFetch("set transaction isolation level read committed", 1000, true)
	require.Nil(t, err)

	for {
		if atomic.LoadInt64(done) == 1 {
			log.Infof("Terminating single connection")
			return
		}
		switch rand.Int31n(3) {
		case 0:
			err = generateInsert(t, conn)
		case 1:
			err = generateUpdate(t, conn)
		case 2:
			err = generateDelete(t, conn)
		}
		assert.Nil(t, err)
		time.Sleep(singleConnectionSleepInterval)
	}
}

// runMultipleConnections starts maxConcurrency workers, each running runSingleConnection, and signals them to stop once ctx is cancelled
func runMultipleConnections(ctx context.Context, t *testing.T) {
	log.Infof("Running multiple connections")
	var done int64
	var wg sync.WaitGroup
	for i := 0; i < maxConcurrency; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			runSingleConnection(ctx, t, &done)
		}()
	}
	<-ctx.Done()
	atomic.StoreInt64(&done, 1)
	log.Infof("Running multiple connections: done")
	wg.Wait()
	log.Infof("All connections cancelled")
}

// initTable cancels any pending migrations, truncates the test table, and seeds it with an initial set of inserts, updates and deletes
func initTable(t *testing.T) {
	log.Infof("initTable begin")
	defer log.Infof("initTable complete")

	t.Run("cancel pending migrations", func(t *testing.T) {
		cancelQuery := "alter vitess_migration cancel all"
		r := onlineddl.VtgateExecQuery(t, &vtParams, cancelQuery, "")
		if r.RowsAffected > 0 {
			fmt.Printf("# Cancelled migrations (for debug purposes): %d\n", r.RowsAffected)
		}
	})

	ctx := context.Background()
	conn, err := mysql.Connect(ctx, &vtParams)
	require.Nil(t, err)
	defer conn.Close()

	resetOpOrder()
	writeMetrics.Clear()
	_, err = conn.ExecuteFetch(truncateStatement, 1000, true)
	require.Nil(t, err)

	for i := 0; i < maxTableRows/2; i++ {
		generateInsert(t, conn)
	}
	for i := 0; i < maxTableRows/4; i++ {
		generateUpdate(t, conn)
	}
	for i := 0; i < maxTableRows/4; i++ {
		generateDelete(t, conn)
	}
}

// testSelectTableMetrics cross-checks the table's actual row count and sum of updates against the counters accumulated in writeMetrics
func testSelectTableMetrics(t *testing.T) {
	writeMetrics.mu.Lock()
	defer writeMetrics.mu.Unlock()

	{
		rs := onlineddl.VtgateExecQuery(t, &vtParams, selectMaxOpOrder, "")
		row := rs.Named().Row()
		require.NotNil(t, row)

		maxOpOrder := row.AsInt64("m", 0)
		fmt.Printf("# max op_order in table: %d\n", maxOpOrder)
	}

	log.Infof("%s", writeMetrics.String())

	ctx := context.Background()
	conn, err := mysql.Connect(ctx, &vtParams)
	require.Nil(t, err)
	defer conn.Close()

	rs, err := conn.ExecuteFetch(selectCountRowsStatement, 1000, true)
	require.Nil(t, err)

	row := rs.Named().Row()
	require.NotNil(t, row)
	log.Infof("testSelectTableMetrics, row: %v", row)
	numRows := row.AsInt64("num_rows", 0)
	sumUpdates := row.AsInt64("sum_updates", 0)
	assert.NotZero(t, numRows)
	assert.NotZero(t, sumUpdates)
	assert.NotZero(t, writeMetrics.inserts)
	assert.NotZero(t, writeMetrics.deletes)
	assert.NotZero(t, writeMetrics.updates)
	assert.Equal(t, writeMetrics.inserts-writeMetrics.deletes, numRows)
	assert.Equal(t, writeMetrics.updates-writeMetrics.deletes, sumUpdates) // because we DELETE WHERE updates=1
}