github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/txn_restart_test.go (about) 1 // Copyright 2016 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package sql_test 12 13 import ( 14 "bytes" 15 "context" 16 gosql "database/sql" 17 "fmt" 18 "net/url" 19 "regexp" 20 "strconv" 21 "strings" 22 "sync/atomic" 23 "testing" 24 "time" 25 26 "github.com/cockroachdb/cockroach/pkg/base" 27 "github.com/cockroachdb/cockroach/pkg/kv" 28 "github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord" 29 "github.com/cockroachdb/cockroach/pkg/kv/kvserver" 30 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" 31 "github.com/cockroachdb/cockroach/pkg/roachpb" 32 "github.com/cockroachdb/cockroach/pkg/security" 33 "github.com/cockroachdb/cockroach/pkg/server" 34 "github.com/cockroachdb/cockroach/pkg/sql" 35 "github.com/cockroachdb/cockroach/pkg/sql/tests" 36 "github.com/cockroachdb/cockroach/pkg/testutils" 37 "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" 38 "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" 39 "github.com/cockroachdb/cockroach/pkg/util/caller" 40 "github.com/cockroachdb/cockroach/pkg/util/hlc" 41 "github.com/cockroachdb/cockroach/pkg/util/leaktest" 42 "github.com/cockroachdb/cockroach/pkg/util/log" 43 "github.com/cockroachdb/cockroach/pkg/util/shuffle" 44 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 45 "github.com/cockroachdb/errors" 46 "github.com/lib/pq" 47 "github.com/stretchr/testify/require" 48 ) 49 50 type failureRecord struct { 51 err error 52 txn *roachpb.Transaction 53 } 54 55 type filterVals struct { 56 syncutil.Mutex 57 // key -> number of times an retriable error will be 
injected when that key 58 // is written. 59 restartCounts map[string]int 60 // key -> number of times a TransactionAborted error will be injected when 61 // that key is written. Note that injecting this is pretty funky: it can only 62 // be done on the first write of a txn, otherwise the previously written 63 // intents will linger on. 64 abortCounts map[string]int 65 66 // Keys for which we injected an error. 67 failedValues map[string]failureRecord 68 } 69 70 func createFilterVals(restartCounts map[string]int, abortCounts map[string]int) *filterVals { 71 return &filterVals{ 72 restartCounts: restartCounts, 73 abortCounts: abortCounts, 74 failedValues: map[string]failureRecord{}, 75 } 76 } 77 78 // checkCorrectTxn checks that the current txn is the correct one, according to 79 // the way the previous txn that tried to write value failed. 80 func checkCorrectTxn(value string, magicVals *filterVals, txn *roachpb.Transaction) error { 81 failureRec, found := magicVals.failedValues[value] 82 if !found { 83 return nil 84 } 85 if errors.HasType(failureRec.err, (*roachpb.TransactionAbortedError)(nil)) { 86 // The previous txn should have been aborted, so check that we're running 87 // in a new one. 88 if failureRec.txn.ID == txn.ID { 89 return errors.Errorf(`new transaction for value "%s" is the same as the old one`, value) 90 } 91 } else { 92 // The previous txn should have been restarted, so we should be running in 93 // the same one. 94 if failureRec.txn.ID != txn.ID { 95 return errors.Errorf(`new transaction for value "%s" (%s) is not the same as the old one (%s)`, value, txn, failureRec.txn) 96 } 97 } 98 // Don't check this value in subsequent transactions. 
99 delete(magicVals.failedValues, value) 100 101 return nil 102 } 103 104 type injectionApproach struct { 105 counts map[string]int 106 errFn func() error 107 } 108 109 type injectionApproaches []injectionApproach 110 111 func (ia injectionApproaches) Len() int { return len(ia) } 112 func (ia injectionApproaches) Swap(i, j int) { ia[i], ia[j] = ia[j], ia[i] } 113 114 func injectErrors( 115 req roachpb.Request, hdr roachpb.Header, magicVals *filterVals, verifyTxn bool, 116 ) error { 117 magicVals.Lock() 118 defer magicVals.Unlock() 119 120 switch req := req.(type) { 121 case *roachpb.ConditionalPutRequest: 122 // Create a list of each injection approach and shuffle the order of 123 // injection for some additional randomness. 124 injections := injectionApproaches{ 125 {counts: magicVals.restartCounts, errFn: func() error { 126 // Note we use a retry error that cannot be automatically retried 127 // by the transaction coord sender. 128 return roachpb.NewTransactionRetryError(roachpb.RETRY_REASON_UNKNOWN, "injected err") 129 }}, 130 {counts: magicVals.abortCounts, errFn: func() error { 131 return roachpb.NewTransactionAbortedError(roachpb.ABORT_REASON_ABORTED_RECORD_FOUND) 132 }}, 133 } 134 shuffle.Shuffle(injections) 135 136 for _, injection := range injections { 137 for key, count := range injection.counts { 138 if verifyTxn { 139 if err := checkCorrectTxn(string(req.Value.RawBytes), magicVals, hdr.Txn); err != nil { 140 return err 141 } 142 } 143 if count > 0 && bytes.Contains(req.Value.RawBytes, []byte(key)) { 144 injection.counts[key]-- 145 err := injection.errFn() 146 magicVals.failedValues[string(req.Value.RawBytes)] = failureRecord{err, hdr.Txn} 147 return err 148 } 149 } 150 } 151 return nil 152 default: 153 return nil 154 } 155 } 156 157 // checkRestart checks that there are no errors left to inject. 
158 func checkRestarts(t *testing.T, magicVals *filterVals) { 159 magicVals.Lock() 160 defer magicVals.Unlock() 161 for key, count := range magicVals.restartCounts { 162 if count != 0 { 163 file, line, _ := caller.Lookup(1) 164 t.Errorf("%s:%d: INSERT for \"%s\" still has to be retried %d times", 165 file, line, key, count) 166 } 167 } 168 for key, count := range magicVals.abortCounts { 169 if count != 0 { 170 file, line, _ := caller.Lookup(1) 171 t.Errorf("%s:%d: INSERT for \"%s\" still has to be aborted %d times", 172 file, line, key, count) 173 } 174 } 175 if t.Failed() { 176 t.Fatalf("checking error injection failed") 177 } 178 } 179 180 // TxnAborter can be used to listen for transactions running particular 181 // SQL statements; the trapped transactions will be aborted. 182 // The TxnAborter needs to be hooked up to a Server's 183 // Knobs.StatementFilter, so that the Aborter sees what statements are being 184 // executed. This is done by calling HookupToExecutor(), which returns a 185 // stuitable ExecutorTestingKnobs. 186 // A statement can be registered for abortion (meaning, the statement's 187 // transaction will be TransactionAborted) with QueueStmtForAbortion(). When the 188 // Aborter sees that statement, it will run a higher priority transaction that 189 // tramples the data, so the original transaction will get a TransactionAborted 190 // error when it tries to commit. 191 // 192 // Note that transaction cannot be aborted using an injected error, since we 193 // want the pusher to clean up the intents of the pushee. 194 // 195 // The aborter only works with INSERT statements operating on the table t.test 196 // defined as: 197 // `CREATE DATABASE t; CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT)` 198 // The TxnAborter runs transactions deleting the row for the `k` that the 199 // trapped transactions were writing to. 
200 // 201 // Be sure to set DisableAutoCommit on the ExecutorTestingKnobs, otherwise 202 // implicit transactions won't have a chance to be aborted. 203 // 204 // Example usage: 205 // 206 // func TestTxnAutoRetry(t *testing.T) { 207 // defer leaktest.AfterTest(t)() 208 // aborter := NewTxnAborter() 209 // defer aborter.Close(t) 210 // params, cmdFilters := tests.CreateTestServerParams() 211 // params.Knobs.SQLExecutor = aborter.executorKnobs() 212 // s, sqlDB, _ := serverutils.StartServer(t, params) 213 // defer s.Stopper().Stop(context.Background()) 214 // { 215 // pgURL, cleanup := sqlutils.PGUrl(t, s.ServingRPCAddr(), "TestTxnAutoRetry", url.User(security.RootUser) 216 // defer cleanup() 217 // if err := aborter.Init(pgURL); err != nil { 218 // t.Fatal(err) 219 // } 220 // } 221 // 222 // sqlDB.Exec(`CREATE DATABASE t; CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT)`) 223 // const sentinelInsert = "INSERT INTO t.test(k, v) VALUES (0, 'sentinel')" 224 // if err := aborter.QueueStmtForAbortion( 225 // sentinelInsert, 1 /* abortCount */, true /* willBeRetriedIbid */, 226 // ); err != nil { 227 // t.Fatal(err) 228 // } 229 // sqlDB.Exec(sentinelInsert) 230 // ... 231 type TxnAborter struct { 232 mu struct { 233 syncutil.Mutex 234 stmtsToAbort map[string]*restartInfo 235 } 236 // A second connection pool, to be used by aborts. 237 // This is needed because the main conn pool is going to be restricted to one 238 // connection. 239 // TODO(andrei): remove this if we ever move to using libpq conns directly. 240 // See TODOs around on SetMaxOpenConns. 241 abortDB *gosql.DB 242 } 243 244 type restartInfo struct { 245 // The numberic value being inserted in col 'k'. 246 key int 247 // The remaining number of times to abort the txn. 248 abortCount int 249 satisfied bool 250 checkSatisfied bool 251 // The number of times the statement as been executed. 
252 execCount int 253 } 254 255 func NewTxnAborter() *TxnAborter { 256 ta := new(TxnAborter) 257 ta.mu.stmtsToAbort = make(map[string]*restartInfo) 258 return ta 259 } 260 261 func (ta *TxnAborter) Init(pgURL url.URL) error { 262 abortDB, err := gosql.Open("postgres", pgURL.String()) 263 if err != nil { 264 return err 265 } 266 ta.abortDB = abortDB 267 return nil 268 } 269 270 var valuesRE = regexp.MustCompile(`VALUES.*\((\d),`) 271 272 // QueueStmtForAbortion registers a statement whose transaction will be aborted. 273 // 274 // stmt needs to be the statement, literally as the AST gets converted back to a 275 // string. Note that, since we sometimes change the AST during planning, the 276 // statements sent for execution that need to be intercepted by this filter 277 // need to be written in a canonical form, and stmt passed here needs to also be 278 // that canonical form. In particular, table names need to be fully qualified 279 // with the schema (e.g. t.public.test). 280 // 281 // abortCount specifies how many times a txn running this statement will be 282 // aborted. 283 // willBeRetriedIbid should be set if the statement will be retried by the test 284 // (as an identical statement). This allows the TxnAborter to assert, on 285 // Close(), that the statement has been retried the intended number of times by 286 // the end of the test (besides asserting that an error was injected the right 287 // number of times. So, the Aborter can be used to check that the retry 288 // machinery has done its job. The Aborter will consider the statement to have 289 // been retried correctly if the statement has been executed at least once after 290 // the Aborter is done injecting errors because of it. So normally we'd expect 291 // this statement to executed RestartCount + 1 times, but we allow it to be 292 // retried more times because the statement's txn might also retried because of 293 // other statements. 
294 // 295 // Calling QueueStmtForAbortion repeatedly with the same stmt is allowed, and 296 // each call checks that the previous one was satisfied. 297 func (ta *TxnAborter) QueueStmtForAbortion( 298 stmt string, abortCount int, willBeRetriedIbid bool, 299 ) error { 300 ta.mu.Lock() 301 defer ta.mu.Unlock() 302 if ri, ok := ta.mu.stmtsToAbort[stmt]; ok { 303 // If we're overwriting a statement that was already queued, verify it 304 // first. 305 if err := ri.Verify(); err != nil { 306 return errors.Wrapf(err, `statement "%s" error`, stmt) 307 } 308 } 309 // Extract the "key" - the value of the first col, which will be trampled on. 310 switch matches := valuesRE.FindStringSubmatch(stmt); len(matches) { 311 case 0, 1: 312 return errors.Errorf(`bad statement "%s": key col not found`, stmt) 313 default: 314 key, err := strconv.Atoi(matches[1]) 315 if err != nil { 316 return errors.Wrapf(err, `bad statement "%s"`, stmt) 317 } 318 ta.mu.stmtsToAbort[stmt] = &restartInfo{ 319 key: key, 320 abortCount: abortCount, 321 satisfied: false, 322 checkSatisfied: willBeRetriedIbid, 323 } 324 return nil 325 } 326 } 327 328 // GetExecCount returns the number of times a statement has been seen. 329 // You probably don't want to call this while the TxnAborter might be in 330 // the process of aborting the txn containing stmt, as the result will not be 331 // deterministic. 
332 func (ta *TxnAborter) GetExecCount(stmt string) (int, bool) { 333 ta.mu.Lock() 334 defer ta.mu.Unlock() 335 if ri, ok := ta.mu.stmtsToAbort[stmt]; ok { 336 return ri.execCount, true 337 } 338 return 0, false 339 } 340 341 func (ta *TxnAborter) statementFilter(ctx context.Context, stmt string, err error) { 342 ta.mu.Lock() 343 log.Infof(ctx, "statement filter running on: %s, with err=%v", stmt, err) 344 ri, ok := ta.mu.stmtsToAbort[stmt] 345 shouldAbort := false 346 if ok { 347 ri.execCount++ 348 if ri.abortCount == 0 { 349 log.VEventf(ctx, 1, "TxnAborter sees satisfied statement %q", stmt) 350 ri.satisfied = true 351 } 352 if ri.abortCount > 0 && err == nil { 353 log.Infof(ctx, "TxnAborter aborting txn for statement %q", stmt) 354 ri.abortCount-- 355 shouldAbort = true 356 } 357 } 358 ta.mu.Unlock() 359 if shouldAbort { 360 if err := ta.abortTxn(ri.key); err != nil { 361 panic(fmt.Sprintf("TxnAborter failed to abort: %s", err)) 362 } 363 } 364 } 365 366 // executorKnobs are the bridge between the TxnAborter and the sql.Executor. 367 func (ta *TxnAborter) executorKnobs() base.ModuleTestingKnobs { 368 return &sql.ExecutorTestingKnobs{ 369 // We're going to abort txns using a TxnAborter, and that's incompatible 370 // with AutoCommit. 371 DisableAutoCommit: true, 372 StatementFilter: ta.statementFilter, 373 } 374 } 375 376 // abortTxn writes to a key and as a side effect aborts a txn that had an intent 377 // on that key. 
378 func (ta *TxnAborter) abortTxn(key int) error { 379 tx, err := ta.abortDB.Begin() 380 if err != nil { 381 return err 382 } 383 if _, err := tx.Exec("SET TRANSACTION PRIORITY HIGH"); err != nil { 384 return err 385 } 386 if _, err := tx.Exec("DELETE FROM t.test WHERE k = $1", key); err != nil { 387 return err 388 } 389 if err = tx.Commit(); err != nil { 390 return err 391 } 392 return nil 393 } 394 395 type TxnAborterVerifierError struct { 396 errs []error 397 } 398 399 func (e *TxnAborterVerifierError) Error() string { 400 strs := make([]string, 0) 401 for _, err := range e.errs { 402 strs = append(strs, err.Error()) 403 } 404 return strings.Join(strs, "\n") 405 } 406 407 func (ta *TxnAborter) VerifyAndClear() error { 408 ta.mu.Lock() 409 defer ta.mu.Unlock() 410 allErr := TxnAborterVerifierError{} 411 for stmt, ri := range ta.mu.stmtsToAbort { 412 if err := ri.Verify(); err != nil { 413 allErr.errs = append(allErr.errs, errors.Wrapf(err, `statement "%s" error`, stmt)) 414 } 415 } 416 ta.mu.stmtsToAbort = make(map[string]*restartInfo) 417 if len(allErr.errs) != 0 { 418 return &allErr 419 } 420 return nil 421 } 422 423 func (ta *TxnAborter) Close(t testing.TB) { 424 ta.abortDB.Close() 425 if err := ta.VerifyAndClear(); err != nil { 426 file, line, _ := caller.Lookup(1) 427 t.Errorf("%s:%d %s", file, line, err) 428 } 429 } 430 431 func (ri *restartInfo) Verify() error { 432 if ri.abortCount != 0 { 433 return errors.Errorf("%d additional aborts expected", ri.abortCount) 434 } 435 if ri.checkSatisfied && !ri.satisfied { 436 return errors.New("previous abort did not result in a retry") 437 } 438 return nil 439 } 440 441 // Test the logic in the sql executor for automatically retrying txns in case of 442 // retriable errors. 
func TestTxnAutoRetry(t *testing.T) {
	defer leaktest.AfterTest(t)()

	// Start a server whose statement filter is the TxnAborter's, and give the
	// aborter its own connection pool to that server.
	aborter := NewTxnAborter()
	defer aborter.Close(t)
	params, cmdFilters := tests.CreateTestServerParams()
	params.Knobs.SQLExecutor = aborter.executorKnobs()
	s, sqlDB, _ := serverutils.StartServer(t, params)
	defer s.Stopper().Stop(context.Background())
	{
		pgURL, cleanup := sqlutils.PGUrl(t, s.ServingSQLAddr(), "TestTxnAutoRetry", url.User(security.RootUser))
		defer cleanup()
		if err := aborter.Init(pgURL); err != nil {
			t.Fatal(err)
		}
	}

	// Make sure all the commands we send in this test are sent over the same connection.
	// This is a bit of a hack; in Go you're not supposed to have connection state
	// outside of using a db.Tx. But we can't use a db.Tx here, because we want
	// to control the batching of BEGIN/COMMIT statements.
	// This SetMaxOpenConns is pretty shady, it doesn't guarantee that you'll be using
	// the *same* one connection across calls. A proper solution would be to use a
	// lib/pq connection directly. As of Feb 2016, there's code in cli/sql_util.go to
	// do that.
	sqlDB.SetMaxOpenConns(1)

	if _, err := sqlDB.Exec(`
CREATE DATABASE t;
CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT, t DECIMAL);
`); err != nil {
		t.Fatal(err)
	}

	// Set up error injection that causes retries.
	magicVals := createFilterVals(nil, nil)
	magicVals.restartCounts = map[string]int{
		"boulanger": 2,
		"dromedary": 2,
		"fajita":    2,
		"hooly":     2,
		"josephine": 2,
		"laureal":   2,
	}
	magicVals.abortCounts = map[string]int{
		"boulanger": 2,
	}
	// The filter turns whatever injectErrors decides into an error attached to
	// the request's txn.
	cleanupFilter := cmdFilters.AppendFilter(
		func(args kvserverbase.FilterArgs) *roachpb.Error {
			if err := injectErrors(args.Req, args.Hdr, magicVals, true /* verifyTxn */); err != nil {
				return roachpb.NewErrorWithTxn(err, args.Hdr.Txn)
			}
			return nil
		}, false)

	// On top of the injected errors, have the aborter abort the txns of the
	// first four INSERTs twice each.
	if err := aborter.QueueStmtForAbortion(
		"INSERT INTO t.public.test(k, v, t) VALUES (1, 'boulanger', cluster_logical_timestamp()) RETURNING 1", 2 /* abortCount */, true, /* willBeRetriedIbid */
	); err != nil {
		t.Fatal(err)
	}
	if err := aborter.QueueStmtForAbortion(
		"INSERT INTO t.public.test(k, v, t) VALUES (2, 'dromedary', cluster_logical_timestamp()) RETURNING 1", 2 /* abortCount */, true, /* willBeRetriedIbid */
	); err != nil {
		t.Fatal(err)
	}
	if err := aborter.QueueStmtForAbortion(
		"INSERT INTO t.public.test(k, v, t) VALUES (3, 'fajita', cluster_logical_timestamp()) RETURNING 1", 2 /* abortCount */, true, /* willBeRetriedIbid */
	); err != nil {
		t.Fatal(err)
	}
	if err := aborter.QueueStmtForAbortion(
		"INSERT INTO t.public.test(k, v, t) VALUES (4, 'hooly', cluster_logical_timestamp()) RETURNING 1", 2 /* abortCount */, true, /* willBeRetriedIbid */
	); err != nil {
		t.Fatal(err)
	}

	// Test that implicit txns - txns for which we see all the statements and prefixes
	// of txns (statements batched together with the BEGIN stmt) - are retried.
	// We also exercise the SQL cluster logical timestamp in here, because
	// this must be properly propagated across retries.
	//
	// The SELECT within the transaction also checks that discarded
	// intermediate result sets are properly released: the result set it
	// produces is accounted for by the session monitor, and if it is
	// not properly released upon a retry the monitor will cause the
	// server to panic (and thus the test to fail) when the connection
	// is closed.
	//
	// TODO(knz): This test can be made more robust by exposing the
	// current allocation count in monitor and checking that it has the
	// same value at the beginning of each retry.
	rows, err := sqlDB.Query(`
INSERT INTO t.public.test(k, v, t) VALUES (1, 'boulanger', cluster_logical_timestamp()) RETURNING 1;
BEGIN;
INSERT INTO t.public.test(k, v, t) VALUES (2, 'dromedary', cluster_logical_timestamp()) RETURNING 1;
INSERT INTO t.public.test(k, v, t) VALUES (3, 'fajita', cluster_logical_timestamp()) RETURNING 1;
END;
INSERT INTO t.public.test(k, v, t) VALUES (4, 'hooly', cluster_logical_timestamp()) RETURNING 1;
BEGIN;
INSERT INTO t.public.test(k, v, t) VALUES (5, 'josephine', cluster_logical_timestamp()) RETURNING 1;
INSERT INTO t.public.test(k, v, t) VALUES (6, 'laureal', cluster_logical_timestamp()) RETURNING 1;
`)
	if err != nil {
		t.Fatal(err)
	}
	defer rows.Close()

	// Note that resSets is incremented once per row, across all result sets. It
	// only equals the number of result sets because each of the six
	// INSERT ... RETURNING 1 statements is expected to return exactly one row.
	resSets := 0
	for {
		for rows.Next() {
			resSets++
		}
		if !rows.NextResultSet() {
			break
		}
	}
	if err := rows.Err(); err != nil {
		t.Fatal(err)
	}
	if resSets != 6 {
		t.Fatalf("Expected 6 result sets, got %d", resSets)
	}

	cleanupFilter()

	// All of the injection budget must have been consumed by now.
	checkRestarts(t, magicVals)

	// The batch above left its second BEGIN open; finish that txn.
	if _, err := sqlDB.Exec("END"); err != nil {
		t.Fatal(err)
	}

	// Check that the txns succeeded by reading the rows.
	var count int
	if err := sqlDB.QueryRow("SELECT count(*) FROM t.public.test").Scan(&count); err != nil {
		t.Fatal(err)
	}
	if count != 6 {
		t.Fatalf("Expected 6 rows, got %d", count)
	}

	// Now test that we don't retry what we shouldn't: insert an error into a txn
	// we can't automatically retry (because it spans requests).

	magicVals = createFilterVals(nil, nil)
	magicVals.restartCounts = map[string]int{
		"hooly": 2,
	}
	cleanupFilter = cmdFilters.AppendFilter(
		func(args kvserverbase.FilterArgs) *roachpb.Error {
			if err := injectErrors(args.Req, args.Hdr, magicVals, true /* verifyTxn */); err != nil {
				return roachpb.NewErrorWithTxn(err, args.Hdr.Txn)
			}
			return nil
		}, false)
	defer cleanupFilter()

	// Start a txn.
	if _, err := sqlDB.Exec(`
DELETE FROM t.public.test WHERE true;
BEGIN;
`); err != nil {
		t.Fatal(err)
	}

	// Run a batch of statements to move the txn out of the AutoRetry state,
	// otherwise the INSERT below would be automatically retried.
	if _, err := sqlDB.Exec("SELECT 1"); err != nil {
		t.Fatal(err)
	}

	// Continue the txn in a new request, which is not retriable.
	// The injected error is therefore expected to reach the client.
	_, err = sqlDB.Exec("INSERT INTO t.public.test(k, v, t) VALUES (4, 'hooly', cluster_logical_timestamp())")
	require.Regexp(t, "RETRY_REASON_UNKNOWN - injected err", err)
}

// Test that aborted txns are only retried once.
// Prevents regressions of #8456.
620 func TestAbortedTxnOnlyRetriedOnce(t *testing.T) { 621 defer leaktest.AfterTest(t)() 622 623 aborter := NewTxnAborter() 624 defer aborter.Close(t) 625 params, _ := tests.CreateTestServerParams() 626 params.Knobs.SQLExecutor = aborter.executorKnobs() 627 s, sqlDB, _ := serverutils.StartServer(t, params) 628 defer s.Stopper().Stop(context.Background()) 629 { 630 pgURL, cleanup := sqlutils.PGUrl(t, s.ServingSQLAddr(), "TestAbortedTxnOnlyRetriedOnce", url.User(security.RootUser)) 631 defer cleanup() 632 if err := aborter.Init(pgURL); err != nil { 633 t.Fatal(err) 634 } 635 } 636 637 const insertStmt = "INSERT INTO t.public.test(k, v) VALUES (1, 'boulanger')" 638 if err := aborter.QueueStmtForAbortion( 639 insertStmt, 1 /* abortCount */, true, /* willBeRetriedIbid */ 640 ); err != nil { 641 t.Fatal(err) 642 } 643 644 if _, err := sqlDB.Exec(` 645 CREATE DATABASE t; 646 CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT); 647 `); err != nil { 648 t.Fatal(err) 649 } 650 651 if _, err := sqlDB.Exec(insertStmt); err != nil { 652 t.Fatalf("unexpected error: %s", err) 653 } 654 655 execCount, ok := aborter.GetExecCount(insertStmt) 656 if !ok { 657 t.Fatalf("aborter has no state on %q", insertStmt) 658 } 659 if execCount != 2 { 660 t.Fatalf("expected %q to be executed 2 times, but got %d", insertStmt, execCount) 661 } 662 } 663 664 // rollbackStrategy is the type of statement which a client can use to 665 // rollback aborted txns from retryable errors. We accept two statements 666 // for rolling back to the cockroach_restart savepoint. See 667 // *Executor.execStmtInAbortedTxn for more about transaction retries. 
668 type rollbackStrategy int 669 670 const ( 671 rollbackToSavepoint rollbackStrategy = iota 672 declareSavepoint 673 ) 674 675 func (rs rollbackStrategy) SQLCommand() string { 676 switch rs { 677 case rollbackToSavepoint: 678 return "ROLLBACK TO SAVEPOINT cockroach_restart" 679 case declareSavepoint: 680 return "SAVEPOINT cockroach_restart" 681 } 682 panic("unreachable") 683 } 684 685 // exec takes a closure and executes it repeatedly as long as it says it needs 686 // to be retried. The function also takes a rollback strategy, which specifies 687 // the statement which the client will use to rollback aborted txns from retryable 688 // errors. 689 func retryExec(t *testing.T, sqlDB *gosql.DB, rs rollbackStrategy, fn func(*gosql.Tx) bool) { 690 tx, err := sqlDB.Begin() 691 if err != nil { 692 t.Fatal(err) 693 } 694 if _, err := tx.Exec( 695 "SAVEPOINT cockroach_restart; SET TRANSACTION PRIORITY LOW"); err != nil { 696 t.Fatal(err) 697 } 698 699 for fn(tx) { 700 if _, err := tx.Exec(rs.SQLCommand()); err != nil { 701 t.Fatal(err) 702 } 703 } 704 if err := tx.Commit(); err != nil { 705 t.Fatal(err) 706 } 707 } 708 709 // isRetryableErr returns whether the given error is a PG retryable error. 710 func isRetryableErr(err error) bool { 711 var pqErr *pq.Error 712 return errors.As(err, &pqErr) && pqErr.Code == "40001" 713 } 714 715 // Returns true on retriable errors. 716 func runTestTxn( 717 t *testing.T, 718 magicVals *filterVals, 719 expectedErr string, 720 sqlDB *gosql.DB, 721 tx *gosql.Tx, 722 sentinelInsert string, 723 ) bool { 724 // Run a bogus statement to disable the automatic server retries of subsequent 725 // statements. 
726 if _, err := tx.Exec("SELECT 1"); err != nil { 727 t.Fatal(err) 728 } 729 730 retriesNeeded := 731 (magicVals.restartCounts["boulanger"] + magicVals.abortCounts["boulanger"]) > 0 732 if retriesNeeded { 733 _, err := tx.Exec("INSERT INTO t.public.test(k, v) VALUES (1, 'boulanger')") 734 if !testutils.IsError(err, expectedErr) { 735 t.Fatalf("unexpected error: %v", err) 736 } 737 return isRetryableErr(err) 738 } 739 // Now the INSERT should succeed. 740 if _, err := tx.Exec( 741 "DELETE FROM t.public.test WHERE true;" + sentinelInsert, 742 ); err != nil { 743 t.Fatal(err) 744 } 745 746 _, err := tx.Exec("RELEASE SAVEPOINT cockroach_restart") 747 return isRetryableErr(err) 748 } 749 750 // TestUserTxnRestart tests user-directed txn restarts. 751 // The test will inject and otherwise create retriable errors of various kinds 752 // and checks that we still manage to run a txn despite them. 753 func TestTxnUserRestart(t *testing.T) { 754 defer leaktest.AfterTest(t)() 755 756 // Set up error injection that causes retries. 
757 testCases := []struct { 758 magicVals *filterVals 759 expectedErr string 760 }{ 761 { 762 magicVals: createFilterVals( 763 map[string]int{"boulanger": 2}, // restartCounts 764 nil), 765 expectedErr: "RETRY_REASON_UNKNOWN", 766 }, 767 { 768 magicVals: createFilterVals( 769 nil, 770 map[string]int{"boulanger": 2}), // abortCounts 771 expectedErr: regexp.QuoteMeta("TransactionAbortedError(ABORT_REASON_ABORTED_RECORD_FOUND)"), 772 }, 773 } 774 775 for _, tc := range testCases { 776 for _, rs := range []rollbackStrategy{rollbackToSavepoint, declareSavepoint} { 777 t.Run(fmt.Sprintf("err=%s,stgy=%d", tc.expectedErr, rs), func(t *testing.T) { 778 aborter := NewTxnAborter() 779 defer aborter.Close(t) 780 params, cmdFilters := tests.CreateTestServerParams() 781 params.Knobs.SQLExecutor = aborter.executorKnobs() 782 s, sqlDB, _ := serverutils.StartServer(t, params) 783 defer s.Stopper().Stop(context.Background()) 784 { 785 pgURL, cleanup := sqlutils.PGUrl(t, s.ServingSQLAddr(), "TestTxnUserRestart", url.User(security.RootUser)) 786 defer cleanup() 787 if err := aborter.Init(pgURL); err != nil { 788 t.Fatal(err) 789 } 790 } 791 792 if _, err := sqlDB.Exec(` 793 CREATE DATABASE t; 794 CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT); 795 `); err != nil { 796 t.Fatal(err) 797 } 798 cleanupFilter := cmdFilters.AppendFilter( 799 func(args kvserverbase.FilterArgs) *roachpb.Error { 800 if err := injectErrors(args.Req, args.Hdr, tc.magicVals, true /* verifyTxn */); err != nil { 801 return roachpb.NewErrorWithTxn(err, args.Hdr.Txn) 802 } 803 return nil 804 }, false) 805 806 // Also inject an error at RELEASE time, besides the error injected by magicVals. 
807 sentinelInsert := "INSERT INTO t.public.test(k, v) VALUES (0, 'sentinel')" 808 if err := aborter.QueueStmtForAbortion( 809 sentinelInsert, 1 /* abortCount */, true, /* willBeRetriedIbid */ 810 ); err != nil { 811 t.Fatal(err) 812 } 813 814 commitCount := s.MustGetSQLCounter(sql.MetaTxnCommitStarted.Name) 815 // This is the magic. Run the txn closure until all the retries are exhausted. 816 retryExec(t, sqlDB, rs, func(tx *gosql.Tx) bool { 817 return runTestTxn(t, tc.magicVals, tc.expectedErr, sqlDB, tx, sentinelInsert) 818 }) 819 checkRestarts(t, tc.magicVals) 820 821 // Check that we only wrote the sentinel row. 822 rows, err := sqlDB.Query("SELECT * FROM t.test") 823 if err != nil { 824 t.Fatal(err) 825 } 826 defer rows.Close() 827 for rows.Next() { 828 var k int 829 var v string 830 err = rows.Scan(&k, &v) 831 if err != nil { 832 t.Fatal(err) 833 } 834 if k != 0 || v != "sentinel" { 835 t.Fatalf("didn't find expected row: %d %s", k, v) 836 } 837 } 838 // Check that the commit counter was incremented. It could have been 839 // incremented by more than 1 because of the transactions we use to force 840 // aborts, plus who knows what else the server is doing in the background. 841 if err := checkCounterGE(s, sql.MetaTxnCommitStarted, commitCount+1); err != nil { 842 t.Error(err) 843 } 844 // Clean up the table for the next test iteration. 845 _, err = sqlDB.Exec("DELETE FROM t.test WHERE true") 846 if err != nil { 847 t.Fatal(err) 848 } 849 cleanupFilter() 850 }) 851 } 852 } 853 } 854 855 // Test that rando commands while in COMMIT_WAIT return a particular error. 
856 func TestCommitWaitState(t *testing.T) { 857 defer leaktest.AfterTest(t)() 858 859 params, _ := tests.CreateTestServerParams() 860 s, sqlDB, _ := serverutils.StartServer(t, params) 861 defer s.Stopper().Stop(context.Background()) 862 if _, err := sqlDB.Exec(` 863 CREATE DATABASE t; CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT); 864 `); err != nil { 865 t.Fatal(err) 866 } 867 868 tx, err := sqlDB.Begin() 869 if err != nil { 870 t.Fatal(err) 871 } 872 if _, err := tx.Exec( 873 "SAVEPOINT cockroach_restart; RELEASE cockroach_restart"); err != nil { 874 t.Fatal(err) 875 } 876 if _, err := tx.Exec("INSERT INTO t.test(k, v) VALUES (0, 'sentinel')"); !testutils.IsError(err, "current transaction is committed") { 877 t.Fatalf("unexpected error: %v", err) 878 } 879 // Rollback should respond with a COMMIT command tag. 880 if err := tx.Rollback(); !testutils.IsError(err, "unexpected command tag COMMIT") { 881 t.Fatalf("unexpected error: %v", err) 882 } 883 } 884 885 // Test that a COMMIT getting an error, retriable or not, leaves the txn 886 // finalized and not in Aborted/RestartWait (i.e. COMMIT, like ROLLBACK, is 887 // always final). As opposed to an error on a COMMIT in an auto-retry 888 // txn, where we retry the txn (not tested here). 
889 func TestErrorOnCommitFinalizesTxn(t *testing.T) { 890 defer leaktest.AfterTest(t)() 891 892 aborter := NewTxnAborter() 893 defer aborter.Close(t) 894 params, _ := tests.CreateTestServerParams() 895 params.Knobs.SQLExecutor = aborter.executorKnobs() 896 s, sqlDB, _ := serverutils.StartServer(t, params) 897 defer s.Stopper().Stop(context.Background()) 898 { 899 pgURL, cleanup := sqlutils.PGUrl(t, s.ServingSQLAddr(), "TestErrorOnCommitFinalizesTxn", url.User(security.RootUser)) 900 defer cleanup() 901 if err := aborter.Init(pgURL); err != nil { 902 t.Fatal(err) 903 } 904 } 905 906 if _, err := sqlDB.Exec(` 907 CREATE DATABASE t; CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT); 908 `); err != nil { 909 t.Fatal(err) 910 } 911 // We need to do everything on one connection as we'll want to observe the 912 // connection state after a COMMIT. 913 sqlDB.SetMaxOpenConns(1) 914 915 // We're going to test both errors that would leave the transaction in the 916 // RestartWait state and errors that would leave the transaction in Aborted, 917 // if they were to happen on any other statement than COMMIT. 918 // We do that by always injecting a retryable error at COMMIT, but once in a 919 // txn that had a "retry intent" (SAVEPOINT cockroach_restart), and once in a 920 // txn without it. 
921 testCases := []struct { 922 retryIntent bool 923 }{ 924 {false}, 925 {true}, 926 } 927 for _, tc := range testCases { 928 t.Run(fmt.Sprintf("retryIntent=%t", tc.retryIntent), func(t *testing.T) { 929 const insertStmt = "INSERT INTO t.public.test(k, v) VALUES (0, 'boulanger')" 930 if err := aborter.QueueStmtForAbortion( 931 insertStmt, 1 /* abortCount */, false, /* willBeRetriedIbid */ 932 ); err != nil { 933 t.Fatal(err) 934 } 935 if _, err := sqlDB.Exec("BEGIN"); err != nil { 936 t.Fatal(err) 937 } 938 if tc.retryIntent { 939 if _, err := sqlDB.Exec("SAVEPOINT cockroach_restart"); err != nil { 940 t.Fatal(err) 941 } 942 } 943 if _, err := sqlDB.Exec(insertStmt); err != nil { 944 t.Fatal(err) 945 } 946 if _, err := sqlDB.Exec("COMMIT"); !testutils.IsError(err, "pq: restart transaction") { 947 t.Fatalf("unexpected error: %v", err) 948 } 949 950 // Check that we can start another txn on the (one and only) connection. 951 if _, err := sqlDB.Exec("BEGIN"); err != nil { 952 t.Fatal(err) 953 } 954 // Check that we don't see any rows, so the previous txn was rolled back. 955 rows, err := sqlDB.Query("SELECT * FROM t.test") 956 if err != nil { 957 t.Fatal(err) 958 } 959 defer rows.Close() 960 if rows.Next() { 961 var k int 962 var v string 963 err := rows.Scan(&k, &v) 964 t.Fatalf("found unexpected row: %d %s, %v", k, v, err) 965 } 966 if _, err := sqlDB.Exec("END"); err != nil { 967 t.Fatal(err) 968 } 969 }) 970 } 971 } 972 973 // TestRollbackInRestartWait ensures that a ROLLBACK while the txn is in the 974 // RetryWait state works. 
975 func TestRollbackInRestartWait(t *testing.T) { 976 defer leaktest.AfterTest(t)() 977 978 aborter := NewTxnAborter() 979 defer aborter.Close(t) 980 params, _ := tests.CreateTestServerParams() 981 params.Knobs.SQLExecutor = aborter.executorKnobs() 982 s, sqlDB, _ := serverutils.StartServer(t, params) 983 defer s.Stopper().Stop(context.Background()) 984 { 985 pgURL, cleanup := sqlutils.PGUrl(t, s.ServingSQLAddr(), "TestRollbackInRestartWait", url.User(security.RootUser)) 986 defer cleanup() 987 if err := aborter.Init(pgURL); err != nil { 988 t.Fatal(err) 989 } 990 } 991 992 if _, err := sqlDB.Exec(` 993 CREATE DATABASE t; 994 CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT); 995 `); err != nil { 996 t.Fatal(err) 997 } 998 999 // Set up error injection that causes retries. 1000 const insertStmt = "INSERT INTO t.public.test(k, v) VALUES (0, 'boulanger')" 1001 if err := aborter.QueueStmtForAbortion( 1002 insertStmt, 1 /* abortCount */, false, /* willBeRetriedIbid */ 1003 ); err != nil { 1004 t.Fatal(err) 1005 } 1006 1007 tx, err := sqlDB.Begin() 1008 if err != nil { 1009 t.Fatal(err) 1010 } 1011 if _, err := tx.Exec("SAVEPOINT cockroach_restart"); err != nil { 1012 t.Fatal(err) 1013 } 1014 // Run a batch of statements to move the txn out of the AutoRetry state, 1015 // otherwise the INSERT below would be automatically retried. 1016 if _, err := tx.Exec("SELECT 1"); err != nil { 1017 t.Fatal(err) 1018 } 1019 1020 if _, err := tx.Exec(insertStmt); err != nil { 1021 t.Fatal(err) 1022 } 1023 if _, err := tx.Exec("RELEASE SAVEPOINT cockroach_restart"); !testutils.IsError( 1024 err, "pq: restart transaction") { 1025 t.Fatalf("unexpected error: %s", err) 1026 } 1027 if err := tx.Rollback(); err != nil { 1028 t.Fatal(err) 1029 } 1030 } 1031 1032 // TestUnexpectedStatementInRestartWait ensures that a statement other than 1033 // ROLLBACK [TO SAVEPOINT] while the txn is in the RetryWait state terminates 1034 // the transaction. 
More importantly than the state in which the transaction 1035 // transitions when this happens is that this test prevents a regression of 1036 // #15412, whereby the server would crash in this situation. 1037 func TestUnexpectedStatementInRestartWait(t *testing.T) { 1038 defer leaktest.AfterTest(t)() 1039 1040 params, _ := tests.CreateTestServerParams() 1041 s, sqlDB, _ := serverutils.StartServer(t, params) 1042 defer s.Stopper().Stop(context.Background()) 1043 1044 tx, err := sqlDB.Begin() 1045 if err != nil { 1046 t.Fatal(err) 1047 } 1048 1049 if _, err := tx.Exec("SAVEPOINT cockroach_restart"); err != nil { 1050 t.Fatal(err) 1051 } 1052 // Run a batch of statements to move the txn out of the AutoRetry state, 1053 // otherwise the SELECT below would be automatically retried. 1054 if _, err := tx.Exec("SELECT 1"); err != nil { 1055 t.Fatal(err) 1056 } 1057 1058 if _, err := tx.Exec( 1059 "SELECT crdb_internal.force_retry('1s':::INTERVAL)"); !testutils.IsError( 1060 err, `forced by crdb_internal\.force_retry\(\)`) { 1061 t.Fatal(err) 1062 } 1063 var state string 1064 if err := tx.QueryRow("SHOW TRANSACTION STATUS").Scan(&state); err != nil { 1065 t.Fatal(err) 1066 } 1067 if state != "Aborted" { 1068 t.Fatalf("expected state %s, got: %s", "Aborted", state) 1069 } 1070 1071 if _, err := tx.Exec("SELECT 1"); !testutils.IsError(err, 1072 `pq: current transaction is aborted, commands ignored until end of transaction block`) { 1073 t.Fatal(err) 1074 } 1075 if err := tx.QueryRow("SHOW TRANSACTION STATUS").Scan(&state); err != nil { 1076 t.Fatal(err) 1077 } 1078 if state != "Aborted" { 1079 t.Fatalf("expected state %s, got: %s", "Aborted", state) 1080 } 1081 if err := tx.Rollback(); err != nil { 1082 t.Fatal(err) 1083 } 1084 } 1085 1086 // TestNonRetryableError verifies that a non-retryable error is propagated to the client. 
1087 func TestNonRetryableError(t *testing.T) { 1088 defer leaktest.AfterTest(t)() 1089 1090 params, cmdFilters := tests.CreateTestServerParams() 1091 s, sqlDB, _ := serverutils.StartServer(t, params) 1092 defer s.Stopper().Stop(context.Background()) 1093 1094 testKey := []byte("test_key") 1095 hitError := false 1096 cleanupFilter := cmdFilters.AppendFilter( 1097 func(args kvserverbase.FilterArgs) *roachpb.Error { 1098 if req, ok := args.Req.(*roachpb.ScanRequest); ok { 1099 if bytes.Contains(req.Key, testKey) && !kv.TestingIsRangeLookupRequest(req) { 1100 hitError = true 1101 return roachpb.NewErrorWithTxn(fmt.Errorf("testError"), args.Hdr.Txn) 1102 } 1103 } 1104 return nil 1105 }, false) 1106 defer cleanupFilter() 1107 1108 // We need to do everything on one connection as we'll want to observe the 1109 // connection state after a COMMIT. 1110 sqlDB.SetMaxOpenConns(1) 1111 if _, err := sqlDB.Exec(` 1112 CREATE DATABASE t; 1113 CREATE TABLE t.test (k TEXT PRIMARY KEY, v TEXT); 1114 INSERT INTO t.test (k, v) VALUES ('test_key', 'test_val'); 1115 SELECT * from t.test WHERE k = 'test_key'; 1116 `); !testutils.IsError(err, "pq: testError") { 1117 t.Errorf("unexpected error %v", err) 1118 } 1119 if !hitError { 1120 t.Errorf("expected to hit error, but it didn't happen") 1121 } 1122 } 1123 1124 // Verifies that an expired lease is released and a new lease is acquired on 1125 // transaction restart. 1126 // 1127 // This test triggers the above scenario by making 1128 // ReadWithinUncertaintyIntervalError advance the clock, so that the transaction 1129 // timestamp exceeds the deadline of the EndTxnRequest. 
1130 func TestReacquireLeaseOnRestart(t *testing.T) { 1131 defer leaktest.AfterTest(t)() 1132 1133 advancement := 2 * base.DefaultTableDescriptorLeaseDuration 1134 1135 var cmdFilters tests.CommandFilters 1136 cmdFilters.AppendFilter(tests.CheckEndTxnTrigger, true) 1137 1138 var clockUpdate int32 1139 testKey := []byte("test_key") 1140 storeTestingKnobs := &kvserver.StoreTestingKnobs{ 1141 EvalKnobs: kvserverbase.BatchEvalTestingKnobs{ 1142 TestingEvalFilter: cmdFilters.RunFilters, 1143 }, 1144 DisableMaxOffsetCheck: true, 1145 ClockBeforeSend: func(c *hlc.Clock, ba roachpb.BatchRequest) { 1146 if atomic.LoadInt32(&clockUpdate) > 0 { 1147 return 1148 } 1149 1150 // Hack to advance the transaction timestamp on a transaction restart. 1151 for _, union := range ba.Requests { 1152 if req, ok := union.GetInner().(*roachpb.ScanRequest); ok { 1153 if bytes.Contains(req.Key, testKey) && !kv.TestingIsRangeLookupRequest(req) { 1154 atomic.AddInt32(&clockUpdate, 1) 1155 now := c.Now() 1156 now.WallTime += advancement.Nanoseconds() 1157 c.Update(now) 1158 break 1159 } 1160 } 1161 } 1162 }, 1163 } 1164 1165 const refreshAttempts = 3 1166 clientTestingKnobs := &kvcoord.ClientTestingKnobs{ 1167 MaxTxnRefreshAttempts: refreshAttempts, 1168 } 1169 1170 params, _ := tests.CreateTestServerParams() 1171 params.Knobs.Store = storeTestingKnobs 1172 params.Knobs.KVClient = clientTestingKnobs 1173 s, sqlDB, _ := serverutils.StartServer(t, params) 1174 defer s.Stopper().Stop(context.Background()) 1175 1176 var restartDone int32 1177 cleanupFilter := cmdFilters.AppendFilter( 1178 func(args kvserverbase.FilterArgs) *roachpb.Error { 1179 // Allow a set number of restarts so that the auto retry on the 1180 // first few uncertainty interval errors also fails. 
1181 if atomic.LoadInt32(&restartDone) > refreshAttempts { 1182 return nil 1183 } 1184 1185 if req, ok := args.Req.(*roachpb.ScanRequest); ok { 1186 if bytes.Contains(req.Key, testKey) && !kv.TestingIsRangeLookupRequest(req) { 1187 atomic.AddInt32(&restartDone, 1) 1188 // Return ReadWithinUncertaintyIntervalError to update the transaction timestamp on retry. 1189 txn := args.Hdr.Txn 1190 txn.ResetObservedTimestamps() 1191 now := s.Clock().Now() 1192 txn.UpdateObservedTimestamp(s.(*server.TestServer).Gossip().NodeID.Get(), now) 1193 return roachpb.NewErrorWithTxn(roachpb.NewReadWithinUncertaintyIntervalError(now, now, txn), txn) 1194 } 1195 } 1196 return nil 1197 }, false) 1198 defer cleanupFilter() 1199 1200 sqlDB.SetMaxOpenConns(1) 1201 if _, err := sqlDB.Exec(` 1202 CREATE DATABASE t; 1203 CREATE TABLE t.test (k TEXT PRIMARY KEY, v TEXT); 1204 INSERT INTO t.test (k, v) VALUES ('test_key', 'test_val'); 1205 `); err != nil { 1206 t.Fatal(err) 1207 } 1208 // Acquire the lease and enable the auto-retry. The first few read attempts 1209 // will trigger ReadWithinUncertaintyIntervalError and advance the 1210 // transaction timestamp due to txnSpanRefresher-initiated span refreshes. 1211 // The transaction timestamp will exceed the lease expiration time, and the 1212 // last read attempt will re-acquire the lease. 1213 if _, err := sqlDB.Exec(` 1214 SELECT * from t.test WHERE k = 'test_key'; 1215 `); err != nil { 1216 t.Fatal(err) 1217 } 1218 1219 if u := atomic.LoadInt32(&clockUpdate); u != 1 { 1220 t.Errorf("expected exacltly one clock update, but got %d", u) 1221 } 1222 if u, e := atomic.LoadInt32(&restartDone), int32(refreshAttempts+1); u != e { 1223 t.Errorf("expected exactly %d restarts, but got %d", e, u) 1224 } 1225 } 1226 1227 // Verifies that the uncommitted descriptor cache is flushed on a txn restart. 
1228 // 1229 // This test triggers the above scenario by triggering a restart by returning 1230 // ReadWithinUncertaintyIntervalError on the first transaction attempt. 1231 func TestFlushUncommitedDescriptorCacheOnRestart(t *testing.T) { 1232 defer leaktest.AfterTest(t)() 1233 1234 var cmdFilters tests.CommandFilters 1235 cmdFilters.AppendFilter(tests.CheckEndTxnTrigger, true) 1236 testKey := []byte("test_key") 1237 testingKnobs := &kvserver.StoreTestingKnobs{ 1238 EvalKnobs: kvserverbase.BatchEvalTestingKnobs{ 1239 TestingEvalFilter: cmdFilters.RunFilters, 1240 }, 1241 } 1242 1243 params, _ := tests.CreateTestServerParams() 1244 params.Knobs.Store = testingKnobs 1245 s, sqlDB, _ := serverutils.StartServer(t, params) 1246 defer s.Stopper().Stop(context.Background()) 1247 1248 var restartDone int32 1249 cleanupFilter := cmdFilters.AppendFilter( 1250 func(args kvserverbase.FilterArgs) *roachpb.Error { 1251 if atomic.LoadInt32(&restartDone) > 0 { 1252 return nil 1253 } 1254 1255 if req, ok := args.Req.(*roachpb.ScanRequest); ok { 1256 if bytes.Contains(req.Key, testKey) && !kv.TestingIsRangeLookupRequest(req) { 1257 atomic.AddInt32(&restartDone, 1) 1258 // Return ReadWithinUncertaintyIntervalError. 1259 txn := args.Hdr.Txn 1260 txn.ResetObservedTimestamps() 1261 now := s.Clock().Now() 1262 txn.UpdateObservedTimestamp(s.(*server.TestServer).Gossip().NodeID.Get(), now) 1263 return roachpb.NewErrorWithTxn(roachpb.NewReadWithinUncertaintyIntervalError(now, now, txn), txn) 1264 } 1265 } 1266 return nil 1267 }, false) 1268 defer cleanupFilter() 1269 1270 sqlDB.SetMaxOpenConns(1) 1271 if _, err := sqlDB.Exec(` 1272 CREATE DATABASE t; 1273 CREATE TABLE t.test (k TEXT PRIMARY KEY, v TEXT); 1274 INSERT INTO t.test (k, v) VALUES ('test_key', 'test_val'); 1275 `); err != nil { 1276 t.Fatal(err) 1277 } 1278 // Read from a table, rename it, and then read from the table to trigger 1279 // the retry. 
On the second attempt the first read from the table should 1280 // not see the uncommitted renamed table. 1281 if _, err := sqlDB.Exec(` 1282 BEGIN; 1283 SELECT * from t.test WHERE k = 'foobar'; 1284 ALTER TABLE t.test RENAME TO t.foo; 1285 SELECT * from t.foo WHERE k = 'test_key'; 1286 COMMIT; 1287 `); err != nil { 1288 t.Fatal(err) 1289 } 1290 1291 if u := atomic.LoadInt32(&restartDone); u != 1 { 1292 t.Errorf("expected exactly one restart, but got %d", u) 1293 } 1294 } 1295 1296 // Test that retryable errors are handled properly through DistSQL. 1297 func TestDistSQLRetryableError(t *testing.T) { 1298 defer leaktest.AfterTest(t)() 1299 1300 // One of the rows in the table. 1301 targetKey := roachpb.Key("\275\211\212") 1302 1303 restarted := true 1304 1305 tc := serverutils.StartTestCluster(t, 3, /* numNodes */ 1306 base.TestClusterArgs{ 1307 ReplicationMode: base.ReplicationManual, 1308 ServerArgs: base.TestServerArgs{ 1309 UseDatabase: "test", 1310 Knobs: base.TestingKnobs{ 1311 Store: &kvserver.StoreTestingKnobs{ 1312 EvalKnobs: kvserverbase.BatchEvalTestingKnobs{ 1313 TestingEvalFilter: func(fArgs kvserverbase.FilterArgs) *roachpb.Error { 1314 _, ok := fArgs.Req.(*roachpb.ScanRequest) 1315 if ok && fArgs.Req.Header().Key.Equal(targetKey) && fArgs.Hdr.Txn.Epoch == 0 { 1316 restarted = true 1317 err := roachpb.NewReadWithinUncertaintyIntervalError( 1318 fArgs.Hdr.Timestamp, /* readTS */ 1319 hlc.Timestamp{}, 1320 nil) 1321 errTxn := fArgs.Hdr.Txn.Clone() 1322 errTxn.UpdateObservedTimestamp(roachpb.NodeID(2), hlc.Timestamp{}) 1323 pErr := roachpb.NewErrorWithTxn(err, errTxn) 1324 pErr.OriginNode = 2 1325 return pErr 1326 } 1327 1328 return nil 1329 }, 1330 }, 1331 }, 1332 }, 1333 }, 1334 }) 1335 defer tc.Stopper().Stop(context.Background()) 1336 1337 db := tc.ServerConn(0) 1338 sqlutils.CreateTable(t, db, "t", 1339 "num INT PRIMARY KEY", 1340 3, /* numRows */ 1341 sqlutils.ToRowFn(sqlutils.RowIdxFn)) 1342 1343 // We're going to split one of the tables, but node 
4 is unaware of this. 1344 _, err := db.Exec(fmt.Sprintf(` 1345 ALTER TABLE "t" SPLIT AT VALUES (1), (2), (3); 1346 ALTER TABLE "t" EXPERIMENTAL_RELOCATE VALUES (ARRAY[%d], 1), (ARRAY[%d], 2), (ARRAY[%d], 3); 1347 `, 1348 tc.Server(1).GetFirstStoreID(), 1349 tc.Server(0).GetFirstStoreID(), 1350 tc.Server(2).GetFirstStoreID())) 1351 if err != nil { 1352 t.Fatal(err) 1353 } 1354 1355 db.SetMaxOpenConns(1) 1356 1357 if _, err := db.Exec("SET DISTSQL = ON"); err != nil { 1358 t.Fatal(err) 1359 } 1360 1361 // Test that a stand-alone statement is retried by the Executor. 1362 if _, err := db.Exec("SELECT count(1) FROM t"); err != nil { 1363 t.Fatal(err) 1364 } 1365 if !restarted { 1366 t.Fatalf("expected the EvalFilter to restart the txn, but it didn't") 1367 } 1368 1369 // Test that a command that can't be retried automatically generates an error 1370 // with the correct code. 1371 restarted = false 1372 1373 txn, err := db.Begin() 1374 if err != nil { 1375 t.Fatal(err) 1376 } 1377 // Run a batch of statements to move the txn out of the "AutoRetry" state. 1378 if _, err := txn.Exec("SELECT 1"); err != nil { 1379 t.Fatal(err) 1380 } 1381 1382 // Let's make sure that DISTSQL will actually be used. 1383 row := txn.QueryRow(`SELECT automatic FROM [EXPLAIN (DISTSQL) SELECT count(1) FROM t]`) 1384 var automatic bool 1385 if err := row.Scan(&automatic); err != nil { 1386 t.Fatal(err) 1387 } 1388 if !automatic { 1389 t.Fatal("DISTSQL not used for test's query") 1390 } 1391 1392 _, err = txn.Exec("SELECT count(1) FROM t") 1393 if !restarted { 1394 t.Fatalf("expected the EvalFilter to restart the txn, but it didn't") 1395 } 1396 if err == nil { 1397 t.Fatal("expected retryable error") 1398 } 1399 if !isRetryableErr(err) { 1400 t.Fatalf("expected retryable error, got: %s", err) 1401 } 1402 1403 if err := txn.Rollback(); err != nil { 1404 t.Fatal(err) 1405 } 1406 1407 // Test that ORDER BY properly propagates retryable errors. 
The weird 1408 // ordering criteria is to ensure that the ORDER BY is present and not elided 1409 // because we're ordering on the primary key column. 1410 restarted = false 1411 rows, err := db.Query("SELECT * FROM t ORDER BY upper(num::TEXT)") 1412 if err != nil { 1413 t.Fatal(err) 1414 } 1415 var count int 1416 for rows.Next() { 1417 count++ 1418 } 1419 if count != 3 { 1420 t.Fatalf("expected 3 rows, but found %d", count) 1421 } 1422 if !restarted { 1423 t.Fatalf("expected the EvalFilter to restart the txn, but it didn't") 1424 } 1425 } 1426 1427 // TestRollbackToSavepointFromUnusualStates tests that issuing a ROLLBACK TO 1428 // SAVEPOINT from a non-retryable state works, and that the transaction that it 1429 // opens has the same attributes as the existing one. 1430 func TestRollbackToSavepointFromUnusualStates(t *testing.T) { 1431 defer leaktest.AfterTest(t)() 1432 1433 params, _ := tests.CreateTestServerParams() 1434 s, sqlDB, _ := serverutils.StartServer(t, params) 1435 defer s.Stopper().Stop(context.Background()) 1436 1437 checkState := func(tx *gosql.Tx, ts time.Time) { 1438 t.Helper() 1439 var pri string 1440 r := tx.QueryRow("SHOW TRANSACTION PRIORITY") 1441 if err := r.Scan(&pri); err != nil { 1442 t.Fatal(err) 1443 } else { 1444 if pri != "high" { 1445 t.Errorf("Expected high, got: %s", pri) 1446 } 1447 } 1448 } 1449 1450 tx, err := sqlDB.Begin() 1451 if err != nil { 1452 t.Fatal(err) 1453 } 1454 if _, err := tx.Exec("SET TRANSACTION PRIORITY HIGH"); err != nil { 1455 t.Fatal(err) 1456 } 1457 if _, err := tx.Exec("SAVEPOINT cockroach_restart"); err != nil { 1458 t.Fatal(err) 1459 } 1460 1461 var ts time.Time 1462 r := tx.QueryRow("SELECT now()") 1463 if err := r.Scan(&ts); err != nil { 1464 t.Fatal(err) 1465 } 1466 1467 checkState(tx, ts) 1468 1469 // ROLLBACK TO SAVEPOINT from an Open txn should work. 
1470 if _, err := tx.Exec("ROLLBACK TO SAVEPOINT cockroach_restart"); err != nil { 1471 t.Fatal(err) 1472 } 1473 checkState(tx, ts) 1474 1475 // ROLLBACK TO SAVEPOINT from an Aborted txn should work. 1476 if _, err := tx.Exec("BOGUS SQL STATEMENT"); !testutils.IsError(err, `at or near "bogus": syntax error`) { 1477 t.Fatalf("unexpected error: %v", err) 1478 } 1479 if _, err := tx.Exec("ROLLBACK TO SAVEPOINT cockroach_restart"); err != nil { 1480 t.Fatalf("unexpected error: %s", err) 1481 } 1482 checkState(tx, ts) 1483 1484 if err := tx.Rollback(); err != nil { 1485 t.Fatal(err) 1486 } 1487 } 1488 1489 // Test that, if we'd otherwise perform an auto-retry but results for the 1490 // current txn have already been streamed to the client, we don't do the 1491 // auto-restart. 1492 func TestTxnAutoRetriesDisabledAfterResultsHaveBeenSentToClient(t *testing.T) { 1493 defer leaktest.AfterTest(t)() 1494 1495 params, _ := tests.CreateTestServerParams() 1496 s, sqlDB, _ := serverutils.StartServer(t, params) 1497 defer s.Stopper().Stop(context.Background()) 1498 1499 tests := []struct { 1500 name string 1501 autoCommit bool 1502 clientDirectedRetry bool 1503 expectedTxnStateAfterRetriableErr string 1504 }{ 1505 { 1506 name: "client_directed_retries", 1507 clientDirectedRetry: true, 1508 expectedTxnStateAfterRetriableErr: "Aborted", 1509 }, 1510 { 1511 name: "no_client_directed_retries", 1512 clientDirectedRetry: false, 1513 expectedTxnStateAfterRetriableErr: "Aborted", 1514 }, 1515 { 1516 name: "autocommit", 1517 autoCommit: true, 1518 expectedTxnStateAfterRetriableErr: "NoTxn", 1519 }, 1520 } 1521 for _, tc := range tests { 1522 t.Run(tc.name, func(t *testing.T) { 1523 // Cleanup the connection state after each test so the next one can run 1524 // statements. 1525 // TODO(andrei): Once we're on go 1.9, this test should use the new 1526 // db.Conn() method to tie each test to a connection; then this cleanup 1527 // wouldn't be necessary. 
Also, the test is currently technically 1528 // incorrect, as there's no guarantee that the state check at the end will 1529 // happen on the right connection. 1530 defer func() { 1531 if tc.autoCommit { 1532 // No cleanup necessary. 1533 return 1534 } 1535 if _, err := sqlDB.Exec("ROLLBACK"); err != nil { 1536 t.Fatal(err) 1537 } 1538 }() 1539 1540 var savepoint string 1541 if tc.clientDirectedRetry { 1542 savepoint = "SAVEPOINT cockroach_restart;" 1543 } 1544 1545 var prefix, suffix string 1546 if !tc.autoCommit { 1547 prefix = "BEGIN; " + savepoint 1548 suffix = "COMMIT;" 1549 } 1550 1551 // We'll run a statement that produces enough results to overflow the 1552 // buffers and start streaming results to the client before the retriable 1553 // error is injected. We do this by running a generate series that blows 1554 // up at the very end, with a CASE statement. 1555 sql := fmt.Sprintf(` 1556 %s 1557 SELECT 1558 CASE x 1559 WHEN 10000 THEN crdb_internal.force_retry('1s') 1560 ELSE x 1561 END 1562 FROM generate_series(1, 10000) AS t(x); 1563 %s`, 1564 prefix, suffix) 1565 _, err := sqlDB.Exec(sql) 1566 if !isRetryableErr(err) { 1567 t.Fatalf("expected retriable error, got: %v", err) 1568 } 1569 var state string 1570 if err := sqlDB.QueryRow("SHOW TRANSACTION STATUS").Scan(&state); err != nil { 1571 t.Fatal(err) 1572 } 1573 if expStateStr := tc.expectedTxnStateAfterRetriableErr; state != expStateStr { 1574 t.Fatalf("expected state %s, got: %s", expStateStr, state) 1575 } 1576 }) 1577 } 1578 }