github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/txn_restart_test.go (about)

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package sql_test
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	gosql "database/sql"
    17  	"fmt"
    18  	"net/url"
    19  	"regexp"
    20  	"strconv"
    21  	"strings"
    22  	"sync/atomic"
    23  	"testing"
    24  	"time"
    25  
    26  	"github.com/cockroachdb/cockroach/pkg/base"
    27  	"github.com/cockroachdb/cockroach/pkg/kv"
    28  	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
    29  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
    30  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    31  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    32  	"github.com/cockroachdb/cockroach/pkg/security"
    33  	"github.com/cockroachdb/cockroach/pkg/server"
    34  	"github.com/cockroachdb/cockroach/pkg/sql"
    35  	"github.com/cockroachdb/cockroach/pkg/sql/tests"
    36  	"github.com/cockroachdb/cockroach/pkg/testutils"
    37  	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
    38  	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
    39  	"github.com/cockroachdb/cockroach/pkg/util/caller"
    40  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    41  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    42  	"github.com/cockroachdb/cockroach/pkg/util/log"
    43  	"github.com/cockroachdb/cockroach/pkg/util/shuffle"
    44  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    45  	"github.com/cockroachdb/errors"
    46  	"github.com/lib/pq"
    47  	"github.com/stretchr/testify/require"
    48  )
    49  
// failureRecord remembers an error injected by injectErrors together with the
// transaction that was running the request that received it.
type failureRecord struct {
	// err is the injected error.
	err error
	// txn is the transaction taken from the header of the failed request.
	txn *roachpb.Transaction
}
    54  
    55  type filterVals struct {
    56  	syncutil.Mutex
    57  	// key -> number of times an retriable error will be injected when that key
    58  	// is written.
    59  	restartCounts map[string]int
    60  	// key -> number of times a TransactionAborted error will be injected when
    61  	// that key is written. Note that injecting this is pretty funky: it can only
    62  	// be done on the first write of a txn, otherwise the previously written
    63  	// intents will linger on.
    64  	abortCounts map[string]int
    65  
    66  	// Keys for which we injected an error.
    67  	failedValues map[string]failureRecord
    68  }
    69  
    70  func createFilterVals(restartCounts map[string]int, abortCounts map[string]int) *filterVals {
    71  	return &filterVals{
    72  		restartCounts: restartCounts,
    73  		abortCounts:   abortCounts,
    74  		failedValues:  map[string]failureRecord{},
    75  	}
    76  }
    77  
    78  // checkCorrectTxn checks that the current txn is the correct one, according to
    79  // the way the previous txn that tried to write value failed.
    80  func checkCorrectTxn(value string, magicVals *filterVals, txn *roachpb.Transaction) error {
    81  	failureRec, found := magicVals.failedValues[value]
    82  	if !found {
    83  		return nil
    84  	}
    85  	if errors.HasType(failureRec.err, (*roachpb.TransactionAbortedError)(nil)) {
    86  		// The previous txn should have been aborted, so check that we're running
    87  		// in a new one.
    88  		if failureRec.txn.ID == txn.ID {
    89  			return errors.Errorf(`new transaction for value "%s" is the same as the old one`, value)
    90  		}
    91  	} else {
    92  		// The previous txn should have been restarted, so we should be running in
    93  		// the same one.
    94  		if failureRec.txn.ID != txn.ID {
    95  			return errors.Errorf(`new transaction for value "%s" (%s) is not the same as the old one (%s)`, value, txn, failureRec.txn)
    96  		}
    97  	}
    98  	// Don't check this value in subsequent transactions.
    99  	delete(magicVals.failedValues, value)
   100  
   101  	return nil
   102  }
   103  
   104  type injectionApproach struct {
   105  	counts map[string]int
   106  	errFn  func() error
   107  }
   108  
   109  type injectionApproaches []injectionApproach
   110  
   111  func (ia injectionApproaches) Len() int      { return len(ia) }
   112  func (ia injectionApproaches) Swap(i, j int) { ia[i], ia[j] = ia[j], ia[i] }
   113  
   114  func injectErrors(
   115  	req roachpb.Request, hdr roachpb.Header, magicVals *filterVals, verifyTxn bool,
   116  ) error {
   117  	magicVals.Lock()
   118  	defer magicVals.Unlock()
   119  
   120  	switch req := req.(type) {
   121  	case *roachpb.ConditionalPutRequest:
   122  		// Create a list of each injection approach and shuffle the order of
   123  		// injection for some additional randomness.
   124  		injections := injectionApproaches{
   125  			{counts: magicVals.restartCounts, errFn: func() error {
   126  				// Note we use a retry error that cannot be automatically retried
   127  				// by the transaction coord sender.
   128  				return roachpb.NewTransactionRetryError(roachpb.RETRY_REASON_UNKNOWN, "injected err")
   129  			}},
   130  			{counts: magicVals.abortCounts, errFn: func() error {
   131  				return roachpb.NewTransactionAbortedError(roachpb.ABORT_REASON_ABORTED_RECORD_FOUND)
   132  			}},
   133  		}
   134  		shuffle.Shuffle(injections)
   135  
   136  		for _, injection := range injections {
   137  			for key, count := range injection.counts {
   138  				if verifyTxn {
   139  					if err := checkCorrectTxn(string(req.Value.RawBytes), magicVals, hdr.Txn); err != nil {
   140  						return err
   141  					}
   142  				}
   143  				if count > 0 && bytes.Contains(req.Value.RawBytes, []byte(key)) {
   144  					injection.counts[key]--
   145  					err := injection.errFn()
   146  					magicVals.failedValues[string(req.Value.RawBytes)] = failureRecord{err, hdr.Txn}
   147  					return err
   148  				}
   149  			}
   150  		}
   151  		return nil
   152  	default:
   153  		return nil
   154  	}
   155  }
   156  
   157  // checkRestart checks that there are no errors left to inject.
   158  func checkRestarts(t *testing.T, magicVals *filterVals) {
   159  	magicVals.Lock()
   160  	defer magicVals.Unlock()
   161  	for key, count := range magicVals.restartCounts {
   162  		if count != 0 {
   163  			file, line, _ := caller.Lookup(1)
   164  			t.Errorf("%s:%d: INSERT for \"%s\" still has to be retried %d times",
   165  				file, line, key, count)
   166  		}
   167  	}
   168  	for key, count := range magicVals.abortCounts {
   169  		if count != 0 {
   170  			file, line, _ := caller.Lookup(1)
   171  			t.Errorf("%s:%d: INSERT for \"%s\" still has to be aborted %d times",
   172  				file, line, key, count)
   173  		}
   174  	}
   175  	if t.Failed() {
   176  		t.Fatalf("checking error injection failed")
   177  	}
   178  }
   179  
// TxnAborter can be used to listen for transactions running particular
// SQL statements; the trapped transactions will be aborted.
//
// The TxnAborter needs to be hooked up to a server's SQL executor testing
// knobs, so that the aborter sees what statements are being executed.
// executorKnobs() returns a suitable sql.ExecutorTestingKnobs: it installs the
// aborter as the StatementFilter and sets DisableAutoCommit, without which
// implicit transactions won't have a chance to be aborted.
//
// A statement can be registered for abortion (meaning, the statement's
// transaction will be TransactionAborted) with QueueStmtForAbortion(). When the
// aborter sees that statement, it will run a higher priority transaction that
// tramples the data, so the original transaction will get a TransactionAborted
// error when it tries to commit.
//
// Note that transactions cannot be aborted using an injected error, since we
// want the pusher to clean up the intents of the pushee.
//
// The aborter only works with INSERT statements operating on the table t.test
// defined as:
//	`CREATE DATABASE t; CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT)`
// The TxnAborter runs transactions deleting the row for the `k` that the
// trapped transactions were writing to.
//
// Example usage:
//
//	func TestTxnAutoRetry(t *testing.T) {
//		defer leaktest.AfterTest(t)()
//		aborter := NewTxnAborter()
//		defer aborter.Close(t)
//		params, _ := tests.CreateTestServerParams()
//		params.Knobs.SQLExecutor = aborter.executorKnobs()
//		s, sqlDB, _ := serverutils.StartServer(t, params)
//		defer s.Stopper().Stop(context.Background())
//		{
//			pgURL, cleanup := sqlutils.PGUrl(t, s.ServingSQLAddr(), "TestTxnAutoRetry", url.User(security.RootUser))
//			defer cleanup()
//			if err := aborter.Init(pgURL); err != nil {
//				t.Fatal(err)
//			}
//		}
//
//		sqlDB.Exec(`CREATE DATABASE t; CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT)`)
//		const sentinelInsert = "INSERT INTO t.public.test(k, v) VALUES (0, 'sentinel')"
//		if err := aborter.QueueStmtForAbortion(
//			sentinelInsert, 1 /* abortCount */, true /* willBeRetriedIbid */,
//		); err != nil {
//			t.Fatal(err)
//		}
//		sqlDB.Exec(sentinelInsert)
//	...
type TxnAborter struct {
	mu struct {
		syncutil.Mutex
		// stmtsToAbort maps a statement's text to the abort bookkeeping for it.
		stmtsToAbort map[string]*restartInfo
	}
	// A second connection pool, to be used by aborts. It is opened by Init.
	// This is needed because the main conn pool is going to be restricted to one
	// connection.
	// TODO(andrei): remove this if we ever move to using libpq conns directly.
	// See TODOs around on SetMaxOpenConns.
	abortDB *gosql.DB
}
   243  
// restartInfo is the TxnAborter's bookkeeping for one queued statement.
type restartInfo struct {
	// The numeric value being inserted in col 'k'.
	key int
	// The remaining number of times to abort the txn.
	abortCount int
	// satisfied is set once the statement has been seen while abortCount was
	// zero.
	satisfied bool
	// checkSatisfied controls whether Verify requires satisfied to be set.
	checkSatisfied bool
	// The number of times the statement has been executed.
	execCount int
}
   254  
   255  func NewTxnAborter() *TxnAborter {
   256  	ta := new(TxnAborter)
   257  	ta.mu.stmtsToAbort = make(map[string]*restartInfo)
   258  	return ta
   259  }
   260  
   261  func (ta *TxnAborter) Init(pgURL url.URL) error {
   262  	abortDB, err := gosql.Open("postgres", pgURL.String())
   263  	if err != nil {
   264  		return err
   265  	}
   266  	ta.abortDB = abortDB
   267  	return nil
   268  }
   269  
   270  var valuesRE = regexp.MustCompile(`VALUES.*\((\d),`)
   271  
   272  // QueueStmtForAbortion registers a statement whose transaction will be aborted.
   273  //
   274  // stmt needs to be the statement, literally as the AST gets converted back to a
   275  // string. Note that, since we sometimes change the AST during planning, the
   276  // statements sent for execution that need to be intercepted by this filter
   277  // need to be written in a canonical form, and stmt passed here needs to also be
   278  // that canonical form. In particular, table names need to be fully qualified
   279  // with the schema (e.g. t.public.test).
   280  //
   281  // abortCount specifies how many times a txn running this statement will be
   282  // aborted.
   283  // willBeRetriedIbid should be set if the statement will be retried by the test
   284  // (as an identical statement). This allows the TxnAborter to assert, on
   285  // Close(), that the statement has been retried the intended number of times by
   286  // the end of the test (besides asserting that an error was injected the right
   287  // number of times. So, the Aborter can be used to check that the retry
   288  // machinery has done its job. The Aborter will consider the statement to have
   289  // been retried correctly if the statement has been executed at least once after
   290  // the Aborter is done injecting errors because of it. So normally we'd expect
   291  // this statement to executed RestartCount + 1 times, but we allow it to be
   292  // retried more times because the statement's txn might also retried because of
   293  // other statements.
   294  //
   295  // Calling QueueStmtForAbortion repeatedly with the same stmt is allowed, and
   296  // each call checks that the previous one was satisfied.
   297  func (ta *TxnAborter) QueueStmtForAbortion(
   298  	stmt string, abortCount int, willBeRetriedIbid bool,
   299  ) error {
   300  	ta.mu.Lock()
   301  	defer ta.mu.Unlock()
   302  	if ri, ok := ta.mu.stmtsToAbort[stmt]; ok {
   303  		// If we're overwriting a statement that was already queued, verify it
   304  		// first.
   305  		if err := ri.Verify(); err != nil {
   306  			return errors.Wrapf(err, `statement "%s" error`, stmt)
   307  		}
   308  	}
   309  	// Extract the "key" - the value of the first col, which will be trampled on.
   310  	switch matches := valuesRE.FindStringSubmatch(stmt); len(matches) {
   311  	case 0, 1:
   312  		return errors.Errorf(`bad statement "%s": key col not found`, stmt)
   313  	default:
   314  		key, err := strconv.Atoi(matches[1])
   315  		if err != nil {
   316  			return errors.Wrapf(err, `bad statement "%s"`, stmt)
   317  		}
   318  		ta.mu.stmtsToAbort[stmt] = &restartInfo{
   319  			key:            key,
   320  			abortCount:     abortCount,
   321  			satisfied:      false,
   322  			checkSatisfied: willBeRetriedIbid,
   323  		}
   324  		return nil
   325  	}
   326  }
   327  
   328  // GetExecCount returns the number of times a statement has been seen.
   329  // You probably don't want to call this while the TxnAborter might be in
   330  // the process of aborting the txn containing stmt, as the result will not be
   331  // deterministic.
   332  func (ta *TxnAborter) GetExecCount(stmt string) (int, bool) {
   333  	ta.mu.Lock()
   334  	defer ta.mu.Unlock()
   335  	if ri, ok := ta.mu.stmtsToAbort[stmt]; ok {
   336  		return ri.execCount, true
   337  	}
   338  	return 0, false
   339  }
   340  
   341  func (ta *TxnAborter) statementFilter(ctx context.Context, stmt string, err error) {
   342  	ta.mu.Lock()
   343  	log.Infof(ctx, "statement filter running on: %s, with err=%v", stmt, err)
   344  	ri, ok := ta.mu.stmtsToAbort[stmt]
   345  	shouldAbort := false
   346  	if ok {
   347  		ri.execCount++
   348  		if ri.abortCount == 0 {
   349  			log.VEventf(ctx, 1, "TxnAborter sees satisfied statement %q", stmt)
   350  			ri.satisfied = true
   351  		}
   352  		if ri.abortCount > 0 && err == nil {
   353  			log.Infof(ctx, "TxnAborter aborting txn for statement %q", stmt)
   354  			ri.abortCount--
   355  			shouldAbort = true
   356  		}
   357  	}
   358  	ta.mu.Unlock()
   359  	if shouldAbort {
   360  		if err := ta.abortTxn(ri.key); err != nil {
   361  			panic(fmt.Sprintf("TxnAborter failed to abort: %s", err))
   362  		}
   363  	}
   364  }
   365  
   366  // executorKnobs are the bridge between the TxnAborter and the sql.Executor.
   367  func (ta *TxnAborter) executorKnobs() base.ModuleTestingKnobs {
   368  	return &sql.ExecutorTestingKnobs{
   369  		// We're going to abort txns using a TxnAborter, and that's incompatible
   370  		// with AutoCommit.
   371  		DisableAutoCommit: true,
   372  		StatementFilter:   ta.statementFilter,
   373  	}
   374  }
   375  
   376  // abortTxn writes to a key and as a side effect aborts a txn that had an intent
   377  // on that key.
   378  func (ta *TxnAborter) abortTxn(key int) error {
   379  	tx, err := ta.abortDB.Begin()
   380  	if err != nil {
   381  		return err
   382  	}
   383  	if _, err := tx.Exec("SET TRANSACTION PRIORITY HIGH"); err != nil {
   384  		return err
   385  	}
   386  	if _, err := tx.Exec("DELETE FROM t.test WHERE k = $1", key); err != nil {
   387  		return err
   388  	}
   389  	if err = tx.Commit(); err != nil {
   390  		return err
   391  	}
   392  	return nil
   393  }
   394  
   395  type TxnAborterVerifierError struct {
   396  	errs []error
   397  }
   398  
   399  func (e *TxnAborterVerifierError) Error() string {
   400  	strs := make([]string, 0)
   401  	for _, err := range e.errs {
   402  		strs = append(strs, err.Error())
   403  	}
   404  	return strings.Join(strs, "\n")
   405  }
   406  
   407  func (ta *TxnAborter) VerifyAndClear() error {
   408  	ta.mu.Lock()
   409  	defer ta.mu.Unlock()
   410  	allErr := TxnAborterVerifierError{}
   411  	for stmt, ri := range ta.mu.stmtsToAbort {
   412  		if err := ri.Verify(); err != nil {
   413  			allErr.errs = append(allErr.errs, errors.Wrapf(err, `statement "%s" error`, stmt))
   414  		}
   415  	}
   416  	ta.mu.stmtsToAbort = make(map[string]*restartInfo)
   417  	if len(allErr.errs) != 0 {
   418  		return &allErr
   419  	}
   420  	return nil
   421  }
   422  
   423  func (ta *TxnAborter) Close(t testing.TB) {
   424  	ta.abortDB.Close()
   425  	if err := ta.VerifyAndClear(); err != nil {
   426  		file, line, _ := caller.Lookup(1)
   427  		t.Errorf("%s:%d %s", file, line, err)
   428  	}
   429  }
   430  
   431  func (ri *restartInfo) Verify() error {
   432  	if ri.abortCount != 0 {
   433  		return errors.Errorf("%d additional aborts expected", ri.abortCount)
   434  	}
   435  	if ri.checkSatisfied && !ri.satisfied {
   436  		return errors.New("previous abort did not result in a retry")
   437  	}
   438  	return nil
   439  }
   440  
   441  // Test the logic in the sql executor for automatically retrying txns in case of
   442  // retriable errors.
   443  func TestTxnAutoRetry(t *testing.T) {
   444  	defer leaktest.AfterTest(t)()
   445  
   446  	aborter := NewTxnAborter()
   447  	defer aborter.Close(t)
   448  	params, cmdFilters := tests.CreateTestServerParams()
   449  	params.Knobs.SQLExecutor = aborter.executorKnobs()
   450  	s, sqlDB, _ := serverutils.StartServer(t, params)
   451  	defer s.Stopper().Stop(context.Background())
   452  	{
   453  		pgURL, cleanup := sqlutils.PGUrl(t, s.ServingSQLAddr(), "TestTxnAutoRetry", url.User(security.RootUser))
   454  		defer cleanup()
   455  		if err := aborter.Init(pgURL); err != nil {
   456  			t.Fatal(err)
   457  		}
   458  	}
   459  
   460  	// Make sure all the commands we send in this test are sent over the same connection.
   461  	// This is a bit of a hack; in Go you're not supposed to have connection state
   462  	// outside of using a db.Tx. But we can't use a db.Tx here, because we want
   463  	// to control the batching of BEGIN/COMMIT statements.
   464  	// This SetMaxOpenConns is pretty shady, it doesn't guarantee that you'll be using
   465  	// the *same* one connection across calls. A proper solution would be to use a
   466  	// lib/pq connection directly. As of Feb 2016, there's code in cli/sql_util.go to
   467  	// do that.
   468  	sqlDB.SetMaxOpenConns(1)
   469  
   470  	if _, err := sqlDB.Exec(`
   471  CREATE DATABASE t;
   472  CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT, t DECIMAL);
   473  `); err != nil {
   474  		t.Fatal(err)
   475  	}
   476  
   477  	// Set up error injection that causes retries.
   478  	magicVals := createFilterVals(nil, nil)
   479  	magicVals.restartCounts = map[string]int{
   480  		"boulanger": 2,
   481  		"dromedary": 2,
   482  		"fajita":    2,
   483  		"hooly":     2,
   484  		"josephine": 2,
   485  		"laureal":   2,
   486  	}
   487  	magicVals.abortCounts = map[string]int{
   488  		"boulanger": 2,
   489  	}
   490  	cleanupFilter := cmdFilters.AppendFilter(
   491  		func(args kvserverbase.FilterArgs) *roachpb.Error {
   492  			if err := injectErrors(args.Req, args.Hdr, magicVals, true /* verifyTxn */); err != nil {
   493  				return roachpb.NewErrorWithTxn(err, args.Hdr.Txn)
   494  			}
   495  			return nil
   496  		}, false)
   497  
   498  	if err := aborter.QueueStmtForAbortion(
   499  		"INSERT INTO t.public.test(k, v, t) VALUES (1, 'boulanger', cluster_logical_timestamp()) RETURNING 1", 2 /* abortCount */, true, /* willBeRetriedIbid */
   500  	); err != nil {
   501  		t.Fatal(err)
   502  	}
   503  	if err := aborter.QueueStmtForAbortion(
   504  		"INSERT INTO t.public.test(k, v, t) VALUES (2, 'dromedary', cluster_logical_timestamp()) RETURNING 1", 2 /* abortCount */, true, /* willBeRetriedIbid */
   505  	); err != nil {
   506  		t.Fatal(err)
   507  	}
   508  	if err := aborter.QueueStmtForAbortion(
   509  		"INSERT INTO t.public.test(k, v, t) VALUES (3, 'fajita', cluster_logical_timestamp()) RETURNING 1", 2 /* abortCount */, true, /* willBeRetriedIbid */
   510  	); err != nil {
   511  		t.Fatal(err)
   512  	}
   513  	if err := aborter.QueueStmtForAbortion(
   514  		"INSERT INTO t.public.test(k, v, t) VALUES (4, 'hooly', cluster_logical_timestamp()) RETURNING 1", 2 /* abortCount */, true, /* willBeRetriedIbid */
   515  	); err != nil {
   516  		t.Fatal(err)
   517  	}
   518  
   519  	// Test that implicit txns - txns for which we see all the statements and prefixes
   520  	// of txns (statements batched together with the BEGIN stmt) - are retried.
   521  	// We also exercise the SQL cluster logical timestamp in here, because
   522  	// this must be properly propagated across retries.
   523  	//
   524  	// The SELECT within the transaction also checks that discarded
   525  	// intermediate result sets are properly released: the result set it
   526  	// produces is accounted for by the session monitor, and if it is
   527  	// not properly released upon a retry the monitor will cause the
   528  	// server to panic (and thus the test to fail) when the connection
   529  	// is closed.
   530  	//
   531  	// TODO(knz): This test can be made more robust by exposing the
   532  	// current allocation count in monitor and checking that it has the
   533  	// same value at the beginning of each retry.
   534  	rows, err := sqlDB.Query(`
   535  INSERT INTO t.public.test(k, v, t) VALUES (1, 'boulanger', cluster_logical_timestamp()) RETURNING 1;
   536  BEGIN;
   537  INSERT INTO t.public.test(k, v, t) VALUES (2, 'dromedary', cluster_logical_timestamp()) RETURNING 1;
   538  INSERT INTO t.public.test(k, v, t) VALUES (3, 'fajita', cluster_logical_timestamp()) RETURNING 1;
   539  END;
   540  INSERT INTO t.public.test(k, v, t) VALUES (4, 'hooly', cluster_logical_timestamp()) RETURNING 1;
   541  BEGIN;
   542  INSERT INTO t.public.test(k, v, t) VALUES (5, 'josephine', cluster_logical_timestamp()) RETURNING 1;
   543  INSERT INTO t.public.test(k, v, t) VALUES (6, 'laureal', cluster_logical_timestamp()) RETURNING 1;
   544  `)
   545  	if err != nil {
   546  		t.Fatal(err)
   547  	}
   548  	defer rows.Close()
   549  
   550  	resSets := 0
   551  	for {
   552  		for rows.Next() {
   553  			resSets++
   554  		}
   555  		if !rows.NextResultSet() {
   556  			break
   557  		}
   558  	}
   559  	if err := rows.Err(); err != nil {
   560  		t.Fatal(err)
   561  	}
   562  	if resSets != 6 {
   563  		t.Fatalf("Expected 6 result sets, got %d", resSets)
   564  	}
   565  
   566  	cleanupFilter()
   567  
   568  	checkRestarts(t, magicVals)
   569  
   570  	if _, err := sqlDB.Exec("END"); err != nil {
   571  		t.Fatal(err)
   572  	}
   573  
   574  	// Check that the txns succeeded by reading the rows.
   575  	var count int
   576  	if err := sqlDB.QueryRow("SELECT count(*) FROM t.public.test").Scan(&count); err != nil {
   577  		t.Fatal(err)
   578  	}
   579  	if count != 6 {
   580  		t.Fatalf("Expected 6 rows, got %d", count)
   581  	}
   582  
   583  	// Now test that we don't retry what we shouldn't: insert an error into a txn
   584  	// we can't automatically retry (because it spans requests).
   585  
   586  	magicVals = createFilterVals(nil, nil)
   587  	magicVals.restartCounts = map[string]int{
   588  		"hooly": 2,
   589  	}
   590  	cleanupFilter = cmdFilters.AppendFilter(
   591  		func(args kvserverbase.FilterArgs) *roachpb.Error {
   592  			if err := injectErrors(args.Req, args.Hdr, magicVals, true /* verifyTxn */); err != nil {
   593  				return roachpb.NewErrorWithTxn(err, args.Hdr.Txn)
   594  			}
   595  			return nil
   596  		}, false)
   597  	defer cleanupFilter()
   598  
   599  	// Start a txn.
   600  	if _, err := sqlDB.Exec(`
   601  DELETE FROM t.public.test WHERE true;
   602  BEGIN;
   603  `); err != nil {
   604  		t.Fatal(err)
   605  	}
   606  
   607  	// Run a batch of statements to move the txn out of the AutoRetry state,
   608  	// otherwise the INSERT below would be automatically retried.
   609  	if _, err := sqlDB.Exec("SELECT 1"); err != nil {
   610  		t.Fatal(err)
   611  	}
   612  
   613  	// Continue the txn in a new request, which is not retriable.
   614  	_, err = sqlDB.Exec("INSERT INTO t.public.test(k, v, t) VALUES (4, 'hooly', cluster_logical_timestamp())")
   615  	require.Regexp(t, "RETRY_REASON_UNKNOWN - injected err", err)
   616  }
   617  
   618  // Test that aborted txn are only retried once.
   619  // Prevents regressions of #8456.
   620  func TestAbortedTxnOnlyRetriedOnce(t *testing.T) {
   621  	defer leaktest.AfterTest(t)()
   622  
   623  	aborter := NewTxnAborter()
   624  	defer aborter.Close(t)
   625  	params, _ := tests.CreateTestServerParams()
   626  	params.Knobs.SQLExecutor = aborter.executorKnobs()
   627  	s, sqlDB, _ := serverutils.StartServer(t, params)
   628  	defer s.Stopper().Stop(context.Background())
   629  	{
   630  		pgURL, cleanup := sqlutils.PGUrl(t, s.ServingSQLAddr(), "TestAbortedTxnOnlyRetriedOnce", url.User(security.RootUser))
   631  		defer cleanup()
   632  		if err := aborter.Init(pgURL); err != nil {
   633  			t.Fatal(err)
   634  		}
   635  	}
   636  
   637  	const insertStmt = "INSERT INTO t.public.test(k, v) VALUES (1, 'boulanger')"
   638  	if err := aborter.QueueStmtForAbortion(
   639  		insertStmt, 1 /* abortCount */, true, /* willBeRetriedIbid */
   640  	); err != nil {
   641  		t.Fatal(err)
   642  	}
   643  
   644  	if _, err := sqlDB.Exec(`
   645  CREATE DATABASE t;
   646  CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT);
   647  `); err != nil {
   648  		t.Fatal(err)
   649  	}
   650  
   651  	if _, err := sqlDB.Exec(insertStmt); err != nil {
   652  		t.Fatalf("unexpected error: %s", err)
   653  	}
   654  
   655  	execCount, ok := aborter.GetExecCount(insertStmt)
   656  	if !ok {
   657  		t.Fatalf("aborter has no state on %q", insertStmt)
   658  	}
   659  	if execCount != 2 {
   660  		t.Fatalf("expected %q to be executed 2 times, but got %d", insertStmt, execCount)
   661  	}
   662  }
   663  
   664  // rollbackStrategy is the type of statement which a client can use to
   665  // rollback aborted txns from retryable errors. We accept two statements
   666  // for rolling back to the cockroach_restart savepoint. See
   667  // *Executor.execStmtInAbortedTxn for more about transaction retries.
   668  type rollbackStrategy int
   669  
   670  const (
   671  	rollbackToSavepoint rollbackStrategy = iota
   672  	declareSavepoint
   673  )
   674  
   675  func (rs rollbackStrategy) SQLCommand() string {
   676  	switch rs {
   677  	case rollbackToSavepoint:
   678  		return "ROLLBACK TO SAVEPOINT cockroach_restart"
   679  	case declareSavepoint:
   680  		return "SAVEPOINT cockroach_restart"
   681  	}
   682  	panic("unreachable")
   683  }
   684  
   685  // exec takes a closure and executes it repeatedly as long as it says it needs
   686  // to be retried. The function also takes a rollback strategy, which specifies
   687  // the statement which the client will use to rollback aborted txns from retryable
   688  // errors.
   689  func retryExec(t *testing.T, sqlDB *gosql.DB, rs rollbackStrategy, fn func(*gosql.Tx) bool) {
   690  	tx, err := sqlDB.Begin()
   691  	if err != nil {
   692  		t.Fatal(err)
   693  	}
   694  	if _, err := tx.Exec(
   695  		"SAVEPOINT cockroach_restart; SET TRANSACTION PRIORITY LOW"); err != nil {
   696  		t.Fatal(err)
   697  	}
   698  
   699  	for fn(tx) {
   700  		if _, err := tx.Exec(rs.SQLCommand()); err != nil {
   701  			t.Fatal(err)
   702  		}
   703  	}
   704  	if err := tx.Commit(); err != nil {
   705  		t.Fatal(err)
   706  	}
   707  }
   708  
   709  // isRetryableErr returns whether the given error is a PG retryable error.
   710  func isRetryableErr(err error) bool {
   711  	var pqErr *pq.Error
   712  	return errors.As(err, &pqErr) && pqErr.Code == "40001"
   713  }
   714  
   715  // Returns true on retriable errors.
   716  func runTestTxn(
   717  	t *testing.T,
   718  	magicVals *filterVals,
   719  	expectedErr string,
   720  	sqlDB *gosql.DB,
   721  	tx *gosql.Tx,
   722  	sentinelInsert string,
   723  ) bool {
   724  	// Run a bogus statement to disable the automatic server retries of subsequent
   725  	// statements.
   726  	if _, err := tx.Exec("SELECT 1"); err != nil {
   727  		t.Fatal(err)
   728  	}
   729  
   730  	retriesNeeded :=
   731  		(magicVals.restartCounts["boulanger"] + magicVals.abortCounts["boulanger"]) > 0
   732  	if retriesNeeded {
   733  		_, err := tx.Exec("INSERT INTO t.public.test(k, v) VALUES (1, 'boulanger')")
   734  		if !testutils.IsError(err, expectedErr) {
   735  			t.Fatalf("unexpected error: %v", err)
   736  		}
   737  		return isRetryableErr(err)
   738  	}
   739  	// Now the INSERT should succeed.
   740  	if _, err := tx.Exec(
   741  		"DELETE FROM t.public.test WHERE true;" + sentinelInsert,
   742  	); err != nil {
   743  		t.Fatal(err)
   744  	}
   745  
   746  	_, err := tx.Exec("RELEASE SAVEPOINT cockroach_restart")
   747  	return isRetryableErr(err)
   748  }
   749  
// TestTxnUserRestart tests user-directed txn restarts.
   751  // The test will inject and otherwise create retriable errors of various kinds
   752  // and checks that we still manage to run a txn despite them.
   753  func TestTxnUserRestart(t *testing.T) {
   754  	defer leaktest.AfterTest(t)()
   755  
   756  	// Set up error injection that causes retries.
   757  	testCases := []struct {
   758  		magicVals   *filterVals
   759  		expectedErr string
   760  	}{
   761  		{
   762  			magicVals: createFilterVals(
   763  				map[string]int{"boulanger": 2}, // restartCounts
   764  				nil),
   765  			expectedErr: "RETRY_REASON_UNKNOWN",
   766  		},
   767  		{
   768  			magicVals: createFilterVals(
   769  				nil,
   770  				map[string]int{"boulanger": 2}), // abortCounts
   771  			expectedErr: regexp.QuoteMeta("TransactionAbortedError(ABORT_REASON_ABORTED_RECORD_FOUND)"),
   772  		},
   773  	}
   774  
   775  	for _, tc := range testCases {
   776  		for _, rs := range []rollbackStrategy{rollbackToSavepoint, declareSavepoint} {
   777  			t.Run(fmt.Sprintf("err=%s,stgy=%d", tc.expectedErr, rs), func(t *testing.T) {
   778  				aborter := NewTxnAborter()
   779  				defer aborter.Close(t)
   780  				params, cmdFilters := tests.CreateTestServerParams()
   781  				params.Knobs.SQLExecutor = aborter.executorKnobs()
   782  				s, sqlDB, _ := serverutils.StartServer(t, params)
   783  				defer s.Stopper().Stop(context.Background())
   784  				{
   785  					pgURL, cleanup := sqlutils.PGUrl(t, s.ServingSQLAddr(), "TestTxnUserRestart", url.User(security.RootUser))
   786  					defer cleanup()
   787  					if err := aborter.Init(pgURL); err != nil {
   788  						t.Fatal(err)
   789  					}
   790  				}
   791  
   792  				if _, err := sqlDB.Exec(`
   793  CREATE DATABASE t;
   794  CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT);
   795  `); err != nil {
   796  					t.Fatal(err)
   797  				}
   798  				cleanupFilter := cmdFilters.AppendFilter(
   799  					func(args kvserverbase.FilterArgs) *roachpb.Error {
   800  						if err := injectErrors(args.Req, args.Hdr, tc.magicVals, true /* verifyTxn */); err != nil {
   801  							return roachpb.NewErrorWithTxn(err, args.Hdr.Txn)
   802  						}
   803  						return nil
   804  					}, false)
   805  
   806  				// Also inject an error at RELEASE time, besides the error injected by magicVals.
   807  				sentinelInsert := "INSERT INTO t.public.test(k, v) VALUES (0, 'sentinel')"
   808  				if err := aborter.QueueStmtForAbortion(
   809  					sentinelInsert, 1 /* abortCount */, true, /* willBeRetriedIbid */
   810  				); err != nil {
   811  					t.Fatal(err)
   812  				}
   813  
   814  				commitCount := s.MustGetSQLCounter(sql.MetaTxnCommitStarted.Name)
   815  				// This is the magic. Run the txn closure until all the retries are exhausted.
   816  				retryExec(t, sqlDB, rs, func(tx *gosql.Tx) bool {
   817  					return runTestTxn(t, tc.magicVals, tc.expectedErr, sqlDB, tx, sentinelInsert)
   818  				})
   819  				checkRestarts(t, tc.magicVals)
   820  
   821  				// Check that we only wrote the sentinel row.
   822  				rows, err := sqlDB.Query("SELECT * FROM t.test")
   823  				if err != nil {
   824  					t.Fatal(err)
   825  				}
   826  				defer rows.Close()
   827  				for rows.Next() {
   828  					var k int
   829  					var v string
   830  					err = rows.Scan(&k, &v)
   831  					if err != nil {
   832  						t.Fatal(err)
   833  					}
   834  					if k != 0 || v != "sentinel" {
   835  						t.Fatalf("didn't find expected row: %d %s", k, v)
   836  					}
   837  				}
   838  				// Check that the commit counter was incremented. It could have been
   839  				// incremented by more than 1 because of the transactions we use to force
   840  				// aborts, plus who knows what else the server is doing in the background.
   841  				if err := checkCounterGE(s, sql.MetaTxnCommitStarted, commitCount+1); err != nil {
   842  					t.Error(err)
   843  				}
   844  				// Clean up the table for the next test iteration.
   845  				_, err = sqlDB.Exec("DELETE FROM t.test WHERE true")
   846  				if err != nil {
   847  					t.Fatal(err)
   848  				}
   849  				cleanupFilter()
   850  			})
   851  		}
   852  	}
   853  }
   854  
   855  // Test that rando commands while in COMMIT_WAIT return a particular error.
   856  func TestCommitWaitState(t *testing.T) {
   857  	defer leaktest.AfterTest(t)()
   858  
   859  	params, _ := tests.CreateTestServerParams()
   860  	s, sqlDB, _ := serverutils.StartServer(t, params)
   861  	defer s.Stopper().Stop(context.Background())
   862  	if _, err := sqlDB.Exec(`
   863  CREATE DATABASE t; CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT);
   864  `); err != nil {
   865  		t.Fatal(err)
   866  	}
   867  
   868  	tx, err := sqlDB.Begin()
   869  	if err != nil {
   870  		t.Fatal(err)
   871  	}
   872  	if _, err := tx.Exec(
   873  		"SAVEPOINT cockroach_restart; RELEASE cockroach_restart"); err != nil {
   874  		t.Fatal(err)
   875  	}
   876  	if _, err := tx.Exec("INSERT INTO t.test(k, v) VALUES (0, 'sentinel')"); !testutils.IsError(err, "current transaction is committed") {
   877  		t.Fatalf("unexpected error: %v", err)
   878  	}
   879  	// Rollback should respond with a COMMIT command tag.
   880  	if err := tx.Rollback(); !testutils.IsError(err, "unexpected command tag COMMIT") {
   881  		t.Fatalf("unexpected error: %v", err)
   882  	}
   883  }
   884  
   885  // Test that a COMMIT getting an error, retriable or not, leaves the txn
   886  // finalized and not in Aborted/RestartWait (i.e. COMMIT, like ROLLBACK, is
   887  // always final). As opposed to an error on a COMMIT in an auto-retry
   888  // txn, where we retry the txn (not tested here).
   889  func TestErrorOnCommitFinalizesTxn(t *testing.T) {
   890  	defer leaktest.AfterTest(t)()
   891  
   892  	aborter := NewTxnAborter()
   893  	defer aborter.Close(t)
   894  	params, _ := tests.CreateTestServerParams()
   895  	params.Knobs.SQLExecutor = aborter.executorKnobs()
   896  	s, sqlDB, _ := serverutils.StartServer(t, params)
   897  	defer s.Stopper().Stop(context.Background())
   898  	{
   899  		pgURL, cleanup := sqlutils.PGUrl(t, s.ServingSQLAddr(), "TestErrorOnCommitFinalizesTxn", url.User(security.RootUser))
   900  		defer cleanup()
   901  		if err := aborter.Init(pgURL); err != nil {
   902  			t.Fatal(err)
   903  		}
   904  	}
   905  
   906  	if _, err := sqlDB.Exec(`
   907  CREATE DATABASE t; CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT);
   908  `); err != nil {
   909  		t.Fatal(err)
   910  	}
   911  	// We need to do everything on one connection as we'll want to observe the
   912  	// connection state after a COMMIT.
   913  	sqlDB.SetMaxOpenConns(1)
   914  
   915  	// We're going to test both errors that would leave the transaction in the
   916  	// RestartWait state and errors that would leave the transaction in Aborted,
   917  	// if they were to happen on any other statement than COMMIT.
   918  	// We do that by always injecting a retryable error at COMMIT, but once in a
   919  	// txn that had a "retry intent" (SAVEPOINT cockroach_restart), and once in a
   920  	// txn without it.
   921  	testCases := []struct {
   922  		retryIntent bool
   923  	}{
   924  		{false},
   925  		{true},
   926  	}
   927  	for _, tc := range testCases {
   928  		t.Run(fmt.Sprintf("retryIntent=%t", tc.retryIntent), func(t *testing.T) {
   929  			const insertStmt = "INSERT INTO t.public.test(k, v) VALUES (0, 'boulanger')"
   930  			if err := aborter.QueueStmtForAbortion(
   931  				insertStmt, 1 /* abortCount */, false, /* willBeRetriedIbid */
   932  			); err != nil {
   933  				t.Fatal(err)
   934  			}
   935  			if _, err := sqlDB.Exec("BEGIN"); err != nil {
   936  				t.Fatal(err)
   937  			}
   938  			if tc.retryIntent {
   939  				if _, err := sqlDB.Exec("SAVEPOINT cockroach_restart"); err != nil {
   940  					t.Fatal(err)
   941  				}
   942  			}
   943  			if _, err := sqlDB.Exec(insertStmt); err != nil {
   944  				t.Fatal(err)
   945  			}
   946  			if _, err := sqlDB.Exec("COMMIT"); !testutils.IsError(err, "pq: restart transaction") {
   947  				t.Fatalf("unexpected error: %v", err)
   948  			}
   949  
   950  			// Check that we can start another txn on the (one and only) connection.
   951  			if _, err := sqlDB.Exec("BEGIN"); err != nil {
   952  				t.Fatal(err)
   953  			}
   954  			// Check that we don't see any rows, so the previous txn was rolled back.
   955  			rows, err := sqlDB.Query("SELECT * FROM t.test")
   956  			if err != nil {
   957  				t.Fatal(err)
   958  			}
   959  			defer rows.Close()
   960  			if rows.Next() {
   961  				var k int
   962  				var v string
   963  				err := rows.Scan(&k, &v)
   964  				t.Fatalf("found unexpected row: %d %s, %v", k, v, err)
   965  			}
   966  			if _, err := sqlDB.Exec("END"); err != nil {
   967  				t.Fatal(err)
   968  			}
   969  		})
   970  	}
   971  }
   972  
   973  // TestRollbackInRestartWait ensures that a ROLLBACK while the txn is in the
   974  // RetryWait state works.
   975  func TestRollbackInRestartWait(t *testing.T) {
   976  	defer leaktest.AfterTest(t)()
   977  
   978  	aborter := NewTxnAborter()
   979  	defer aborter.Close(t)
   980  	params, _ := tests.CreateTestServerParams()
   981  	params.Knobs.SQLExecutor = aborter.executorKnobs()
   982  	s, sqlDB, _ := serverutils.StartServer(t, params)
   983  	defer s.Stopper().Stop(context.Background())
   984  	{
   985  		pgURL, cleanup := sqlutils.PGUrl(t, s.ServingSQLAddr(), "TestRollbackInRestartWait", url.User(security.RootUser))
   986  		defer cleanup()
   987  		if err := aborter.Init(pgURL); err != nil {
   988  			t.Fatal(err)
   989  		}
   990  	}
   991  
   992  	if _, err := sqlDB.Exec(`
   993  CREATE DATABASE t;
   994  CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT);
   995  `); err != nil {
   996  		t.Fatal(err)
   997  	}
   998  
   999  	// Set up error injection that causes retries.
  1000  	const insertStmt = "INSERT INTO t.public.test(k, v) VALUES (0, 'boulanger')"
  1001  	if err := aborter.QueueStmtForAbortion(
  1002  		insertStmt, 1 /* abortCount */, false, /* willBeRetriedIbid */
  1003  	); err != nil {
  1004  		t.Fatal(err)
  1005  	}
  1006  
  1007  	tx, err := sqlDB.Begin()
  1008  	if err != nil {
  1009  		t.Fatal(err)
  1010  	}
  1011  	if _, err := tx.Exec("SAVEPOINT cockroach_restart"); err != nil {
  1012  		t.Fatal(err)
  1013  	}
  1014  	// Run a batch of statements to move the txn out of the AutoRetry state,
  1015  	// otherwise the INSERT below would be automatically retried.
  1016  	if _, err := tx.Exec("SELECT 1"); err != nil {
  1017  		t.Fatal(err)
  1018  	}
  1019  
  1020  	if _, err := tx.Exec(insertStmt); err != nil {
  1021  		t.Fatal(err)
  1022  	}
  1023  	if _, err := tx.Exec("RELEASE SAVEPOINT cockroach_restart"); !testutils.IsError(
  1024  		err, "pq: restart transaction") {
  1025  		t.Fatalf("unexpected error: %s", err)
  1026  	}
  1027  	if err := tx.Rollback(); err != nil {
  1028  		t.Fatal(err)
  1029  	}
  1030  }
  1031  
  1032  // TestUnexpectedStatementInRestartWait ensures that a statement other than
  1033  // ROLLBACK [TO SAVEPOINT] while the txn is in the RetryWait state terminates
  1034  // the transaction. More importantly than the state in which the transaction
  1035  // transitions when this happens is that this test prevents a regression of
  1036  // #15412, whereby the server would crash in this situation.
  1037  func TestUnexpectedStatementInRestartWait(t *testing.T) {
  1038  	defer leaktest.AfterTest(t)()
  1039  
  1040  	params, _ := tests.CreateTestServerParams()
  1041  	s, sqlDB, _ := serverutils.StartServer(t, params)
  1042  	defer s.Stopper().Stop(context.Background())
  1043  
  1044  	tx, err := sqlDB.Begin()
  1045  	if err != nil {
  1046  		t.Fatal(err)
  1047  	}
  1048  
  1049  	if _, err := tx.Exec("SAVEPOINT cockroach_restart"); err != nil {
  1050  		t.Fatal(err)
  1051  	}
  1052  	// Run a batch of statements to move the txn out of the AutoRetry state,
  1053  	// otherwise the SELECT below would be automatically retried.
  1054  	if _, err := tx.Exec("SELECT 1"); err != nil {
  1055  		t.Fatal(err)
  1056  	}
  1057  
  1058  	if _, err := tx.Exec(
  1059  		"SELECT crdb_internal.force_retry('1s':::INTERVAL)"); !testutils.IsError(
  1060  		err, `forced by crdb_internal\.force_retry\(\)`) {
  1061  		t.Fatal(err)
  1062  	}
  1063  	var state string
  1064  	if err := tx.QueryRow("SHOW TRANSACTION STATUS").Scan(&state); err != nil {
  1065  		t.Fatal(err)
  1066  	}
  1067  	if state != "Aborted" {
  1068  		t.Fatalf("expected state %s, got: %s", "Aborted", state)
  1069  	}
  1070  
  1071  	if _, err := tx.Exec("SELECT 1"); !testutils.IsError(err,
  1072  		`pq: current transaction is aborted, commands ignored until end of transaction block`) {
  1073  		t.Fatal(err)
  1074  	}
  1075  	if err := tx.QueryRow("SHOW TRANSACTION STATUS").Scan(&state); err != nil {
  1076  		t.Fatal(err)
  1077  	}
  1078  	if state != "Aborted" {
  1079  		t.Fatalf("expected state %s, got: %s", "Aborted", state)
  1080  	}
  1081  	if err := tx.Rollback(); err != nil {
  1082  		t.Fatal(err)
  1083  	}
  1084  }
  1085  
  1086  // TestNonRetryableError verifies that a non-retryable error is propagated to the client.
  1087  func TestNonRetryableError(t *testing.T) {
  1088  	defer leaktest.AfterTest(t)()
  1089  
  1090  	params, cmdFilters := tests.CreateTestServerParams()
  1091  	s, sqlDB, _ := serverutils.StartServer(t, params)
  1092  	defer s.Stopper().Stop(context.Background())
  1093  
  1094  	testKey := []byte("test_key")
  1095  	hitError := false
  1096  	cleanupFilter := cmdFilters.AppendFilter(
  1097  		func(args kvserverbase.FilterArgs) *roachpb.Error {
  1098  			if req, ok := args.Req.(*roachpb.ScanRequest); ok {
  1099  				if bytes.Contains(req.Key, testKey) && !kv.TestingIsRangeLookupRequest(req) {
  1100  					hitError = true
  1101  					return roachpb.NewErrorWithTxn(fmt.Errorf("testError"), args.Hdr.Txn)
  1102  				}
  1103  			}
  1104  			return nil
  1105  		}, false)
  1106  	defer cleanupFilter()
  1107  
  1108  	// We need to do everything on one connection as we'll want to observe the
  1109  	// connection state after a COMMIT.
  1110  	sqlDB.SetMaxOpenConns(1)
  1111  	if _, err := sqlDB.Exec(`
  1112  CREATE DATABASE t;
  1113  CREATE TABLE t.test (k TEXT PRIMARY KEY, v TEXT);
  1114  INSERT INTO t.test (k, v) VALUES ('test_key', 'test_val');
  1115  SELECT * from t.test WHERE k = 'test_key';
  1116  `); !testutils.IsError(err, "pq: testError") {
  1117  		t.Errorf("unexpected error %v", err)
  1118  	}
  1119  	if !hitError {
  1120  		t.Errorf("expected to hit error, but it didn't happen")
  1121  	}
  1122  }
  1123  
  1124  // Verifies that an expired lease is released and a new lease is acquired on
  1125  // transaction restart.
  1126  //
  1127  // This test triggers the above scenario by making
  1128  // ReadWithinUncertaintyIntervalError advance the clock, so that the transaction
  1129  // timestamp exceeds the deadline of the EndTxnRequest.
  1130  func TestReacquireLeaseOnRestart(t *testing.T) {
  1131  	defer leaktest.AfterTest(t)()
  1132  
  1133  	advancement := 2 * base.DefaultTableDescriptorLeaseDuration
  1134  
  1135  	var cmdFilters tests.CommandFilters
  1136  	cmdFilters.AppendFilter(tests.CheckEndTxnTrigger, true)
  1137  
  1138  	var clockUpdate int32
  1139  	testKey := []byte("test_key")
  1140  	storeTestingKnobs := &kvserver.StoreTestingKnobs{
  1141  		EvalKnobs: kvserverbase.BatchEvalTestingKnobs{
  1142  			TestingEvalFilter: cmdFilters.RunFilters,
  1143  		},
  1144  		DisableMaxOffsetCheck: true,
  1145  		ClockBeforeSend: func(c *hlc.Clock, ba roachpb.BatchRequest) {
  1146  			if atomic.LoadInt32(&clockUpdate) > 0 {
  1147  				return
  1148  			}
  1149  
  1150  			// Hack to advance the transaction timestamp on a transaction restart.
  1151  			for _, union := range ba.Requests {
  1152  				if req, ok := union.GetInner().(*roachpb.ScanRequest); ok {
  1153  					if bytes.Contains(req.Key, testKey) && !kv.TestingIsRangeLookupRequest(req) {
  1154  						atomic.AddInt32(&clockUpdate, 1)
  1155  						now := c.Now()
  1156  						now.WallTime += advancement.Nanoseconds()
  1157  						c.Update(now)
  1158  						break
  1159  					}
  1160  				}
  1161  			}
  1162  		},
  1163  	}
  1164  
  1165  	const refreshAttempts = 3
  1166  	clientTestingKnobs := &kvcoord.ClientTestingKnobs{
  1167  		MaxTxnRefreshAttempts: refreshAttempts,
  1168  	}
  1169  
  1170  	params, _ := tests.CreateTestServerParams()
  1171  	params.Knobs.Store = storeTestingKnobs
  1172  	params.Knobs.KVClient = clientTestingKnobs
  1173  	s, sqlDB, _ := serverutils.StartServer(t, params)
  1174  	defer s.Stopper().Stop(context.Background())
  1175  
  1176  	var restartDone int32
  1177  	cleanupFilter := cmdFilters.AppendFilter(
  1178  		func(args kvserverbase.FilterArgs) *roachpb.Error {
  1179  			// Allow a set number of restarts so that the auto retry on the
  1180  			// first few uncertainty interval errors also fails.
  1181  			if atomic.LoadInt32(&restartDone) > refreshAttempts {
  1182  				return nil
  1183  			}
  1184  
  1185  			if req, ok := args.Req.(*roachpb.ScanRequest); ok {
  1186  				if bytes.Contains(req.Key, testKey) && !kv.TestingIsRangeLookupRequest(req) {
  1187  					atomic.AddInt32(&restartDone, 1)
  1188  					// Return ReadWithinUncertaintyIntervalError to update the transaction timestamp on retry.
  1189  					txn := args.Hdr.Txn
  1190  					txn.ResetObservedTimestamps()
  1191  					now := s.Clock().Now()
  1192  					txn.UpdateObservedTimestamp(s.(*server.TestServer).Gossip().NodeID.Get(), now)
  1193  					return roachpb.NewErrorWithTxn(roachpb.NewReadWithinUncertaintyIntervalError(now, now, txn), txn)
  1194  				}
  1195  			}
  1196  			return nil
  1197  		}, false)
  1198  	defer cleanupFilter()
  1199  
  1200  	sqlDB.SetMaxOpenConns(1)
  1201  	if _, err := sqlDB.Exec(`
  1202  CREATE DATABASE t;
  1203  CREATE TABLE t.test (k TEXT PRIMARY KEY, v TEXT);
  1204  INSERT INTO t.test (k, v) VALUES ('test_key', 'test_val');
  1205  `); err != nil {
  1206  		t.Fatal(err)
  1207  	}
  1208  	// Acquire the lease and enable the auto-retry. The first few read attempts
  1209  	// will trigger ReadWithinUncertaintyIntervalError and advance the
  1210  	// transaction timestamp due to txnSpanRefresher-initiated span refreshes.
  1211  	// The transaction timestamp will exceed the lease expiration time, and the
  1212  	// last read attempt will re-acquire the lease.
  1213  	if _, err := sqlDB.Exec(`
  1214  SELECT * from t.test WHERE k = 'test_key';
  1215  `); err != nil {
  1216  		t.Fatal(err)
  1217  	}
  1218  
  1219  	if u := atomic.LoadInt32(&clockUpdate); u != 1 {
  1220  		t.Errorf("expected exacltly one clock update, but got %d", u)
  1221  	}
  1222  	if u, e := atomic.LoadInt32(&restartDone), int32(refreshAttempts+1); u != e {
  1223  		t.Errorf("expected exactly %d restarts, but got %d", e, u)
  1224  	}
  1225  }
  1226  
  1227  // Verifies that the uncommitted descriptor cache is flushed on a txn restart.
  1228  //
  1229  // This test triggers the above scenario by triggering a restart by returning
  1230  // ReadWithinUncertaintyIntervalError on the first transaction attempt.
  1231  func TestFlushUncommitedDescriptorCacheOnRestart(t *testing.T) {
  1232  	defer leaktest.AfterTest(t)()
  1233  
  1234  	var cmdFilters tests.CommandFilters
  1235  	cmdFilters.AppendFilter(tests.CheckEndTxnTrigger, true)
  1236  	testKey := []byte("test_key")
  1237  	testingKnobs := &kvserver.StoreTestingKnobs{
  1238  		EvalKnobs: kvserverbase.BatchEvalTestingKnobs{
  1239  			TestingEvalFilter: cmdFilters.RunFilters,
  1240  		},
  1241  	}
  1242  
  1243  	params, _ := tests.CreateTestServerParams()
  1244  	params.Knobs.Store = testingKnobs
  1245  	s, sqlDB, _ := serverutils.StartServer(t, params)
  1246  	defer s.Stopper().Stop(context.Background())
  1247  
  1248  	var restartDone int32
  1249  	cleanupFilter := cmdFilters.AppendFilter(
  1250  		func(args kvserverbase.FilterArgs) *roachpb.Error {
  1251  			if atomic.LoadInt32(&restartDone) > 0 {
  1252  				return nil
  1253  			}
  1254  
  1255  			if req, ok := args.Req.(*roachpb.ScanRequest); ok {
  1256  				if bytes.Contains(req.Key, testKey) && !kv.TestingIsRangeLookupRequest(req) {
  1257  					atomic.AddInt32(&restartDone, 1)
  1258  					// Return ReadWithinUncertaintyIntervalError.
  1259  					txn := args.Hdr.Txn
  1260  					txn.ResetObservedTimestamps()
  1261  					now := s.Clock().Now()
  1262  					txn.UpdateObservedTimestamp(s.(*server.TestServer).Gossip().NodeID.Get(), now)
  1263  					return roachpb.NewErrorWithTxn(roachpb.NewReadWithinUncertaintyIntervalError(now, now, txn), txn)
  1264  				}
  1265  			}
  1266  			return nil
  1267  		}, false)
  1268  	defer cleanupFilter()
  1269  
  1270  	sqlDB.SetMaxOpenConns(1)
  1271  	if _, err := sqlDB.Exec(`
  1272  CREATE DATABASE t;
  1273  CREATE TABLE t.test (k TEXT PRIMARY KEY, v TEXT);
  1274  INSERT INTO t.test (k, v) VALUES ('test_key', 'test_val');
  1275  `); err != nil {
  1276  		t.Fatal(err)
  1277  	}
  1278  	// Read from a table, rename it, and then read from the table to trigger
  1279  	// the retry. On the second attempt the first read from the table should
  1280  	// not see the uncommitted renamed table.
  1281  	if _, err := sqlDB.Exec(`
  1282  BEGIN;
  1283  SELECT * from t.test WHERE k = 'foobar';
  1284  ALTER TABLE t.test RENAME TO t.foo;
  1285  SELECT * from t.foo WHERE k = 'test_key';
  1286  COMMIT;
  1287  `); err != nil {
  1288  		t.Fatal(err)
  1289  	}
  1290  
  1291  	if u := atomic.LoadInt32(&restartDone); u != 1 {
  1292  		t.Errorf("expected exactly one restart, but got %d", u)
  1293  	}
  1294  }
  1295  
  1296  // Test that retryable errors are handled properly through DistSQL.
  1297  func TestDistSQLRetryableError(t *testing.T) {
  1298  	defer leaktest.AfterTest(t)()
  1299  
  1300  	// One of the rows in the table.
  1301  	targetKey := roachpb.Key("\275\211\212")
  1302  
  1303  	restarted := true
  1304  
  1305  	tc := serverutils.StartTestCluster(t, 3, /* numNodes */
  1306  		base.TestClusterArgs{
  1307  			ReplicationMode: base.ReplicationManual,
  1308  			ServerArgs: base.TestServerArgs{
  1309  				UseDatabase: "test",
  1310  				Knobs: base.TestingKnobs{
  1311  					Store: &kvserver.StoreTestingKnobs{
  1312  						EvalKnobs: kvserverbase.BatchEvalTestingKnobs{
  1313  							TestingEvalFilter: func(fArgs kvserverbase.FilterArgs) *roachpb.Error {
  1314  								_, ok := fArgs.Req.(*roachpb.ScanRequest)
  1315  								if ok && fArgs.Req.Header().Key.Equal(targetKey) && fArgs.Hdr.Txn.Epoch == 0 {
  1316  									restarted = true
  1317  									err := roachpb.NewReadWithinUncertaintyIntervalError(
  1318  										fArgs.Hdr.Timestamp, /* readTS */
  1319  										hlc.Timestamp{},
  1320  										nil)
  1321  									errTxn := fArgs.Hdr.Txn.Clone()
  1322  									errTxn.UpdateObservedTimestamp(roachpb.NodeID(2), hlc.Timestamp{})
  1323  									pErr := roachpb.NewErrorWithTxn(err, errTxn)
  1324  									pErr.OriginNode = 2
  1325  									return pErr
  1326  								}
  1327  
  1328  								return nil
  1329  							},
  1330  						},
  1331  					},
  1332  				},
  1333  			},
  1334  		})
  1335  	defer tc.Stopper().Stop(context.Background())
  1336  
  1337  	db := tc.ServerConn(0)
  1338  	sqlutils.CreateTable(t, db, "t",
  1339  		"num INT PRIMARY KEY",
  1340  		3, /* numRows */
  1341  		sqlutils.ToRowFn(sqlutils.RowIdxFn))
  1342  
  1343  	// We're going to split one of the tables, but node 4 is unaware of this.
  1344  	_, err := db.Exec(fmt.Sprintf(`
  1345  	ALTER TABLE "t" SPLIT AT VALUES (1), (2), (3);
  1346  	ALTER TABLE "t" EXPERIMENTAL_RELOCATE VALUES (ARRAY[%d], 1), (ARRAY[%d], 2), (ARRAY[%d], 3);
  1347  	`,
  1348  		tc.Server(1).GetFirstStoreID(),
  1349  		tc.Server(0).GetFirstStoreID(),
  1350  		tc.Server(2).GetFirstStoreID()))
  1351  	if err != nil {
  1352  		t.Fatal(err)
  1353  	}
  1354  
  1355  	db.SetMaxOpenConns(1)
  1356  
  1357  	if _, err := db.Exec("SET DISTSQL = ON"); err != nil {
  1358  		t.Fatal(err)
  1359  	}
  1360  
  1361  	// Test that a stand-alone statement is retried by the Executor.
  1362  	if _, err := db.Exec("SELECT count(1) FROM t"); err != nil {
  1363  		t.Fatal(err)
  1364  	}
  1365  	if !restarted {
  1366  		t.Fatalf("expected the EvalFilter to restart the txn, but it didn't")
  1367  	}
  1368  
  1369  	// Test that a command that can't be retried automatically generates an error
  1370  	// with the correct code.
  1371  	restarted = false
  1372  
  1373  	txn, err := db.Begin()
  1374  	if err != nil {
  1375  		t.Fatal(err)
  1376  	}
  1377  	// Run a batch of statements to move the txn out of the "AutoRetry" state.
  1378  	if _, err := txn.Exec("SELECT 1"); err != nil {
  1379  		t.Fatal(err)
  1380  	}
  1381  
  1382  	// Let's make sure that DISTSQL will actually be used.
  1383  	row := txn.QueryRow(`SELECT automatic FROM [EXPLAIN (DISTSQL) SELECT count(1) FROM t]`)
  1384  	var automatic bool
  1385  	if err := row.Scan(&automatic); err != nil {
  1386  		t.Fatal(err)
  1387  	}
  1388  	if !automatic {
  1389  		t.Fatal("DISTSQL not used for test's query")
  1390  	}
  1391  
  1392  	_, err = txn.Exec("SELECT count(1) FROM t")
  1393  	if !restarted {
  1394  		t.Fatalf("expected the EvalFilter to restart the txn, but it didn't")
  1395  	}
  1396  	if err == nil {
  1397  		t.Fatal("expected retryable error")
  1398  	}
  1399  	if !isRetryableErr(err) {
  1400  		t.Fatalf("expected retryable error, got: %s", err)
  1401  	}
  1402  
  1403  	if err := txn.Rollback(); err != nil {
  1404  		t.Fatal(err)
  1405  	}
  1406  
  1407  	// Test that ORDER BY properly propagates retryable errors. The weird
  1408  	// ordering criteria is to ensure that the ORDER BY is present and not elided
  1409  	// because we're ordering on the primary key column.
  1410  	restarted = false
  1411  	rows, err := db.Query("SELECT * FROM t ORDER BY upper(num::TEXT)")
  1412  	if err != nil {
  1413  		t.Fatal(err)
  1414  	}
  1415  	var count int
  1416  	for rows.Next() {
  1417  		count++
  1418  	}
  1419  	if count != 3 {
  1420  		t.Fatalf("expected 3 rows, but found %d", count)
  1421  	}
  1422  	if !restarted {
  1423  		t.Fatalf("expected the EvalFilter to restart the txn, but it didn't")
  1424  	}
  1425  }
  1426  
  1427  // TestRollbackToSavepointFromUnusualStates tests that issuing a ROLLBACK TO
  1428  // SAVEPOINT from a non-retryable state works, and that the transaction that it
  1429  // opens has the same attributes as the existing one.
  1430  func TestRollbackToSavepointFromUnusualStates(t *testing.T) {
  1431  	defer leaktest.AfterTest(t)()
  1432  
  1433  	params, _ := tests.CreateTestServerParams()
  1434  	s, sqlDB, _ := serverutils.StartServer(t, params)
  1435  	defer s.Stopper().Stop(context.Background())
  1436  
  1437  	checkState := func(tx *gosql.Tx, ts time.Time) {
  1438  		t.Helper()
  1439  		var pri string
  1440  		r := tx.QueryRow("SHOW TRANSACTION PRIORITY")
  1441  		if err := r.Scan(&pri); err != nil {
  1442  			t.Fatal(err)
  1443  		} else {
  1444  			if pri != "high" {
  1445  				t.Errorf("Expected high, got: %s", pri)
  1446  			}
  1447  		}
  1448  	}
  1449  
  1450  	tx, err := sqlDB.Begin()
  1451  	if err != nil {
  1452  		t.Fatal(err)
  1453  	}
  1454  	if _, err := tx.Exec("SET TRANSACTION PRIORITY HIGH"); err != nil {
  1455  		t.Fatal(err)
  1456  	}
  1457  	if _, err := tx.Exec("SAVEPOINT cockroach_restart"); err != nil {
  1458  		t.Fatal(err)
  1459  	}
  1460  
  1461  	var ts time.Time
  1462  	r := tx.QueryRow("SELECT now()")
  1463  	if err := r.Scan(&ts); err != nil {
  1464  		t.Fatal(err)
  1465  	}
  1466  
  1467  	checkState(tx, ts)
  1468  
  1469  	// ROLLBACK TO SAVEPOINT from an Open txn should work.
  1470  	if _, err := tx.Exec("ROLLBACK TO SAVEPOINT cockroach_restart"); err != nil {
  1471  		t.Fatal(err)
  1472  	}
  1473  	checkState(tx, ts)
  1474  
  1475  	// ROLLBACK TO SAVEPOINT from an Aborted txn should work.
  1476  	if _, err := tx.Exec("BOGUS SQL STATEMENT"); !testutils.IsError(err, `at or near "bogus": syntax error`) {
  1477  		t.Fatalf("unexpected error: %v", err)
  1478  	}
  1479  	if _, err := tx.Exec("ROLLBACK TO SAVEPOINT cockroach_restart"); err != nil {
  1480  		t.Fatalf("unexpected error: %s", err)
  1481  	}
  1482  	checkState(tx, ts)
  1483  
  1484  	if err := tx.Rollback(); err != nil {
  1485  		t.Fatal(err)
  1486  	}
  1487  }
  1488  
  1489  // Test that, if we'd otherwise perform an auto-retry but results for the
  1490  // current txn have already been streamed to the client, we don't do the
  1491  // auto-restart.
  1492  func TestTxnAutoRetriesDisabledAfterResultsHaveBeenSentToClient(t *testing.T) {
  1493  	defer leaktest.AfterTest(t)()
  1494  
  1495  	params, _ := tests.CreateTestServerParams()
  1496  	s, sqlDB, _ := serverutils.StartServer(t, params)
  1497  	defer s.Stopper().Stop(context.Background())
  1498  
  1499  	tests := []struct {
  1500  		name                              string
  1501  		autoCommit                        bool
  1502  		clientDirectedRetry               bool
  1503  		expectedTxnStateAfterRetriableErr string
  1504  	}{
  1505  		{
  1506  			name:                              "client_directed_retries",
  1507  			clientDirectedRetry:               true,
  1508  			expectedTxnStateAfterRetriableErr: "Aborted",
  1509  		},
  1510  		{
  1511  			name:                              "no_client_directed_retries",
  1512  			clientDirectedRetry:               false,
  1513  			expectedTxnStateAfterRetriableErr: "Aborted",
  1514  		},
  1515  		{
  1516  			name:                              "autocommit",
  1517  			autoCommit:                        true,
  1518  			expectedTxnStateAfterRetriableErr: "NoTxn",
  1519  		},
  1520  	}
  1521  	for _, tc := range tests {
  1522  		t.Run(tc.name, func(t *testing.T) {
  1523  			// Cleanup the connection state after each test so the next one can run
  1524  			// statements.
  1525  			// TODO(andrei): Once we're on go 1.9, this test should use the new
  1526  			// db.Conn() method to tie each test to a connection; then this cleanup
  1527  			// wouldn't be necessary. Also, the test is currently technically
  1528  			// incorrect, as there's no guarantee that the state check at the end will
  1529  			// happen on the right connection.
  1530  			defer func() {
  1531  				if tc.autoCommit {
  1532  					// No cleanup necessary.
  1533  					return
  1534  				}
  1535  				if _, err := sqlDB.Exec("ROLLBACK"); err != nil {
  1536  					t.Fatal(err)
  1537  				}
  1538  			}()
  1539  
  1540  			var savepoint string
  1541  			if tc.clientDirectedRetry {
  1542  				savepoint = "SAVEPOINT cockroach_restart;"
  1543  			}
  1544  
  1545  			var prefix, suffix string
  1546  			if !tc.autoCommit {
  1547  				prefix = "BEGIN; " + savepoint
  1548  				suffix = "COMMIT;"
  1549  			}
  1550  
  1551  			// We'll run a statement that produces enough results to overflow the
  1552  			// buffers and start streaming results to the client before the retriable
  1553  			// error is injected. We do this by running a generate series that blows
  1554  			// up at the very end, with a CASE statement.
  1555  			sql := fmt.Sprintf(`
  1556  				%s
  1557  				SELECT
  1558  					CASE x
  1559            WHEN 10000 THEN crdb_internal.force_retry('1s')
  1560            ELSE x
  1561  					END
  1562          FROM generate_series(1, 10000) AS t(x);
  1563  				%s`,
  1564  				prefix, suffix)
  1565  			_, err := sqlDB.Exec(sql)
  1566  			if !isRetryableErr(err) {
  1567  				t.Fatalf("expected retriable error, got: %v", err)
  1568  			}
  1569  			var state string
  1570  			if err := sqlDB.QueryRow("SHOW TRANSACTION STATUS").Scan(&state); err != nil {
  1571  				t.Fatal(err)
  1572  			}
  1573  			if expStateStr := tc.expectedTxnStateAfterRetriableErr; state != expStateStr {
  1574  				t.Fatalf("expected state %s, got: %s", expStateStr, state)
  1575  			}
  1576  		})
  1577  	}
  1578  }