vitess.io/vitess@v0.16.2/go/vt/vtgate/scatter_conn.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package vtgate
    18  
    19  import (
    20  	"context"
    21  	"io"
    22  	"sync"
    23  	"time"
    24  
    25  	"vitess.io/vitess/go/vt/sqlparser"
    26  
    27  	"google.golang.org/protobuf/proto"
    28  
    29  	"vitess.io/vitess/go/mysql"
    30  	"vitess.io/vitess/go/sqltypes"
    31  	"vitess.io/vitess/go/stats"
    32  	"vitess.io/vitess/go/vt/concurrency"
    33  	"vitess.io/vitess/go/vt/discovery"
    34  	"vitess.io/vitess/go/vt/log"
    35  	"vitess.io/vitess/go/vt/srvtopo"
    36  	"vitess.io/vitess/go/vt/topo/topoproto"
    37  	"vitess.io/vitess/go/vt/vterrors"
    38  	"vitess.io/vitess/go/vt/vtgate/engine"
    39  	"vitess.io/vitess/go/vt/vttablet/queryservice"
    40  
    41  	querypb "vitess.io/vitess/go/vt/proto/query"
    42  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    43  	vtgatepb "vitess.io/vitess/go/vt/proto/vtgate"
    44  	vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
    45  )
    46  
// ScatterConn is used for executing queries across
// multiple shard level connections.
type ScatterConn struct {
	// timings records how long each shard action took, labelled by
	// Operation, Keyspace, ShardName and DbType.
	timings              *stats.MultiTimings
	// tabletCallErrorCount counts failed tablet calls using the same
	// four labels as timings.
	tabletCallErrorCount *stats.CountersWithMultiLabels
	// txConn supplies the transaction mode and performs rollbacks and
	// lock releases on behalf of the scatter conn.
	txConn               *TxConn
	// gateway is the tablet gateway used for lock heartbeats, cache
	// status reporting, and Close.
	gateway              *TabletGateway
}
    55  
// shardActionFunc defines the contract for a shard action
// outside of a transaction. Every such function executes the
// necessary action on the shard rs (i is the index of rs in the
// slice handed to multiGo) and returns an error, if any. The function
// receives no result parameter, so any results must be delivered
// through state the closure captures. multiGo is capable of executing
// multiple shardActionFunc actions in parallel and
// consolidating the errors for the caller.
type shardActionFunc func(rs *srvtopo.ResolvedShard, i int) error
    63  
// shardActionTransactionFunc defines the contract for a shard action
// that may be in a transaction. Every such function executes the
// necessary action on a shard (with an optional Begin call), aggregates
// the results, and returns an error, if any.
//
// rs is the shard to act on, i is its index in the slice handed to
// multiGoTransaction, and shardActionInfo describes the session state for
// that shard. The returned *shardActionInfo carries the state after the
// action; multiGoTransaction skips the session update when it is nil.
//
// multiGoTransaction is capable of executing multiple
// shardActionTransactionFunc actions in parallel and consolidating
// the results and errors for the caller.
type shardActionTransactionFunc func(rs *srvtopo.ResolvedShard, i int, shardActionInfo *shardActionInfo) (*shardActionInfo, error)
    72  
    73  // NewScatterConn creates a new ScatterConn.
    74  func NewScatterConn(statsName string, txConn *TxConn, gw *TabletGateway) *ScatterConn {
    75  	// this only works with TabletGateway
    76  	tabletCallErrorCountStatsName := ""
    77  	if statsName != "" {
    78  		tabletCallErrorCountStatsName = statsName + "ErrorCount"
    79  	}
    80  	return &ScatterConn{
    81  		timings: stats.NewMultiTimings(
    82  			statsName,
    83  			"Scatter connection timings",
    84  			[]string{"Operation", "Keyspace", "ShardName", "DbType"}),
    85  		tabletCallErrorCount: stats.NewCountersWithMultiLabels(
    86  			tabletCallErrorCountStatsName,
    87  			"Error count from tablet calls in scatter conns",
    88  			[]string{"Operation", "Keyspace", "ShardName", "DbType"}),
    89  		txConn:  txConn,
    90  		gateway: gw,
    91  	}
    92  }
    93  
    94  func (stc *ScatterConn) startAction(name string, target *querypb.Target) (time.Time, []string) {
    95  	statsKey := []string{name, target.Keyspace, target.Shard, topoproto.TabletTypeLString(target.TabletType)}
    96  	startTime := time.Now()
    97  	return startTime, statsKey
    98  }
    99  
   100  func (stc *ScatterConn) endAction(startTime time.Time, allErrors *concurrency.AllErrorRecorder, statsKey []string, err *error, session *SafeSession) {
   101  	if *err != nil {
   102  		allErrors.RecordError(*err)
   103  		// Don't increment the error counter for duplicate
   104  		// keys or bad queries, as those errors are caused by
   105  		// client queries and are not VTGate's fault.
   106  		ec := vterrors.Code(*err)
   107  		if ec != vtrpcpb.Code_ALREADY_EXISTS && ec != vtrpcpb.Code_INVALID_ARGUMENT {
   108  			stc.tabletCallErrorCount.Add(statsKey, 1)
   109  		}
   110  		if ec == vtrpcpb.Code_RESOURCE_EXHAUSTED || ec == vtrpcpb.Code_ABORTED {
   111  			session.SetRollback()
   112  		}
   113  	}
   114  	stc.timings.Record(statsKey, startTime)
   115  }
   116  
   117  func (stc *ScatterConn) endLockAction(startTime time.Time, allErrors *concurrency.AllErrorRecorder, statsKey []string, err *error) {
   118  	if *err != nil {
   119  		allErrors.RecordError(*err)
   120  		stc.tabletCallErrorCount.Add(statsKey, 1)
   121  	}
   122  	stc.timings.Record(statsKey, startTime)
   123  }
   124  
// reset describes what checkAndResetShardSession decided should happen
// after a failed call on a shard session.
type reset int

const (
	// none: the shard session was left untouched; do not retry.
	none reset = iota
	// shard: the connection was closed; the shard session has been reset
	// and the call may be retried on the same query service.
	shard
	// newQS: the current tablet cannot serve the request; the shard
	// session has been reset and the call should be retried through the
	// gateway instead.
	newQS
)
   132  
// ExecuteMultiShard is like Execute,
// but each shard gets its own Sql Queries and BindVariables.
//
// rss and queries are parallel slices: queries[i] is sent to rss[i].
//
// On the normal path it returns a non-nil query result and an array of
// shard errors which may be nil, so that callers can optionally
// process a partially-successful operation. The result is nil (with a
// single error) when len(rss) != len(queries), or when
// ignoreMaxMemoryRows is false and the merged result holds more than
// maxMemoryRows rows.
func (stc *ScatterConn) ExecuteMultiShard(
	ctx context.Context,
	primitive engine.Primitive,
	rss []*srvtopo.ResolvedShard,
	queries []*querypb.BoundQuery,
	session *SafeSession,
	autocommit bool,
	ignoreMaxMemoryRows bool,
) (qr *sqltypes.Result, errs []error) {

	if len(rss) != len(queries) {
		return nil, []error{vterrors.Errorf(vtrpcpb.Code_INTERNAL, "[BUG] got mismatched number of queries and shards")}
	}

	// mu protects qr
	var mu sync.Mutex
	qr = new(sqltypes.Result)

	// Fire a lock heartbeat query in the background when the session
	// reports that one is due.
	if session.InLockSession() && session.TriggerLockHeartBeat() {
		go stc.runLockQuery(ctx, session)
	}

	allErrors := stc.multiGoTransaction(
		ctx,
		"Execute",
		rss,
		session,
		autocommit,
		func(rs *srvtopo.ResolvedShard, i int, info *shardActionInfo) (*shardActionInfo, error) {
			var (
				innerqr *sqltypes.Result
				err     error
				opts    *querypb.ExecuteOptions
				alias   *topodatapb.TabletAlias
				qs      queryservice.QueryService
			)
			// Start from the IDs already known for this shard; the calls
			// below overwrite them with whatever the tablet returns.
			transactionID := info.transactionID
			reservedID := info.reservedID

			if session != nil && session.Session != nil {
				opts = session.Session.Options
			}

			if autocommit {
				// As this is auto-commit, the transactionID is supposed to be zero.
				if transactionID != int64(0) {
					return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "in autocommit mode, transactionID should be zero but was: %d", transactionID)
				}
			}

			qs, err = getQueryService(rs, info, session, false)
			if err != nil {
				return nil, err
			}

			// retryRequest inspects the captured err and, when the shard
			// session could be reset, runs exec once more. For newQS the
			// retry is routed through the gateway rather than the
			// original query service.
			retryRequest := func(exec func()) {
				retry := checkAndResetShardSession(info, err, session, rs.Target)
				switch retry {
				case newQS:
					// Current tablet is not available, try querying new tablet using gateway.
					qs = rs.Gateway
					fallthrough
				case shard:
					// if we need to reset a reserved connection, here is our chance to try executing again,
					// against a new connection
					exec()
				}
			}

			switch info.actionNeeded {
			case nothing:
				innerqr, err = qs.Execute(ctx, rs.Target, queries[i].Sql, queries[i].BindVariables, info.transactionID, info.reservedID, opts)
				if err != nil {
					retryRequest(func() {
						// we seem to have lost our connection. it was a reserved connection, let's try to recreate it
						info.actionNeeded = reserve
						var state queryservice.ReservedState
						state, innerqr, err = qs.ReserveExecute(ctx, rs.Target, session.SetPreQueries(), queries[i].Sql, queries[i].BindVariables, 0 /*transactionId*/, opts)
						reservedID = state.ReservedID
						alias = state.TabletAlias
					})
				}
			case begin:
				var state queryservice.TransactionState
				state, innerqr, err = qs.BeginExecute(ctx, rs.Target, session.SavePoints(), queries[i].Sql, queries[i].BindVariables, reservedID, opts)
				transactionID = state.TransactionID
				alias = state.TabletAlias
				if err != nil {
					retryRequest(func() {
						// we seem to have lost our connection. it was a reserved connection, let's try to recreate it
						info.actionNeeded = reserveBegin
						var state queryservice.ReservedTransactionState
						state, innerqr, err = qs.ReserveBeginExecute(ctx, rs.Target, session.SetPreQueries(), session.SavePoints(), queries[i].Sql, queries[i].BindVariables, opts)
						transactionID = state.TransactionID
						reservedID = state.ReservedID
						alias = state.TabletAlias
					})
				}
			case reserve:
				var state queryservice.ReservedState
				state, innerqr, err = qs.ReserveExecute(ctx, rs.Target, session.SetPreQueries(), queries[i].Sql, queries[i].BindVariables, transactionID, opts)
				reservedID = state.ReservedID
				alias = state.TabletAlias
			case reserveBegin:
				var state queryservice.ReservedTransactionState
				state, innerqr, err = qs.ReserveBeginExecute(ctx, rs.Target, session.SetPreQueries(), session.SavePoints(), queries[i].Sql, queries[i].BindVariables, opts)
				transactionID = state.TransactionID
				reservedID = state.ReservedID
				alias = state.TabletAlias
			default:
				return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "[BUG] unexpected actionNeeded on query execution: %v", info.actionNeeded)
			}
			// The boolean argument is true when this call also began a
			// transaction (begin or reserveBegin).
			session.logging.log(primitive, rs.Target, rs.Gateway, queries[i].Sql, info.actionNeeded == begin || info.actionNeeded == reserveBegin, queries[i].BindVariables)

			// We need the new shard info irrespective of the error.
			newInfo := info.updateTransactionAndReservedID(transactionID, reservedID, alias)
			if err != nil {
				return newInfo, err
			}
			mu.Lock()
			defer mu.Unlock()

			// Don't append more rows if row count is exceeded.
			if ignoreMaxMemoryRows || len(qr.Rows) <= maxMemoryRows {
				qr.AppendResult(innerqr)
			}
			return newInfo, nil
		},
	)

	// Appending stops only after the limit has been crossed, so an
	// over-limit result is detected here and reported as an error.
	if !ignoreMaxMemoryRows && len(qr.Rows) > maxMemoryRows {
		return nil, []error{vterrors.NewErrorf(vtrpcpb.Code_RESOURCE_EXHAUSTED, vterrors.NetPacketTooLarge, "in-memory row count exceeded allowed limit of %d", maxMemoryRows)}
	}

	return qr, allErrors.GetErrors()
}
   275  
   276  func (stc *ScatterConn) runLockQuery(ctx context.Context, session *SafeSession) {
   277  	rs := &srvtopo.ResolvedShard{Target: session.LockSession.Target, Gateway: stc.gateway}
   278  	query := &querypb.BoundQuery{Sql: "select 1", BindVariables: nil}
   279  	_, lockErr := stc.ExecuteLock(ctx, rs, query, session, sqlparser.IsUsedLock)
   280  	if lockErr != nil {
   281  		log.Warningf("Locking heartbeat failed, held locks might be released: %s", lockErr.Error())
   282  	}
   283  }
   284  
   285  func checkAndResetShardSession(info *shardActionInfo, err error, session *SafeSession, target *querypb.Target) reset {
   286  	retry := none
   287  	if info.reservedID != 0 && info.transactionID == 0 {
   288  		if wasConnectionClosed(err) {
   289  			retry = shard
   290  		}
   291  		if requireNewQS(err, target) {
   292  			retry = newQS
   293  		}
   294  	}
   295  	if retry != none {
   296  		_ = session.ResetShard(info.alias)
   297  	}
   298  	return retry
   299  }
   300  
   301  func getQueryService(rs *srvtopo.ResolvedShard, info *shardActionInfo, session *SafeSession, skipReset bool) (queryservice.QueryService, error) {
   302  	if info.alias == nil {
   303  		return rs.Gateway, nil
   304  	}
   305  	qs, err := rs.Gateway.QueryServiceByAlias(info.alias, rs.Target)
   306  	if err == nil || skipReset {
   307  		return qs, err
   308  	}
   309  	// If the session info has only reserved connection and no transaction then we will route it through gateway
   310  	// Otherwise, we will fail.
   311  	if info.reservedID == 0 || info.transactionID != 0 {
   312  		return nil, err
   313  	}
   314  	err = session.ResetShard(info.alias)
   315  	if err != nil {
   316  		return nil, err
   317  	}
   318  	// Returning rs.Gateway will make the gateway to choose new healthy tablet for the targeted tablet type.
   319  	return rs.Gateway, nil
   320  }
   321  
   322  func (stc *ScatterConn) processOneStreamingResult(mu *sync.Mutex, fieldSent *bool, qr *sqltypes.Result, callback func(*sqltypes.Result) error) error {
   323  	mu.Lock()
   324  	defer mu.Unlock()
   325  	if *fieldSent {
   326  		if len(qr.Rows) == 0 {
   327  			// It's another field info result. Don't send.
   328  			return nil
   329  		}
   330  	} else {
   331  		if len(qr.Fields) == 0 {
   332  			// Unreachable: this can happen only if vttablet misbehaves.
   333  			return vterrors.VT13001("received rows before fields")
   334  		}
   335  		*fieldSent = true
   336  	}
   337  
   338  	return callback(qr)
   339  }
   340  
// StreamExecuteMulti is like StreamExecute,
// but each shard gets its own bindVars: bindVars[i] is sent to rss[i].
// No length check is performed here; bindVars is indexed once per shard,
// so a bindVars slice shorter than rss makes the function panic.
//
// NOTE(review): callback is handed directly to the query service calls
// below and is not wrapped in processOneStreamingResult. With more than
// one shard the shard actions run in parallel goroutines, so callback may
// be invoked concurrently unless the caller serializes it - confirm
// against callers.
func (stc *ScatterConn) StreamExecuteMulti(
	ctx context.Context,
	primitive engine.Primitive,
	query string,
	rss []*srvtopo.ResolvedShard,
	bindVars []map[string]*querypb.BindVariable,
	session *SafeSession,
	autocommit bool,
	callback func(reply *sqltypes.Result) error,
) []error {
	// Fire a lock heartbeat query in the background when the session
	// reports that one is due.
	if session.InLockSession() && session.TriggerLockHeartBeat() {
		go stc.runLockQuery(ctx, session)
	}

	allErrors := stc.multiGoTransaction(
		ctx,
		"StreamExecute",
		rss,
		session,
		autocommit,
		func(rs *srvtopo.ResolvedShard, i int, info *shardActionInfo) (*shardActionInfo, error) {
			var (
				err   error
				opts  *querypb.ExecuteOptions
				alias *topodatapb.TabletAlias
				qs    queryservice.QueryService
			)
			// Start from the IDs already known for this shard; the calls
			// below overwrite them with whatever the tablet returns.
			transactionID := info.transactionID
			reservedID := info.reservedID

			if session != nil && session.Session != nil {
				opts = session.Session.Options
			}

			if autocommit {
				// As this is auto-commit, the transactionID is supposed to be zero.
				if transactionID != int64(0) {
					return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "in autocommit mode, transactionID should be zero but was: %d", transactionID)
				}
			}

			qs, err = getQueryService(rs, info, session, false)
			if err != nil {
				return nil, err
			}

			// retryRequest inspects the captured err and, when the shard
			// session could be reset, runs exec once more. For newQS the
			// retry is routed through the gateway rather than the
			// original query service.
			retryRequest := func(exec func()) {
				retry := checkAndResetShardSession(info, err, session, rs.Target)
				switch retry {
				case newQS:
					// Current tablet is not available, try querying new tablet using gateway.
					qs = rs.Gateway
					fallthrough
				case shard:
					// if we need to reset a reserved connection, here is our chance to try executing again,
					// against a new connection
					exec()
				}
			}

			switch info.actionNeeded {
			case nothing:
				err = qs.StreamExecute(ctx, rs.Target, query, bindVars[i], transactionID, reservedID, opts, callback)
				if err != nil {
					retryRequest(func() {
						// we seem to have lost our connection. it was a reserved connection, let's try to recreate it
						info.actionNeeded = reserve
						var state queryservice.ReservedState
						state, err = qs.ReserveStreamExecute(ctx, rs.Target, session.SetPreQueries(), query, bindVars[i], 0 /*transactionId*/, opts, callback)
						reservedID = state.ReservedID
						alias = state.TabletAlias
					})
				}
			case begin:
				var state queryservice.TransactionState
				state, err = qs.BeginStreamExecute(ctx, rs.Target, session.SavePoints(), query, bindVars[i], reservedID, opts, callback)
				transactionID = state.TransactionID
				alias = state.TabletAlias
				if err != nil {
					retryRequest(func() {
						// we seem to have lost our connection. it was a reserved connection, let's try to recreate it
						info.actionNeeded = reserveBegin
						var state queryservice.ReservedTransactionState
						state, err = qs.ReserveBeginStreamExecute(ctx, rs.Target, session.SetPreQueries(), session.SavePoints(), query, bindVars[i], opts, callback)
						transactionID = state.TransactionID
						reservedID = state.ReservedID
						alias = state.TabletAlias
					})
				}
			case reserve:
				var state queryservice.ReservedState
				state, err = qs.ReserveStreamExecute(ctx, rs.Target, session.SetPreQueries(), query, bindVars[i], transactionID, opts, callback)
				reservedID = state.ReservedID
				alias = state.TabletAlias
			case reserveBegin:
				var state queryservice.ReservedTransactionState
				state, err = qs.ReserveBeginStreamExecute(ctx, rs.Target, session.SetPreQueries(), session.SavePoints(), query, bindVars[i], opts, callback)
				transactionID = state.TransactionID
				reservedID = state.ReservedID
				alias = state.TabletAlias
			default:
				return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "[BUG] unexpected actionNeeded on query execution: %v", info.actionNeeded)
			}
			// The boolean argument is true when this call also began a
			// transaction (begin or reserveBegin).
			session.logging.log(primitive, rs.Target, rs.Gateway, query, info.actionNeeded == begin || info.actionNeeded == reserveBegin, bindVars[i])

			// We need the new shard info irrespective of the error.
			newInfo := info.updateTransactionAndReservedID(transactionID, reservedID, alias)
			if err != nil {
				return newInfo, err
			}

			return newInfo, nil
		},
	)
	return allErrors.GetErrors()
}
   462  
// timeTracker is a convenience wrapper used by MessageStream
// to track how long a stream has been unavailable.
type timeTracker struct {
	// mu guards timestamps.
	mu         sync.Mutex
	// timestamps holds, per target, the time stored by the first Record
	// call since the last Reset. Keys are *querypb.Target pointers, so
	// entries are matched by pointer identity.
	timestamps map[*querypb.Target]time.Time
}
   469  
   470  func newTimeTracker() *timeTracker {
   471  	return &timeTracker{
   472  		timestamps: make(map[*querypb.Target]time.Time),
   473  	}
   474  }
   475  
   476  // Reset resets the timestamp set by Record.
   477  func (tt *timeTracker) Reset(target *querypb.Target) {
   478  	tt.mu.Lock()
   479  	defer tt.mu.Unlock()
   480  	delete(tt.timestamps, target)
   481  }
   482  
   483  // Record records the time to Now if there was no previous timestamp,
   484  // and it keeps returning that value until the next Reset.
   485  func (tt *timeTracker) Record(target *querypb.Target) time.Time {
   486  	tt.mu.Lock()
   487  	defer tt.mu.Unlock()
   488  	last, ok := tt.timestamps[target]
   489  	if !ok {
   490  		last = time.Now()
   491  		tt.timestamps[target] = last
   492  	}
   493  	return last
   494  }
   495  
   496  // MessageStream streams messages from the specified shards.
   497  // Note we guarantee the callback will not be called concurrently
   498  // by multiple go routines, through processOneStreamingResult.
   499  func (stc *ScatterConn) MessageStream(ctx context.Context, rss []*srvtopo.ResolvedShard, name string, callback func(*sqltypes.Result) error) error {
   500  	// The cancelable context is used for handling errors
   501  	// from individual streams.
   502  	ctx, cancel := context.WithCancel(ctx)
   503  	defer cancel()
   504  
   505  	// mu is used to merge multiple callback calls into one.
   506  	var mu sync.Mutex
   507  	fieldSent := false
   508  	lastErrors := newTimeTracker()
   509  	allErrors := stc.multiGo("MessageStream", rss, func(rs *srvtopo.ResolvedShard, i int) error {
   510  		// This loop handles the case where a reparent happens, which can cause
   511  		// an individual stream to end. If we don't succeed on the retries for
   512  		// messageStreamGracePeriod, we abort and return an error.
   513  		for {
   514  			err := rs.Gateway.MessageStream(ctx, rs.Target, name, func(qr *sqltypes.Result) error {
   515  				lastErrors.Reset(rs.Target)
   516  				return stc.processOneStreamingResult(&mu, &fieldSent, qr, callback)
   517  			})
   518  			// nil and EOF are equivalent. UNAVAILABLE can be returned by vttablet if it's demoted
   519  			// from primary to replica. For any of these conditions, we have to retry.
   520  			if err != nil && err != io.EOF && vterrors.Code(err) != vtrpcpb.Code_UNAVAILABLE {
   521  				cancel()
   522  				return err
   523  			}
   524  
   525  			// There was no error. We have to see if we need to retry.
   526  			// If context was canceled, likely due to client disconnect,
   527  			// return normally without retrying.
   528  			select {
   529  			case <-ctx.Done():
   530  				return nil
   531  			default:
   532  			}
   533  			firstErrorTimeStamp := lastErrors.Record(rs.Target)
   534  			if time.Since(firstErrorTimeStamp) >= messageStreamGracePeriod {
   535  				// Cancel all streams and return an error.
   536  				cancel()
   537  				return vterrors.Errorf(vtrpcpb.Code_DEADLINE_EXCEEDED, "message stream from %v has repeatedly failed for longer than %v", rs.Target, messageStreamGracePeriod)
   538  			}
   539  
   540  			// It's not been too long since our last good send. Wait and retry.
   541  			select {
   542  			case <-ctx.Done():
   543  				return nil
   544  			case <-time.After(messageStreamGracePeriod / 5):
   545  			}
   546  		}
   547  	})
   548  	return allErrors.AggrError(vterrors.Aggregate)
   549  }
   550  
   551  // Close closes the underlying Gateway.
   552  func (stc *ScatterConn) Close() error {
   553  	return stc.gateway.Close(context.Background())
   554  }
   555  
   556  // GetGatewayCacheStatus returns a displayable version of the Gateway cache.
   557  func (stc *ScatterConn) GetGatewayCacheStatus() TabletCacheStatusList {
   558  	return stc.gateway.CacheStatus()
   559  }
   560  
   561  // GetHealthCheckCacheStatus returns a displayable version of the HealthCheck cache.
   562  func (stc *ScatterConn) GetHealthCheckCacheStatus() discovery.TabletsCacheStatusList {
   563  	return stc.gateway.TabletsCacheStatus()
   564  }
   565  
   566  // multiGo performs the requested 'action' on the specified
   567  // shards in parallel. This does not handle any transaction state.
   568  // The action function must match the shardActionFunc2 signature.
   569  func (stc *ScatterConn) multiGo(
   570  	name string,
   571  	rss []*srvtopo.ResolvedShard,
   572  	action shardActionFunc,
   573  ) (allErrors *concurrency.AllErrorRecorder) {
   574  	allErrors = new(concurrency.AllErrorRecorder)
   575  	if len(rss) == 0 {
   576  		return allErrors
   577  	}
   578  
   579  	oneShard := func(rs *srvtopo.ResolvedShard, i int) {
   580  		var err error
   581  		startTime, statsKey := stc.startAction(name, rs.Target)
   582  		// Send a dummy session.
   583  		// TODO(sougou): plumb a real session through this call.
   584  		defer stc.endAction(startTime, allErrors, statsKey, &err, NewSafeSession(nil))
   585  		err = action(rs, i)
   586  	}
   587  
   588  	if len(rss) == 1 {
   589  		// only one shard, do it synchronously.
   590  		oneShard(rss[0], 0)
   591  		return allErrors
   592  	}
   593  
   594  	var wg sync.WaitGroup
   595  	for i, rs := range rss {
   596  		wg.Add(1)
   597  		go func(rs *srvtopo.ResolvedShard, i int) {
   598  			defer wg.Done()
   599  			oneShard(rs, i)
   600  		}(rs, i)
   601  	}
   602  	wg.Wait()
   603  	return allErrors
   604  }
   605  
   606  // multiGoTransaction performs the requested 'action' on the specified
   607  // ResolvedShards in parallel. For each shard, if the requested
   608  // session is in a transaction, it opens a new transactions on the connection,
   609  // and updates the Session with the transaction id. If the session already
   610  // contains a transaction id for the shard, it reuses it.
   611  // The action function must match the shardActionTransactionFunc signature.
   612  //
   613  // It returns an error recorder in which each shard error is recorded positionally,
   614  // i.e. if rss[2] had an error, then the error recorder will store that error
   615  // in the second position.
   616  func (stc *ScatterConn) multiGoTransaction(
   617  	ctx context.Context,
   618  	name string,
   619  	rss []*srvtopo.ResolvedShard,
   620  	session *SafeSession,
   621  	autocommit bool,
   622  	action shardActionTransactionFunc,
   623  ) (allErrors *concurrency.AllErrorRecorder) {
   624  
   625  	numShards := len(rss)
   626  	allErrors = new(concurrency.AllErrorRecorder)
   627  
   628  	if numShards == 0 {
   629  		return allErrors
   630  	}
   631  	oneShard := func(rs *srvtopo.ResolvedShard, i int) {
   632  		var err error
   633  		startTime, statsKey := stc.startAction(name, rs.Target)
   634  		defer stc.endAction(startTime, allErrors, statsKey, &err, session)
   635  
   636  		shardActionInfo, err := actionInfo(ctx, rs.Target, session, autocommit, stc.txConn.mode)
   637  		if err != nil {
   638  			return
   639  		}
   640  		updated, err := action(rs, i, shardActionInfo)
   641  		if updated == nil {
   642  			return
   643  		}
   644  		if updated.actionNeeded != nothing && (updated.transactionID != 0 || updated.reservedID != 0) {
   645  			appendErr := session.AppendOrUpdate(&vtgatepb.Session_ShardSession{
   646  				Target:        rs.Target,
   647  				TransactionId: updated.transactionID,
   648  				ReservedId:    updated.reservedID,
   649  				TabletAlias:   updated.alias,
   650  			}, stc.txConn.mode)
   651  			if appendErr != nil {
   652  				err = appendErr
   653  			}
   654  		}
   655  	}
   656  
   657  	if numShards == 1 {
   658  		// only one shard, do it synchronously.
   659  		for i, rs := range rss {
   660  			oneShard(rs, i)
   661  		}
   662  	} else {
   663  		var wg sync.WaitGroup
   664  		for i, rs := range rss {
   665  			wg.Add(1)
   666  			go func(rs *srvtopo.ResolvedShard, i int) {
   667  				defer wg.Done()
   668  				oneShard(rs, i)
   669  			}(rs, i)
   670  		}
   671  		wg.Wait()
   672  	}
   673  
   674  	if session.MustRollback() {
   675  		_ = stc.txConn.Rollback(ctx, session)
   676  	}
   677  	return allErrors
   678  }
   679  
// ExecuteLock executes query on the ResolvedShard rs for a locking
// function. If the lock session already has a reserved connection,
// it reuses it. Otherwise it opens a new reserved connection and stores it
// in the session as the lock session.
//
// session and session.Session must be non-nil. When the lock info for the
// target cannot be obtained, previously held locks are released and the
// wrapped error is returned.
//
// It returns the query result, or a nil result and the error on failure.
// Timing and errors are reported through startAction and endLockAction
// under the "ExecuteLock" operation.
// NOTE(review): the nil-session and unexpected-actionNeeded returns do not
// assign the local err, so the deferred endLockAction does not count them
// as errors - confirm this is intended.
func (stc *ScatterConn) ExecuteLock(ctx context.Context, rs *srvtopo.ResolvedShard, query *querypb.BoundQuery, session *SafeSession, lockFuncType sqlparser.LockingFuncType) (*sqltypes.Result, error) {

	var (
		qr    *sqltypes.Result
		err   error
		opts  *querypb.ExecuteOptions
		alias *topodatapb.TabletAlias
	)
	// allErrors only feeds endLockAction; it is not returned to the caller.
	allErrors := new(concurrency.AllErrorRecorder)
	startTime, statsKey := stc.startAction("ExecuteLock", rs.Target)
	defer stc.endLockAction(startTime, allErrors, statsKey, &err)

	if session == nil || session.Session == nil {
		return nil, vterrors.VT13001("session cannot be nil")
	}

	opts = session.Session.Options
	info, err := lockInfo(rs.Target, session, lockFuncType)
	// Lock session is created on alphabetic sorted keyspace.
	// This error will occur if the existing session target does not match the current target.
	// This will happen either due to re-sharding or a new keyspace which comes before the existing order.
	// In which case, we will try to release old locks and return error.
	if err != nil {
		_ = stc.txConn.ReleaseLock(ctx, session)
		return nil, vterrors.Wrap(err, "Any previous held locks are released")
	}
	// skipReset is true, so getQueryService never touches the (nil)
	// session argument and a failed alias lookup is returned as-is.
	qs, err := getQueryService(rs, info, nil, true)
	if err != nil {
		return nil, err
	}
	reservedID := info.reservedID

	switch info.actionNeeded {
	case nothing:
		qr, err = qs.Execute(ctx, rs.Target, query.Sql, query.BindVariables, 0 /* transactionID */, reservedID, opts)
		if err != nil && wasConnectionClosed(err) {
			// TODO: try to acquire lock again.
			session.ResetLock()
			err = vterrors.Wrap(err, "held locks released")
		}
		if reservedID != 0 {
			session.UpdateLockHeartbeat()
		}
	case reserve:
		var state queryservice.ReservedState
		state, qr, err = qs.ReserveExecute(ctx, rs.Target, session.SetPreQueries(), query.Sql, query.BindVariables, 0 /* transactionID */, opts)
		reservedID = state.ReservedID
		alias = state.TabletAlias
		// The query failed even though a connection was reserved.
		if err != nil && reservedID != 0 {
			_ = stc.txConn.ReleaseLock(ctx, session)
		}

		// Remember the reserved connection as the session's lock session.
		if reservedID != 0 {
			session.SetLockSession(&vtgatepb.Session_ShardSession{
				Target:      rs.Target,
				ReservedId:  reservedID,
				TabletAlias: alias,
			})
		}
	default:
		return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "[BUG] unexpected actionNeeded on lock execution: %v", info.actionNeeded)
	}

	if err != nil {
		return nil, err
	}
	return qr, err
}
   756  
   757  func wasConnectionClosed(err error) bool {
   758  	sqlErr := mysql.NewSQLErrorFromError(err).(*mysql.SQLError)
   759  	message := sqlErr.Error()
   760  
   761  	switch sqlErr.Number() {
   762  	case mysql.CRServerGone, mysql.CRServerLost:
   763  		return true
   764  	case mysql.ERQueryInterrupted:
   765  		return vterrors.TxClosed.MatchString(message)
   766  	default:
   767  		return false
   768  	}
   769  }
   770  
   771  // requireNewQS this checks if we need to fallback to new tablet.
   772  func requireNewQS(err error, target *querypb.Target) bool {
   773  	code := vterrors.Code(err)
   774  	msg := err.Error()
   775  	switch code {
   776  	// when the tablet or mysql is unavailable for any reason.
   777  	case vtrpcpb.Code_UNAVAILABLE:
   778  		return true
   779  	// when received wrong tablet error message.
   780  	case vtrpcpb.Code_FAILED_PRECONDITION:
   781  		return vterrors.RxWrongTablet.MatchString(msg)
   782  	// when received cluster_event from tablet and tablet is not operational.
   783  	// this will also help in buffering the query if needed.
   784  	case vtrpcpb.Code_CLUSTER_EVENT:
   785  		return (target != nil && target.TabletType == topodatapb.TabletType_PRIMARY) || vterrors.RxOp.MatchString(msg)
   786  	}
   787  	return false
   788  }
   789  
   790  // actionInfo looks at the current session, and returns information about what needs to be done for this tablet
   791  func actionInfo(ctx context.Context, target *querypb.Target, session *SafeSession, autocommit bool, txMode vtgatepb.TransactionMode) (*shardActionInfo, error) {
   792  	if !(session.InTransaction() || session.InReservedConn()) {
   793  		return &shardActionInfo{}, nil
   794  	}
   795  	ignoreSession := ctx.Value(engine.IgnoreReserveTxn)
   796  	if ignoreSession != nil {
   797  		return &shardActionInfo{}, nil
   798  	}
   799  	// No need to protect ourselves from the race condition between
   800  	// Find and AppendOrUpdate. The higher level functions ensure that no
   801  	// duplicate (target) tuples can execute
   802  	// this at the same time.
   803  	transactionID, reservedID, alias, err := session.FindAndChangeSessionIfInSingleTxMode(target.Keyspace, target.Shard, target.TabletType, txMode)
   804  	if err != nil {
   805  		return nil, err
   806  	}
   807  
   808  	shouldReserve := session.InReservedConn() && reservedID == 0
   809  	shouldBegin := session.InTransaction() && transactionID == 0 && !autocommit
   810  
   811  	var act = nothing
   812  	switch {
   813  	case shouldBegin && shouldReserve:
   814  		act = reserveBegin
   815  	case shouldReserve:
   816  		act = reserve
   817  	case shouldBegin:
   818  		act = begin
   819  	}
   820  
   821  	return &shardActionInfo{
   822  		actionNeeded:  act,
   823  		transactionID: transactionID,
   824  		reservedID:    reservedID,
   825  		alias:         alias,
   826  	}, nil
   827  }
   828  
   829  // lockInfo looks at the current session, and returns information about what needs to be done for this tablet
   830  func lockInfo(target *querypb.Target, session *SafeSession, lockFuncType sqlparser.LockingFuncType) (*shardActionInfo, error) {
   831  	info := &shardActionInfo{actionNeeded: nothing}
   832  	if session.LockSession != nil {
   833  		if !proto.Equal(target, session.LockSession.Target) {
   834  			return nil, vterrors.Errorf(vtrpcpb.Code_NOT_FOUND, "target does match the existing lock session target: (%v, %v)", target, session.LockSession.Target)
   835  		}
   836  		info.reservedID = session.LockSession.ReservedId
   837  		info.alias = session.LockSession.TabletAlias
   838  	}
   839  	// Only GetLock needs to start a reserved connection.
   840  	// Once in reserved connection, it will be used for other calls as well.
   841  	// But, we don't want to start a reserved connection for other calls like IsFreeLock, IsUsedLock, etc.
   842  	if lockFuncType != sqlparser.GetLock {
   843  		return info, nil
   844  	}
   845  	if info.reservedID == 0 {
   846  		info.actionNeeded = reserve
   847  	}
   848  	return info, nil
   849  }
   850  
   851  type shardActionInfo struct {
   852  	actionNeeded              actionNeeded
   853  	reservedID, transactionID int64
   854  	alias                     *topodatapb.TabletAlias
   855  }
   856  
   857  func (sai *shardActionInfo) updateTransactionAndReservedID(txID int64, rID int64, alias *topodatapb.TabletAlias) *shardActionInfo {
   858  	if txID == sai.transactionID && rID == sai.reservedID {
   859  		// As transaction id and reserved id have not changed, there is nothing to update in session shard sessions.
   860  		return nil
   861  	}
   862  	newInfo := *sai
   863  	newInfo.reservedID = rID
   864  	newInfo.transactionID = txID
   865  	newInfo.alias = alias
   866  	return &newInfo
   867  }
   868  
   869  type actionNeeded int
   870  
   871  const (
   872  	nothing actionNeeded = iota
   873  	reserveBegin
   874  	reserve
   875  	begin
   876  )