vitess.io/vitess@v0.16.2/go/vt/vtgate/scatter_conn.go (about) 1 /* 2 Copyright 2019 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package vtgate 18 19 import ( 20 "context" 21 "io" 22 "sync" 23 "time" 24 25 "vitess.io/vitess/go/vt/sqlparser" 26 27 "google.golang.org/protobuf/proto" 28 29 "vitess.io/vitess/go/mysql" 30 "vitess.io/vitess/go/sqltypes" 31 "vitess.io/vitess/go/stats" 32 "vitess.io/vitess/go/vt/concurrency" 33 "vitess.io/vitess/go/vt/discovery" 34 "vitess.io/vitess/go/vt/log" 35 "vitess.io/vitess/go/vt/srvtopo" 36 "vitess.io/vitess/go/vt/topo/topoproto" 37 "vitess.io/vitess/go/vt/vterrors" 38 "vitess.io/vitess/go/vt/vtgate/engine" 39 "vitess.io/vitess/go/vt/vttablet/queryservice" 40 41 querypb "vitess.io/vitess/go/vt/proto/query" 42 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 43 vtgatepb "vitess.io/vitess/go/vt/proto/vtgate" 44 vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" 45 ) 46 47 // ScatterConn is used for executing queries across 48 // multiple shard level connections. 49 type ScatterConn struct { 50 timings *stats.MultiTimings 51 tabletCallErrorCount *stats.CountersWithMultiLabels 52 txConn *TxConn 53 gateway *TabletGateway 54 } 55 56 // shardActionFunc defines the contract for a shard action 57 // outside of a transaction. Every such function executes the 58 // necessary action on a shard, sends the results to sResults, and 59 // return an error if any. multiGo is capable of executing 60 // multiple shardActionFunc actions in parallel and 61 // consolidating the results and errors for the caller. 62 type shardActionFunc func(rs *srvtopo.ResolvedShard, i int) error 63 64 // shardActionTransactionFunc defines the contract for a shard action 65 // that may be in a transaction. Every such function executes the 66 // necessary action on a shard (with an optional Begin call), aggregates 67 // the results, and return an error if any. 68 // multiGoTransaction is capable of executing multiple 69 // shardActionTransactionFunc actions in parallel and consolidating 70 // the results and errors for the caller. 71 type shardActionTransactionFunc func(rs *srvtopo.ResolvedShard, i int, shardActionInfo *shardActionInfo) (*shardActionInfo, error) 72 73 // NewScatterConn creates a new ScatterConn. 74 func NewScatterConn(statsName string, txConn *TxConn, gw *TabletGateway) *ScatterConn { 75 // this only works with TabletGateway 76 tabletCallErrorCountStatsName := "" 77 if statsName != "" { 78 tabletCallErrorCountStatsName = statsName + "ErrorCount" 79 } 80 return &ScatterConn{ 81 timings: stats.NewMultiTimings( 82 statsName, 83 "Scatter connection timings", 84 []string{"Operation", "Keyspace", "ShardName", "DbType"}), 85 tabletCallErrorCount: stats.NewCountersWithMultiLabels( 86 tabletCallErrorCountStatsName, 87 "Error count from tablet calls in scatter conns", 88 []string{"Operation", "Keyspace", "ShardName", "DbType"}), 89 txConn: txConn, 90 gateway: gw, 91 } 92 } 93 94 func (stc *ScatterConn) startAction(name string, target *querypb.Target) (time.Time, []string) { 95 statsKey := []string{name, target.Keyspace, target.Shard, topoproto.TabletTypeLString(target.TabletType)} 96 startTime := time.Now() 97 return startTime, statsKey 98 } 99 100 func (stc *ScatterConn) endAction(startTime time.Time, allErrors *concurrency.AllErrorRecorder, statsKey []string, err *error, session *SafeSession) { 101 if *err != nil { 102 allErrors.RecordError(*err) 103 // Don't increment the error counter for duplicate 104 // keys or bad queries, as those errors are caused by 105 // client queries and are not VTGate's fault. 106 ec := vterrors.Code(*err) 107 if ec != vtrpcpb.Code_ALREADY_EXISTS && ec != vtrpcpb.Code_INVALID_ARGUMENT { 108 stc.tabletCallErrorCount.Add(statsKey, 1) 109 } 110 if ec == vtrpcpb.Code_RESOURCE_EXHAUSTED || ec == vtrpcpb.Code_ABORTED { 111 session.SetRollback() 112 } 113 } 114 stc.timings.Record(statsKey, startTime) 115 } 116 117 func (stc *ScatterConn) endLockAction(startTime time.Time, allErrors *concurrency.AllErrorRecorder, statsKey []string, err *error) { 118 if *err != nil { 119 allErrors.RecordError(*err) 120 stc.tabletCallErrorCount.Add(statsKey, 1) 121 } 122 stc.timings.Record(statsKey, startTime) 123 } 124 125 type reset int 126 127 const ( 128 none reset = iota 129 shard 130 newQS 131 ) 132 133 // ExecuteMultiShard is like Execute, 134 // but each shard gets its own Sql Queries and BindVariables. 135 // 136 // It always returns a non-nil query result and an array of 137 // shard errors which may be nil so that callers can optionally 138 // process a partially-successful operation. 139 func (stc *ScatterConn) ExecuteMultiShard( 140 ctx context.Context, 141 primitive engine.Primitive, 142 rss []*srvtopo.ResolvedShard, 143 queries []*querypb.BoundQuery, 144 session *SafeSession, 145 autocommit bool, 146 ignoreMaxMemoryRows bool, 147 ) (qr *sqltypes.Result, errs []error) { 148 149 if len(rss) != len(queries) { 150 return nil, []error{vterrors.Errorf(vtrpcpb.Code_INTERNAL, "[BUG] got mismatched number of queries and shards")} 151 } 152 153 // mu protects qr 154 var mu sync.Mutex 155 qr = new(sqltypes.Result) 156 157 if session.InLockSession() && session.TriggerLockHeartBeat() { 158 go stc.runLockQuery(ctx, session) 159 } 160 161 allErrors := stc.multiGoTransaction( 162 ctx, 163 "Execute", 164 rss, 165 session, 166 autocommit, 167 func(rs *srvtopo.ResolvedShard, i int, info *shardActionInfo) (*shardActionInfo, error) { 168 var ( 169 innerqr *sqltypes.Result 170 err error 171 opts *querypb.ExecuteOptions 172 alias *topodatapb.TabletAlias 173 qs queryservice.QueryService 174 ) 175 transactionID := info.transactionID 176 reservedID := info.reservedID 177 178 if session != nil && session.Session != nil { 179 opts = session.Session.Options 180 } 181 182 if autocommit { 183 // As this is auto-commit, the transactionID is supposed to be zero. 184 if transactionID != int64(0) { 185 return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "in autocommit mode, transactionID should be zero but was: %d", transactionID) 186 } 187 } 188 189 qs, err = getQueryService(rs, info, session, false) 190 if err != nil { 191 return nil, err 192 } 193 194 retryRequest := func(exec func()) { 195 retry := checkAndResetShardSession(info, err, session, rs.Target) 196 switch retry { 197 case newQS: 198 // Current tablet is not available, try querying new tablet using gateway. 199 qs = rs.Gateway 200 fallthrough 201 case shard: 202 // if we need to reset a reserved connection, here is our chance to try executing again, 203 // against a new connection 204 exec() 205 } 206 } 207 208 switch info.actionNeeded { 209 case nothing: 210 innerqr, err = qs.Execute(ctx, rs.Target, queries[i].Sql, queries[i].BindVariables, info.transactionID, info.reservedID, opts) 211 if err != nil { 212 retryRequest(func() { 213 // we seem to have lost our connection. it was a reserved connection, let's try to recreate it 214 info.actionNeeded = reserve 215 var state queryservice.ReservedState 216 state, innerqr, err = qs.ReserveExecute(ctx, rs.Target, session.SetPreQueries(), queries[i].Sql, queries[i].BindVariables, 0 /*transactionId*/, opts) 217 reservedID = state.ReservedID 218 alias = state.TabletAlias 219 }) 220 } 221 case begin: 222 var state queryservice.TransactionState 223 state, innerqr, err = qs.BeginExecute(ctx, rs.Target, session.SavePoints(), queries[i].Sql, queries[i].BindVariables, reservedID, opts) 224 transactionID = state.TransactionID 225 alias = state.TabletAlias 226 if err != nil { 227 retryRequest(func() { 228 // we seem to have lost our connection. it was a reserved connection, let's try to recreate it 229 info.actionNeeded = reserveBegin 230 var state queryservice.ReservedTransactionState 231 state, innerqr, err = qs.ReserveBeginExecute(ctx, rs.Target, session.SetPreQueries(), session.SavePoints(), queries[i].Sql, queries[i].BindVariables, opts) 232 transactionID = state.TransactionID 233 reservedID = state.ReservedID 234 alias = state.TabletAlias 235 }) 236 } 237 case reserve: 238 var state queryservice.ReservedState 239 state, innerqr, err = qs.ReserveExecute(ctx, rs.Target, session.SetPreQueries(), queries[i].Sql, queries[i].BindVariables, transactionID, opts) 240 reservedID = state.ReservedID 241 alias = state.TabletAlias 242 case reserveBegin: 243 var state queryservice.ReservedTransactionState 244 state, innerqr, err = qs.ReserveBeginExecute(ctx, rs.Target, session.SetPreQueries(), session.SavePoints(), queries[i].Sql, queries[i].BindVariables, opts) 245 transactionID = state.TransactionID 246 reservedID = state.ReservedID 247 alias = state.TabletAlias 248 default: 249 return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "[BUG] unexpected actionNeeded on query execution: %v", info.actionNeeded) 250 } 251 session.logging.log(primitive, rs.Target, rs.Gateway, queries[i].Sql, info.actionNeeded == begin || info.actionNeeded == reserveBegin, queries[i].BindVariables) 252 253 // We need to new shard info irrespective of the error. 254 newInfo := info.updateTransactionAndReservedID(transactionID, reservedID, alias) 255 if err != nil { 256 return newInfo, err 257 } 258 mu.Lock() 259 defer mu.Unlock() 260 261 // Don't append more rows if row count is exceeded. 262 if ignoreMaxMemoryRows || len(qr.Rows) <= maxMemoryRows { 263 qr.AppendResult(innerqr) 264 } 265 return newInfo, nil 266 }, 267 ) 268 269 if !ignoreMaxMemoryRows && len(qr.Rows) > maxMemoryRows { 270 return nil, []error{vterrors.NewErrorf(vtrpcpb.Code_RESOURCE_EXHAUSTED, vterrors.NetPacketTooLarge, "in-memory row count exceeded allowed limit of %d", maxMemoryRows)} 271 } 272 273 return qr, allErrors.GetErrors() 274 } 275 276 func (stc *ScatterConn) runLockQuery(ctx context.Context, session *SafeSession) { 277 rs := &srvtopo.ResolvedShard{Target: session.LockSession.Target, Gateway: stc.gateway} 278 query := &querypb.BoundQuery{Sql: "select 1", BindVariables: nil} 279 _, lockErr := stc.ExecuteLock(ctx, rs, query, session, sqlparser.IsUsedLock) 280 if lockErr != nil { 281 log.Warningf("Locking heartbeat failed, held locks might be released: %s", lockErr.Error()) 282 } 283 } 284 285 func checkAndResetShardSession(info *shardActionInfo, err error, session *SafeSession, target *querypb.Target) reset { 286 retry := none 287 if info.reservedID != 0 && info.transactionID == 0 { 288 if wasConnectionClosed(err) { 289 retry = shard 290 } 291 if requireNewQS(err, target) { 292 retry = newQS 293 } 294 } 295 if retry != none { 296 _ = session.ResetShard(info.alias) 297 } 298 return retry 299 } 300 301 func getQueryService(rs *srvtopo.ResolvedShard, info *shardActionInfo, session *SafeSession, skipReset bool) (queryservice.QueryService, error) { 302 if info.alias == nil { 303 return rs.Gateway, nil 304 } 305 qs, err := rs.Gateway.QueryServiceByAlias(info.alias, rs.Target) 306 if err == nil || skipReset { 307 return qs, err 308 } 309 // If the session info has only reserved connection and no transaction then we will route it through gateway 310 // Otherwise, we will fail. 311 if info.reservedID == 0 || info.transactionID != 0 { 312 return nil, err 313 } 314 err = session.ResetShard(info.alias) 315 if err != nil { 316 return nil, err 317 } 318 // Returning rs.Gateway will make the gateway to choose new healthy tablet for the targeted tablet type. 319 return rs.Gateway, nil 320 } 321 322 func (stc *ScatterConn) processOneStreamingResult(mu *sync.Mutex, fieldSent *bool, qr *sqltypes.Result, callback func(*sqltypes.Result) error) error { 323 mu.Lock() 324 defer mu.Unlock() 325 if *fieldSent { 326 if len(qr.Rows) == 0 { 327 // It's another field info result. Don't send. 328 return nil 329 } 330 } else { 331 if len(qr.Fields) == 0 { 332 // Unreachable: this can happen only if vttablet misbehaves. 333 return vterrors.VT13001("received rows before fields") 334 } 335 *fieldSent = true 336 } 337 338 return callback(qr) 339 } 340 341 // StreamExecuteMulti is like StreamExecute, 342 // but each shard gets its own bindVars. If len(shards) is not equal to 343 // len(bindVars), the function panics. 344 // Note we guarantee the callback will not be called concurrently 345 // by multiple go routines, through processOneStreamingResult. 346 func (stc *ScatterConn) StreamExecuteMulti( 347 ctx context.Context, 348 primitive engine.Primitive, 349 query string, 350 rss []*srvtopo.ResolvedShard, 351 bindVars []map[string]*querypb.BindVariable, 352 session *SafeSession, 353 autocommit bool, 354 callback func(reply *sqltypes.Result) error, 355 ) []error { 356 if session.InLockSession() && session.TriggerLockHeartBeat() { 357 go stc.runLockQuery(ctx, session) 358 } 359 360 allErrors := stc.multiGoTransaction( 361 ctx, 362 "StreamExecute", 363 rss, 364 session, 365 autocommit, 366 func(rs *srvtopo.ResolvedShard, i int, info *shardActionInfo) (*shardActionInfo, error) { 367 var ( 368 err error 369 opts *querypb.ExecuteOptions 370 alias *topodatapb.TabletAlias 371 qs queryservice.QueryService 372 ) 373 transactionID := info.transactionID 374 reservedID := info.reservedID 375 376 if session != nil && session.Session != nil { 377 opts = session.Session.Options 378 } 379 380 if autocommit { 381 // As this is auto-commit, the transactionID is supposed to be zero. 382 if transactionID != int64(0) { 383 return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "in autocommit mode, transactionID should be zero but was: %d", transactionID) 384 } 385 } 386 387 qs, err = getQueryService(rs, info, session, false) 388 if err != nil { 389 return nil, err 390 } 391 392 retryRequest := func(exec func()) { 393 retry := checkAndResetShardSession(info, err, session, rs.Target) 394 switch retry { 395 case newQS: 396 // Current tablet is not available, try querying new tablet using gateway. 397 qs = rs.Gateway 398 fallthrough 399 case shard: 400 // if we need to reset a reserved connection, here is our chance to try executing again, 401 // against a new connection 402 exec() 403 } 404 } 405 406 switch info.actionNeeded { 407 case nothing: 408 err = qs.StreamExecute(ctx, rs.Target, query, bindVars[i], transactionID, reservedID, opts, callback) 409 if err != nil { 410 retryRequest(func() { 411 // we seem to have lost our connection. it was a reserved connection, let's try to recreate it 412 info.actionNeeded = reserve 413 var state queryservice.ReservedState 414 state, err = qs.ReserveStreamExecute(ctx, rs.Target, session.SetPreQueries(), query, bindVars[i], 0 /*transactionId*/, opts, callback) 415 reservedID = state.ReservedID 416 alias = state.TabletAlias 417 }) 418 } 419 case begin: 420 var state queryservice.TransactionState 421 state, err = qs.BeginStreamExecute(ctx, rs.Target, session.SavePoints(), query, bindVars[i], reservedID, opts, callback) 422 transactionID = state.TransactionID 423 alias = state.TabletAlias 424 if err != nil { 425 retryRequest(func() { 426 // we seem to have lost our connection. it was a reserved connection, let's try to recreate it 427 info.actionNeeded = reserveBegin 428 var state queryservice.ReservedTransactionState 429 state, err = qs.ReserveBeginStreamExecute(ctx, rs.Target, session.SetPreQueries(), session.SavePoints(), query, bindVars[i], opts, callback) 430 transactionID = state.TransactionID 431 reservedID = state.ReservedID 432 alias = state.TabletAlias 433 }) 434 } 435 case reserve: 436 var state queryservice.ReservedState 437 state, err = qs.ReserveStreamExecute(ctx, rs.Target, session.SetPreQueries(), query, bindVars[i], transactionID, opts, callback) 438 reservedID = state.ReservedID 439 alias = state.TabletAlias 440 case reserveBegin: 441 var state queryservice.ReservedTransactionState 442 state, err = qs.ReserveBeginStreamExecute(ctx, rs.Target, session.SetPreQueries(), session.SavePoints(), query, bindVars[i], opts, callback) 443 transactionID = state.TransactionID 444 reservedID = state.ReservedID 445 alias = state.TabletAlias 446 default: 447 return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "[BUG] unexpected actionNeeded on query execution: %v", info.actionNeeded) 448 } 449 session.logging.log(primitive, rs.Target, rs.Gateway, query, info.actionNeeded == begin || info.actionNeeded == reserveBegin, bindVars[i]) 450 451 // We need to new shard info irrespective of the error. 452 newInfo := info.updateTransactionAndReservedID(transactionID, reservedID, alias) 453 if err != nil { 454 return newInfo, err 455 } 456 457 return newInfo, nil 458 }, 459 ) 460 return allErrors.GetErrors() 461 } 462 463 // timeTracker is a convenience wrapper used by MessageStream 464 // to track how long a stream has been unavailable. 465 type timeTracker struct { 466 mu sync.Mutex 467 timestamps map[*querypb.Target]time.Time 468 } 469 470 func newTimeTracker() *timeTracker { 471 return &timeTracker{ 472 timestamps: make(map[*querypb.Target]time.Time), 473 } 474 } 475 476 // Reset resets the timestamp set by Record. 477 func (tt *timeTracker) Reset(target *querypb.Target) { 478 tt.mu.Lock() 479 defer tt.mu.Unlock() 480 delete(tt.timestamps, target) 481 } 482 483 // Record records the time to Now if there was no previous timestamp, 484 // and it keeps returning that value until the next Reset. 485 func (tt *timeTracker) Record(target *querypb.Target) time.Time { 486 tt.mu.Lock() 487 defer tt.mu.Unlock() 488 last, ok := tt.timestamps[target] 489 if !ok { 490 last = time.Now() 491 tt.timestamps[target] = last 492 } 493 return last 494 } 495 496 // MessageStream streams messages from the specified shards. 497 // Note we guarantee the callback will not be called concurrently 498 // by multiple go routines, through processOneStreamingResult. 499 func (stc *ScatterConn) MessageStream(ctx context.Context, rss []*srvtopo.ResolvedShard, name string, callback func(*sqltypes.Result) error) error { 500 // The cancelable context is used for handling errors 501 // from individual streams. 502 ctx, cancel := context.WithCancel(ctx) 503 defer cancel() 504 505 // mu is used to merge multiple callback calls into one. 506 var mu sync.Mutex 507 fieldSent := false 508 lastErrors := newTimeTracker() 509 allErrors := stc.multiGo("MessageStream", rss, func(rs *srvtopo.ResolvedShard, i int) error { 510 // This loop handles the case where a reparent happens, which can cause 511 // an individual stream to end. If we don't succeed on the retries for 512 // messageStreamGracePeriod, we abort and return an error. 513 for { 514 err := rs.Gateway.MessageStream(ctx, rs.Target, name, func(qr *sqltypes.Result) error { 515 lastErrors.Reset(rs.Target) 516 return stc.processOneStreamingResult(&mu, &fieldSent, qr, callback) 517 }) 518 // nil and EOF are equivalent. UNAVAILABLE can be returned by vttablet if it's demoted 519 // from primary to replica. For any of these conditions, we have to retry. 520 if err != nil && err != io.EOF && vterrors.Code(err) != vtrpcpb.Code_UNAVAILABLE { 521 cancel() 522 return err 523 } 524 525 // There was no error. We have to see if we need to retry. 526 // If context was canceled, likely due to client disconnect, 527 // return normally without retrying. 528 select { 529 case <-ctx.Done(): 530 return nil 531 default: 532 } 533 firstErrorTimeStamp := lastErrors.Record(rs.Target) 534 if time.Since(firstErrorTimeStamp) >= messageStreamGracePeriod { 535 // Cancel all streams and return an error. 536 cancel() 537 return vterrors.Errorf(vtrpcpb.Code_DEADLINE_EXCEEDED, "message stream from %v has repeatedly failed for longer than %v", rs.Target, messageStreamGracePeriod) 538 } 539 540 // It's not been too long since our last good send. Wait and retry. 541 select { 542 case <-ctx.Done(): 543 return nil 544 case <-time.After(messageStreamGracePeriod / 5): 545 } 546 } 547 }) 548 return allErrors.AggrError(vterrors.Aggregate) 549 } 550 551 // Close closes the underlying Gateway. 552 func (stc *ScatterConn) Close() error { 553 return stc.gateway.Close(context.Background()) 554 } 555 556 // GetGatewayCacheStatus returns a displayable version of the Gateway cache. 557 func (stc *ScatterConn) GetGatewayCacheStatus() TabletCacheStatusList { 558 return stc.gateway.CacheStatus() 559 } 560 561 // GetHealthCheckCacheStatus returns a displayable version of the HealthCheck cache. 562 func (stc *ScatterConn) GetHealthCheckCacheStatus() discovery.TabletsCacheStatusList { 563 return stc.gateway.TabletsCacheStatus() 564 } 565 566 // multiGo performs the requested 'action' on the specified 567 // shards in parallel. This does not handle any transaction state. 568 // The action function must match the shardActionFunc2 signature. 569 func (stc *ScatterConn) multiGo( 570 name string, 571 rss []*srvtopo.ResolvedShard, 572 action shardActionFunc, 573 ) (allErrors *concurrency.AllErrorRecorder) { 574 allErrors = new(concurrency.AllErrorRecorder) 575 if len(rss) == 0 { 576 return allErrors 577 } 578 579 oneShard := func(rs *srvtopo.ResolvedShard, i int) { 580 var err error 581 startTime, statsKey := stc.startAction(name, rs.Target) 582 // Send a dummy session. 583 // TODO(sougou): plumb a real session through this call. 584 defer stc.endAction(startTime, allErrors, statsKey, &err, NewSafeSession(nil)) 585 err = action(rs, i) 586 } 587 588 if len(rss) == 1 { 589 // only one shard, do it synchronously. 590 oneShard(rss[0], 0) 591 return allErrors 592 } 593 594 var wg sync.WaitGroup 595 for i, rs := range rss { 596 wg.Add(1) 597 go func(rs *srvtopo.ResolvedShard, i int) { 598 defer wg.Done() 599 oneShard(rs, i) 600 }(rs, i) 601 } 602 wg.Wait() 603 return allErrors 604 } 605 606 // multiGoTransaction performs the requested 'action' on the specified 607 // ResolvedShards in parallel. For each shard, if the requested 608 // session is in a transaction, it opens a new transactions on the connection, 609 // and updates the Session with the transaction id. If the session already 610 // contains a transaction id for the shard, it reuses it. 611 // The action function must match the shardActionTransactionFunc signature. 612 // 613 // It returns an error recorder in which each shard error is recorded positionally, 614 // i.e. if rss[2] had an error, then the error recorder will store that error 615 // in the second position. 616 func (stc *ScatterConn) multiGoTransaction( 617 ctx context.Context, 618 name string, 619 rss []*srvtopo.ResolvedShard, 620 session *SafeSession, 621 autocommit bool, 622 action shardActionTransactionFunc, 623 ) (allErrors *concurrency.AllErrorRecorder) { 624 625 numShards := len(rss) 626 allErrors = new(concurrency.AllErrorRecorder) 627 628 if numShards == 0 { 629 return allErrors 630 } 631 oneShard := func(rs *srvtopo.ResolvedShard, i int) { 632 var err error 633 startTime, statsKey := stc.startAction(name, rs.Target) 634 defer stc.endAction(startTime, allErrors, statsKey, &err, session) 635 636 shardActionInfo, err := actionInfo(ctx, rs.Target, session, autocommit, stc.txConn.mode) 637 if err != nil { 638 return 639 } 640 updated, err := action(rs, i, shardActionInfo) 641 if updated == nil { 642 return 643 } 644 if updated.actionNeeded != nothing && (updated.transactionID != 0 || updated.reservedID != 0) { 645 appendErr := session.AppendOrUpdate(&vtgatepb.Session_ShardSession{ 646 Target: rs.Target, 647 TransactionId: updated.transactionID, 648 ReservedId: updated.reservedID, 649 TabletAlias: updated.alias, 650 }, stc.txConn.mode) 651 if appendErr != nil { 652 err = appendErr 653 } 654 } 655 } 656 657 if numShards == 1 { 658 // only one shard, do it synchronously. 659 for i, rs := range rss { 660 oneShard(rs, i) 661 } 662 } else { 663 var wg sync.WaitGroup 664 for i, rs := range rss { 665 wg.Add(1) 666 go func(rs *srvtopo.ResolvedShard, i int) { 667 defer wg.Done() 668 oneShard(rs, i) 669 }(rs, i) 670 } 671 wg.Wait() 672 } 673 674 if session.MustRollback() { 675 _ = stc.txConn.Rollback(ctx, session) 676 } 677 return allErrors 678 } 679 680 // ExecuteLock performs the requested 'action' on the specified 681 // ResolvedShard. If the lock session already has a reserved connection, 682 // it reuses it. Otherwise open a new reserved connection. 683 // The action function must match the shardActionTransactionFunc signature. 684 // 685 // It returns an error recorder in which each shard error is recorded positionally, 686 // i.e. if rss[2] had an error, then the error recorder will store that error 687 // in the second position. 688 func (stc *ScatterConn) ExecuteLock(ctx context.Context, rs *srvtopo.ResolvedShard, query *querypb.BoundQuery, session *SafeSession, lockFuncType sqlparser.LockingFuncType) (*sqltypes.Result, error) { 689 690 var ( 691 qr *sqltypes.Result 692 err error 693 opts *querypb.ExecuteOptions 694 alias *topodatapb.TabletAlias 695 ) 696 allErrors := new(concurrency.AllErrorRecorder) 697 startTime, statsKey := stc.startAction("ExecuteLock", rs.Target) 698 defer stc.endLockAction(startTime, allErrors, statsKey, &err) 699 700 if session == nil || session.Session == nil { 701 return nil, vterrors.VT13001("session cannot be nil") 702 } 703 704 opts = session.Session.Options 705 info, err := lockInfo(rs.Target, session, lockFuncType) 706 // Lock session is created on alphabetic sorted keyspace. 707 // This error will occur if the existing session target does not match the current target. 708 // This will happen either due to re-sharding or a new keyspace which comes before the existing order. 709 // In which case, we will try to release old locks and return error. 710 if err != nil { 711 _ = stc.txConn.ReleaseLock(ctx, session) 712 return nil, vterrors.Wrap(err, "Any previous held locks are released") 713 } 714 qs, err := getQueryService(rs, info, nil, true) 715 if err != nil { 716 return nil, err 717 } 718 reservedID := info.reservedID 719 720 switch info.actionNeeded { 721 case nothing: 722 qr, err = qs.Execute(ctx, rs.Target, query.Sql, query.BindVariables, 0 /* transactionID */, reservedID, opts) 723 if err != nil && wasConnectionClosed(err) { 724 // TODO: try to acquire lock again. 725 session.ResetLock() 726 err = vterrors.Wrap(err, "held locks released") 727 } 728 if reservedID != 0 { 729 session.UpdateLockHeartbeat() 730 } 731 case reserve: 732 var state queryservice.ReservedState 733 state, qr, err = qs.ReserveExecute(ctx, rs.Target, session.SetPreQueries(), query.Sql, query.BindVariables, 0 /* transactionID */, opts) 734 reservedID = state.ReservedID 735 alias = state.TabletAlias 736 if err != nil && reservedID != 0 { 737 _ = stc.txConn.ReleaseLock(ctx, session) 738 } 739 740 if reservedID != 0 { 741 session.SetLockSession(&vtgatepb.Session_ShardSession{ 742 Target: rs.Target, 743 ReservedId: reservedID, 744 TabletAlias: alias, 745 }) 746 } 747 default: 748 return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "[BUG] unexpected actionNeeded on lock execution: %v", info.actionNeeded) 749 } 750 751 if err != nil { 752 return nil, err 753 } 754 return qr, err 755 } 756 757 func wasConnectionClosed(err error) bool { 758 sqlErr := mysql.NewSQLErrorFromError(err).(*mysql.SQLError) 759 message := sqlErr.Error() 760 761 switch sqlErr.Number() { 762 case mysql.CRServerGone, mysql.CRServerLost: 763 return true 764 case mysql.ERQueryInterrupted: 765 return vterrors.TxClosed.MatchString(message) 766 default: 767 return false 768 } 769 } 770 771 // requireNewQS this checks if we need to fallback to new tablet. 772 func requireNewQS(err error, target *querypb.Target) bool { 773 code := vterrors.Code(err) 774 msg := err.Error() 775 switch code { 776 // when the tablet or mysql is unavailable for any reason. 777 case vtrpcpb.Code_UNAVAILABLE: 778 return true 779 // when received wrong tablet error message. 780 case vtrpcpb.Code_FAILED_PRECONDITION: 781 return vterrors.RxWrongTablet.MatchString(msg) 782 // when received cluster_event from tablet and tablet is not operational. 783 // this will also help in buffering the query if needed. 784 case vtrpcpb.Code_CLUSTER_EVENT: 785 return (target != nil && target.TabletType == topodatapb.TabletType_PRIMARY) || vterrors.RxOp.MatchString(msg) 786 } 787 return false 788 } 789 790 // actionInfo looks at the current session, and returns information about what needs to be done for this tablet 791 func actionInfo(ctx context.Context, target *querypb.Target, session *SafeSession, autocommit bool, txMode vtgatepb.TransactionMode) (*shardActionInfo, error) { 792 if !(session.InTransaction() || session.InReservedConn()) { 793 return &shardActionInfo{}, nil 794 } 795 ignoreSession := ctx.Value(engine.IgnoreReserveTxn) 796 if ignoreSession != nil { 797 return &shardActionInfo{}, nil 798 } 799 // No need to protect ourselves from the race condition between 800 // Find and AppendOrUpdate. The higher level functions ensure that no 801 // duplicate (target) tuples can execute 802 // this at the same time. 803 transactionID, reservedID, alias, err := session.FindAndChangeSessionIfInSingleTxMode(target.Keyspace, target.Shard, target.TabletType, txMode) 804 if err != nil { 805 return nil, err 806 } 807 808 shouldReserve := session.InReservedConn() && reservedID == 0 809 shouldBegin := session.InTransaction() && transactionID == 0 && !autocommit 810 811 var act = nothing 812 switch { 813 case shouldBegin && shouldReserve: 814 act = reserveBegin 815 case shouldReserve: 816 act = reserve 817 case shouldBegin: 818 act = begin 819 } 820 821 return &shardActionInfo{ 822 actionNeeded: act, 823 transactionID: transactionID, 824 reservedID: reservedID, 825 alias: alias, 826 }, nil 827 } 828 829 // lockInfo looks at the current session, and returns information about what needs to be done for this tablet 830 func lockInfo(target *querypb.Target, session *SafeSession, lockFuncType sqlparser.LockingFuncType) (*shardActionInfo, error) { 831 info := &shardActionInfo{actionNeeded: nothing} 832 if session.LockSession != nil { 833 if !proto.Equal(target, session.LockSession.Target) { 834 return nil, vterrors.Errorf(vtrpcpb.Code_NOT_FOUND, "target does match the existing lock session target: (%v, %v)", target, session.LockSession.Target) 835 } 836 info.reservedID = session.LockSession.ReservedId 837 info.alias = session.LockSession.TabletAlias 838 } 839 // Only GetLock needs to start a reserved connection. 840 // Once in reserved connection, it will be used for other calls as well. 841 // But, we don't want to start a reserved connection for other calls like IsFreeLock, IsUsedLock, etc. 842 if lockFuncType != sqlparser.GetLock { 843 return info, nil 844 } 845 if info.reservedID == 0 { 846 info.actionNeeded = reserve 847 } 848 return info, nil 849 } 850 851 type shardActionInfo struct { 852 actionNeeded actionNeeded 853 reservedID, transactionID int64 854 alias *topodatapb.TabletAlias 855 } 856 857 func (sai *shardActionInfo) updateTransactionAndReservedID(txID int64, rID int64, alias *topodatapb.TabletAlias) *shardActionInfo { 858 if txID == sai.transactionID && rID == sai.reservedID { 859 // As transaction id and reserved id have not changed, there is nothing to update in session shard sessions. 860 return nil 861 } 862 newInfo := *sai 863 newInfo.reservedID = rID 864 newInfo.transactionID = txID 865 newInfo.alias = alias 866 return &newInfo 867 } 868 869 type actionNeeded int 870 871 const ( 872 nothing actionNeeded = iota 873 reserveBegin 874 reserve 875 begin 876 )