// Source: vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletserver/state_manager.go

/*
Copyright 2020 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package tabletserver

import (
	"context"
	"fmt"
	"sync"
	"time"

	"vitess.io/vitess/go/vt/servenv"

	"google.golang.org/protobuf/proto"

	"vitess.io/vitess/go/sync2"
	"vitess.io/vitess/go/timer"
	"vitess.io/vitess/go/vt/log"
	querypb "vitess.io/vitess/go/vt/proto/query"
	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
	vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
	"vitess.io/vitess/go/vt/vterrors"
	"vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv"
)

// servingState enumerates the serving states that stateManager moves
// between: StateNotConnected, StateNotServing and StateServing.
type servingState int64

const (
	// StateNotConnected is the state where tabletserver is not
	// connected to an underlying mysql instance. In this state we close
	// query engine since MySQL is probably unavailable
	StateNotConnected = servingState(iota)
	// StateNotServing is the state where tabletserver is connected
	// to an underlying mysql instance, but is not serving queries.
	// We do not close the query engine to not close the pool. We keep
	// the query engine open but prevent queries from running by blocking them
	// in StartRequest.
	StateNotServing
	// StateServing is where queries are allowed.
	StateServing
)

// String returns a human-readable name for the state. Any value other than
// StateServing or StateNotServing is reported as "Not connected to mysql".
func (state servingState) String() string {
	switch state {
	case StateServing:
		return "Serving"
	case StateNotServing:
		return "Not Serving"
	}
	return "Not connected to mysql"
}

// transitionRetryInterval is the pause between attempts of the retry loop
// started by retryTransition. It is a variable so that tests can change it.
var transitionRetryInterval = 1 * time.Second

// stateManager manages state transition for all the TabletServer
// subcomponents.
type stateManager struct {
	// transitioning is a semaphore that must be obtained
	// before attempting a state transition. To prevent deadlocks,
	// this must be acquired before the mu lock. We use a semaphore
	// because we need TryAcquire, which is not supported by sync.Mutex.
	// If an acquire is successful, we must either Release explicitly
	// or invoke execTransition, which will release once it's done.
	// There are no ordering restrictions on using TryAcquire.
	transitioning *sync2.Semaphore

	// mu should be held to access the group of variables under it.
	// It is required in spite of the transitioning semaphore.
	// This is because other goroutines will still want to
	// read the values while a transition is in progress.
	//
	// If a transition fails, we set retrying to true and launch
	// retryTransition which loops until the state converges.
	mu             sync.Mutex
	wantState      servingState
	wantTabletType topodatapb.TabletType
	state          servingState
	target         *querypb.Target
	terTimestamp   time.Time
	retrying       bool
	replHealthy    bool
	lameduck       bool
	alsoAllow      []topodatapb.TabletType
	reason         string
	transitionErr  error

	// requests counts the requests registered by StartRequest and not yet
	// ended by EndRequest; unserveCommon waits on it.
	requests sync.WaitGroup

	// QueryList does not have an Open or Close.
	statelessql *QueryList
	statefulql  *QueryList
	olapql      *QueryList

	// Open must be done in forward order.
	// Close must be done in reverse order.
	// All Close functions must be called before Open.
	hs          *healthStreamer
	se          schemaEngine
	rt          replTracker
	vstreamer   subComponent
	tracker     subComponent
	watcher     subComponent
	qe          queryEngine
	txThrottler txThrottler
	te          txEngine
	messager    subComponent
	ddle        onlineDDLExecutor
	throttler   lagThrottler
	tableGC     tableGarbageCollector

	// hcticks starts on initialization and runs forever.
	hcticks *timer.Timer

	// checkMySQLThrottler ensures that CheckMysql
	// doesn't get spammed.
	checkMySQLThrottler *sync2.Semaphore
	// checkMySQLRunning is true from the moment checkMySQL starts its
	// goroutine until that goroutine's deferred cleanup has run.
	checkMySQLRunning sync2.AtomicBool

	// timebombDuration bounds how long a shutdown may take before
	// setTimeBomb crashes the process; zero disables the time bomb.
	timebombDuration   time.Duration
	unhealthyThreshold sync2.AtomicDuration
	// shutdownGracePeriod is how long handleShutdownGracePeriod waits before
	// terminating OLTP queries; zero disables that behavior.
	shutdownGracePeriod time.Duration
	// transitionGracePeriod is how long the previous tablet type stays in
	// alsoAllow after a transition to primary; zero disables that behavior.
	transitionGracePeriod time.Duration
}

// The interfaces below list the methods that stateManager requires from
// each of the subcomponents it holds.
type (
	// schemaEngine is the type of stateManager.se.
	schemaEngine interface {
		EnsureConnectionAndDB(topodatapb.TabletType) error
		Open() error
		MakeNonPrimary()
		Close()
	}

	// replTracker is the type of stateManager.rt. The values returned by
	// Status are used by refreshReplHealthLocked as replication lag and error.
	replTracker interface {
		MakePrimary()
		MakeNonPrimary()
		Close()
		Status() (time.Duration, error)
	}

	// queryEngine is the type of stateManager.qe.
	queryEngine interface {
		Open() error
		IsMySQLReachable() error
		Close()
	}

	// txEngine is the type of stateManager.te.
	txEngine interface {
		AcceptReadWrite()
		AcceptReadOnly()
		Close()
	}

	// subComponent is a component whose Open cannot report an error.
	subComponent interface {
		Open()
		Close()
	}

	// txThrottler is the type of stateManager.txThrottler.
	txThrottler interface {
		Open() error
		Close()
	}

	// onlineDDLExecutor is the type of stateManager.ddle.
	onlineDDLExecutor interface {
		Open() error
		Close()
	}

	// lagThrottler is the type of stateManager.throttler.
	lagThrottler interface {
		Open() error
		Close()
	}

	// tableGarbageCollector is the type of stateManager.tableGC.
	tableGarbageCollector interface {
		Open() error
		Close()
	}
)

// Init performs the second phase of initialization.
193 func (sm *stateManager) Init(env tabletenv.Env, target *querypb.Target) { 194 sm.target = proto.Clone(target).(*querypb.Target) 195 sm.transitioning = sync2.NewSemaphore(1, 0) 196 sm.checkMySQLThrottler = sync2.NewSemaphore(1, 0) 197 sm.timebombDuration = env.Config().OltpReadPool.TimeoutSeconds.Get() * 10 198 sm.hcticks = timer.NewTimer(env.Config().Healthcheck.IntervalSeconds.Get()) 199 sm.unhealthyThreshold = sync2.NewAtomicDuration(env.Config().Healthcheck.UnhealthyThresholdSeconds.Get()) 200 sm.shutdownGracePeriod = env.Config().GracePeriods.ShutdownSeconds.Get() 201 sm.transitionGracePeriod = env.Config().GracePeriods.TransitionSeconds.Get() 202 } 203 204 // SetServingType changes the state to the specified settings. 205 // If a transition is in progress, it waits and then executes the 206 // new request. If the transition fails, it returns an error, and 207 // launches retryTransition to ensure that the request will eventually 208 // be honored. 209 // If sm is already in the requested state, it returns stateChanged as 210 // false. 211 func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, terTimestamp time.Time, state servingState, reason string) error { 212 defer sm.ExitLameduck() 213 214 sm.hs.Open() 215 sm.hcticks.Start(sm.Broadcast) 216 217 if tabletType == topodatapb.TabletType_RESTORE || tabletType == topodatapb.TabletType_BACKUP { 218 state = StateNotConnected 219 } 220 221 log.Infof("Starting transition to %v %v, timestamp: %v", tabletType, state, terTimestamp) 222 if sm.mustTransition(tabletType, terTimestamp, state, reason) { 223 return sm.execTransition(tabletType, state) 224 } 225 return nil 226 } 227 228 // mustTransition returns true if the requested state does not match the current 229 // state. If so, it acquires the semaphore and returns true. If a transition is 230 // already in progress, it waits. If the desired state is already reached, it 231 // returns false without acquiring the semaphore. 
232 func (sm *stateManager) mustTransition(tabletType topodatapb.TabletType, terTimestamp time.Time, state servingState, reason string) bool { 233 sm.transitioning.Acquire() 234 sm.mu.Lock() 235 defer sm.mu.Unlock() 236 237 sm.wantTabletType = tabletType 238 sm.wantState = state 239 sm.terTimestamp = terTimestamp 240 sm.reason = reason 241 if sm.target.TabletType == tabletType && sm.state == state { 242 sm.transitioning.Release() 243 return false 244 } 245 return true 246 } 247 248 func (sm *stateManager) execTransition(tabletType topodatapb.TabletType, state servingState) error { 249 defer sm.transitioning.Release() 250 251 var err error 252 switch state { 253 case StateServing: 254 if tabletType == topodatapb.TabletType_PRIMARY { 255 err = sm.servePrimary() 256 } else { 257 err = sm.serveNonPrimary(tabletType) 258 } 259 case StateNotServing: 260 if tabletType == topodatapb.TabletType_PRIMARY { 261 err = sm.unservePrimary() 262 } else { 263 err = sm.unserveNonPrimary(tabletType) 264 } 265 case StateNotConnected: 266 sm.closeAll() 267 } 268 sm.mu.Lock() 269 sm.transitionErr = err 270 sm.mu.Unlock() 271 if err != nil { 272 sm.retryTransition(fmt.Sprintf("Error transitioning to the desired state: %v, %v, will keep retrying: %v", tabletType, state, err)) 273 } 274 return err 275 } 276 277 func (sm *stateManager) retryTransition(message string) { 278 sm.mu.Lock() 279 defer sm.mu.Unlock() 280 if sm.retrying { 281 return 282 } 283 sm.retrying = true 284 285 log.Error(message) 286 go func() { 287 for { 288 time.Sleep(transitionRetryInterval) 289 if sm.recheckState() { 290 return 291 } 292 } 293 }() 294 } 295 296 func (sm *stateManager) recheckState() bool { 297 sm.mu.Lock() 298 defer sm.mu.Unlock() 299 300 if sm.wantState == sm.state && sm.wantTabletType == sm.target.TabletType { 301 sm.retrying = false 302 return true 303 } 304 if !sm.transitioning.TryAcquire() { 305 return false 306 } 307 go sm.execTransition(sm.wantTabletType, sm.wantState) 308 return false 309 } 310 
311 // checkMySQL verifies that we can connect to mysql. 312 // If it fails, then we shutdown the service and initiate 313 // the retry loop. 314 func (sm *stateManager) checkMySQL() { 315 if !sm.checkMySQLThrottler.TryAcquire() { 316 return 317 } 318 log.Infof("CheckMySQL started") 319 sm.checkMySQLRunning.Set(true) 320 go func() { 321 defer func() { 322 time.Sleep(1 * time.Second) 323 sm.checkMySQLRunning.Set(false) 324 sm.checkMySQLThrottler.Release() 325 log.Infof("CheckMySQL finished") 326 }() 327 328 err := sm.qe.IsMySQLReachable() 329 if err == nil { 330 return 331 } 332 333 if !sm.transitioning.TryAcquire() { 334 // If we're already transitioning, don't interfere. 335 return 336 } 337 defer sm.transitioning.Release() 338 339 // This is required to prevent new queries from running in StartRequest 340 // unless they are part of a running transaction. 341 sm.setWantState(StateNotConnected) 342 sm.closeAll() 343 344 // Now that we reached the NotConnected state, we want to go back to the 345 // Serving state. The retry will only succeed once MySQL is reachable again 346 // Until then EnsureConnectionAndDB will error out. 347 sm.setWantState(StateServing) 348 sm.retryTransition(fmt.Sprintf("Cannot connect to MySQL, shutting down query service: %v", err)) 349 }() 350 } 351 352 func (sm *stateManager) setWantState(stateWanted servingState) { 353 sm.mu.Lock() 354 defer sm.mu.Unlock() 355 sm.wantState = stateWanted 356 } 357 358 // isCheckMySQLRunning returns 1 if CheckMySQL function is in progress 359 func (sm *stateManager) isCheckMySQLRunning() int64 { 360 if sm.checkMySQLRunning.Get() { 361 return 1 362 } 363 return 0 364 } 365 366 // StopService shuts down sm. If the shutdown doesn't complete 367 // within timeBombDuration, it crashes the process. 
368 func (sm *stateManager) StopService() { 369 defer close(sm.setTimeBomb()) 370 371 log.Info("Stopping TabletServer") 372 sm.SetServingType(sm.Target().TabletType, time.Time{}, StateNotConnected, "service stopped") 373 sm.hcticks.Stop() 374 sm.hs.Close() 375 } 376 377 // StartRequest validates the current state and target and registers 378 // the request (a waitgroup) as started. Every StartRequest must be 379 // ended with an EndRequest. 380 func (sm *stateManager) StartRequest(ctx context.Context, target *querypb.Target, allowOnShutdown bool) (err error) { 381 sm.mu.Lock() 382 defer sm.mu.Unlock() 383 384 if sm.state != StateServing || !sm.replHealthy { 385 // This specific error string needs to be returned for vtgate buffering to work. 386 return vterrors.New(vtrpcpb.Code_CLUSTER_EVENT, vterrors.NotServing) 387 } 388 389 shuttingDown := sm.wantState != StateServing 390 if shuttingDown && !allowOnShutdown { 391 // This specific error string needs to be returned for vtgate buffering to work. 392 return vterrors.New(vtrpcpb.Code_CLUSTER_EVENT, vterrors.ShuttingDown) 393 } 394 395 err = sm.verifyTargetLocked(ctx, target) 396 if err != nil { 397 return err 398 } 399 sm.requests.Add(1) 400 return nil 401 } 402 403 // EndRequest unregisters the current request (a waitgroup) as done. 404 func (sm *stateManager) EndRequest() { 405 sm.requests.Done() 406 } 407 408 // VerifyTarget allows requests to be executed even in non-serving state. 409 // Such requests will get terminated without wait on shutdown. 
410 func (sm *stateManager) VerifyTarget(ctx context.Context, target *querypb.Target) error { 411 sm.mu.Lock() 412 defer sm.mu.Unlock() 413 return sm.verifyTargetLocked(ctx, target) 414 } 415 416 func (sm *stateManager) verifyTargetLocked(ctx context.Context, target *querypb.Target) error { 417 if target != nil { 418 switch { 419 case target.Keyspace != sm.target.Keyspace: 420 return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid keyspace %v does not match expected %v", target.Keyspace, sm.target.Keyspace) 421 case target.Shard != sm.target.Shard: 422 return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "invalid shard %v does not match expected %v", target.Shard, sm.target.Shard) 423 case target.TabletType != sm.target.TabletType: 424 for _, otherType := range sm.alsoAllow { 425 if target.TabletType == otherType { 426 return nil 427 } 428 } 429 return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "%s: %v, want: %v or %v", vterrors.WrongTablet, target.TabletType, sm.target.TabletType, sm.alsoAllow) 430 } 431 } else { 432 if !tabletenv.IsLocalContext(ctx) { 433 return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "No target") 434 } 435 } 436 return nil 437 } 438 439 func (sm *stateManager) servePrimary() error { 440 sm.watcher.Close() 441 442 if err := sm.connect(topodatapb.TabletType_PRIMARY); err != nil { 443 return err 444 } 445 446 sm.rt.MakePrimary() 447 sm.tracker.Open() 448 // We instantly kill all stateful queries to allow for 449 // te to quickly transition into RW, but olap and stateless 450 // queries can continue serving. 
451 sm.statefulql.TerminateAll() 452 sm.te.AcceptReadWrite() 453 sm.messager.Open() 454 sm.throttler.Open() 455 sm.tableGC.Open() 456 sm.ddle.Open() 457 sm.setState(topodatapb.TabletType_PRIMARY, StateServing) 458 return nil 459 } 460 461 func (sm *stateManager) unservePrimary() error { 462 sm.unserveCommon() 463 464 sm.watcher.Close() 465 466 if err := sm.connect(topodatapb.TabletType_PRIMARY); err != nil { 467 return err 468 } 469 470 sm.rt.MakePrimary() 471 sm.setState(topodatapb.TabletType_PRIMARY, StateNotServing) 472 return nil 473 } 474 475 func (sm *stateManager) serveNonPrimary(wantTabletType topodatapb.TabletType) error { 476 // We are likely transitioning from primary. We have to honor 477 // the shutdown grace period. 478 cancel := sm.handleShutdownGracePeriod() 479 defer cancel() 480 481 sm.ddle.Close() 482 sm.tableGC.Close() 483 sm.messager.Close() 484 sm.tracker.Close() 485 sm.se.MakeNonPrimary() 486 487 if err := sm.connect(wantTabletType); err != nil { 488 return err 489 } 490 491 sm.te.AcceptReadOnly() 492 sm.rt.MakeNonPrimary() 493 sm.watcher.Open() 494 sm.throttler.Open() 495 sm.setState(wantTabletType, StateServing) 496 return nil 497 } 498 499 func (sm *stateManager) unserveNonPrimary(wantTabletType topodatapb.TabletType) error { 500 sm.unserveCommon() 501 502 sm.se.MakeNonPrimary() 503 504 if err := sm.connect(wantTabletType); err != nil { 505 return err 506 } 507 508 sm.rt.MakeNonPrimary() 509 sm.watcher.Open() 510 sm.setState(wantTabletType, StateNotServing) 511 return nil 512 } 513 514 func (sm *stateManager) connect(tabletType topodatapb.TabletType) error { 515 if err := sm.se.EnsureConnectionAndDB(tabletType); err != nil { 516 return err 517 } 518 if err := sm.se.Open(); err != nil { 519 return err 520 } 521 sm.vstreamer.Open() 522 if err := sm.qe.Open(); err != nil { 523 return err 524 } 525 return sm.txThrottler.Open() 526 } 527 528 func (sm *stateManager) unserveCommon() { 529 log.Infof("Started execution of unserveCommon") 530 cancel 
:= sm.handleShutdownGracePeriod() 531 log.Infof("Finished execution of handleShutdownGracePeriod") 532 defer cancel() 533 534 log.Infof("Started online ddl executor close") 535 sm.ddle.Close() 536 log.Infof("Finished online ddl executor close. Started table garbage collector close") 537 sm.tableGC.Close() 538 log.Infof("Finished table garbage collector close. Started lag throttler close") 539 sm.throttler.Close() 540 log.Infof("Finished lag throttler close. Started messager close") 541 sm.messager.Close() 542 log.Infof("Finished messager close. Started txEngine close") 543 sm.te.Close() 544 log.Infof("Finished txEngine close. Killing all OLAP queries") 545 sm.olapql.TerminateAll() 546 log.Info("Finished Killing all OLAP queries. Started tracker close") 547 sm.tracker.Close() 548 log.Infof("Finished tracker close. Started wait for requests") 549 sm.requests.Wait() 550 log.Infof("Finished wait for requests. Finished execution of unserveCommon") 551 } 552 553 func (sm *stateManager) handleShutdownGracePeriod() (cancel func()) { 554 if sm.shutdownGracePeriod == 0 { 555 return func() {} 556 } 557 ctx, cancel := context.WithCancel(context.TODO()) 558 go func() { 559 if err := timer.SleepContext(ctx, sm.shutdownGracePeriod); err != nil { 560 return 561 } 562 log.Infof("Grace Period %v exceeded. 
Killing all OLTP queries.", sm.shutdownGracePeriod) 563 sm.statelessql.TerminateAll() 564 log.Infof("Killed all stateful OLTP queries.") 565 sm.statefulql.TerminateAll() 566 log.Infof("Killed all OLTP queries.") 567 }() 568 return cancel 569 } 570 571 func (sm *stateManager) closeAll() { 572 defer close(sm.setTimeBomb()) 573 574 sm.unserveCommon() 575 sm.txThrottler.Close() 576 sm.qe.Close() 577 sm.watcher.Close() 578 sm.vstreamer.Close() 579 sm.rt.Close() 580 sm.se.Close() 581 sm.setState(topodatapb.TabletType_UNKNOWN, StateNotConnected) 582 } 583 584 func (sm *stateManager) setTimeBomb() chan struct{} { 585 done := make(chan struct{}) 586 go func() { 587 if sm.timebombDuration == 0 { 588 return 589 } 590 tmr := time.NewTimer(sm.timebombDuration) 591 defer tmr.Stop() 592 select { 593 case <-tmr.C: 594 log.Fatal("Shutdown took too long. Crashing") 595 case <-done: 596 } 597 }() 598 return done 599 } 600 601 // setState changes the state and logs the event. 602 func (sm *stateManager) setState(tabletType topodatapb.TabletType, state servingState) { 603 defer func() { 604 log.Infof("Tablet Init took %d ms", time.Since(servenv.GetInitStartTime()).Milliseconds()) 605 }() 606 sm.mu.Lock() 607 defer sm.mu.Unlock() 608 if tabletType == topodatapb.TabletType_UNKNOWN { 609 tabletType = sm.wantTabletType 610 } 611 log.Infof("TabletServer transition: %v -> %v for tablet %s:%s/%s", 612 sm.stateStringLocked(sm.target.TabletType, sm.state), sm.stateStringLocked(tabletType, state), 613 sm.target.Cell, sm.target.Keyspace, sm.target.Shard) 614 sm.handleGracePeriod(tabletType) 615 sm.target.TabletType = tabletType 616 if sm.state == StateNotConnected { 617 // If we're transitioning out of StateNotConnected, we have 618 // to also ensure replication status is healthy. 619 _, _ = sm.refreshReplHealthLocked() 620 } 621 sm.state = state 622 // Broadcast also obtains a lock. Trigger in a goroutine to avoid a deadlock. 
623 go sm.hcticks.Trigger() 624 } 625 626 func (sm *stateManager) stateStringLocked(tabletType topodatapb.TabletType, state servingState) string { 627 if tabletType != topodatapb.TabletType_PRIMARY { 628 return fmt.Sprintf("%v: %v", tabletType, state) 629 } 630 return fmt.Sprintf("%v: %v, %v", tabletType, state, sm.terTimestamp.Local().Format("Jan 2, 2006 at 15:04:05 (MST)")) 631 } 632 633 func (sm *stateManager) handleGracePeriod(tabletType topodatapb.TabletType) { 634 if tabletType != topodatapb.TabletType_PRIMARY { 635 // We allow serving of previous type only for a primary transition. 636 sm.alsoAllow = nil 637 return 638 } 639 640 if tabletType == topodatapb.TabletType_PRIMARY && 641 sm.target.TabletType != topodatapb.TabletType_PRIMARY && 642 sm.transitionGracePeriod != 0 { 643 644 sm.alsoAllow = []topodatapb.TabletType{sm.target.TabletType} 645 // This is not a perfect solution because multiple back and forth 646 // transitions will launch multiple of these goroutines. But the 647 // system will eventually converge. 648 go func() { 649 time.Sleep(sm.transitionGracePeriod) 650 651 sm.mu.Lock() 652 defer sm.mu.Unlock() 653 sm.alsoAllow = nil 654 }() 655 } 656 } 657 658 // Broadcast fetches the replication status and broadcasts 659 // the state to all subscribed. 
660 func (sm *stateManager) Broadcast() { 661 sm.mu.Lock() 662 defer sm.mu.Unlock() 663 664 lag, err := sm.refreshReplHealthLocked() 665 sm.hs.ChangeState(sm.target.TabletType, sm.terTimestamp, lag, err, sm.isServingLocked()) 666 } 667 668 func (sm *stateManager) refreshReplHealthLocked() (time.Duration, error) { 669 if sm.target.TabletType == topodatapb.TabletType_PRIMARY { 670 sm.replHealthy = true 671 return 0, nil 672 } 673 lag, err := sm.rt.Status() 674 if err != nil { 675 if sm.replHealthy { 676 log.Infof("Going unhealthy due to replication error: %v", err) 677 } 678 sm.replHealthy = false 679 } else { 680 if lag > sm.unhealthyThreshold.Get() { 681 if sm.replHealthy { 682 log.Infof("Going unhealthy due to high replication lag: %v", lag) 683 } 684 sm.replHealthy = false 685 } else { 686 if !sm.replHealthy { 687 log.Infof("Replication is healthy") 688 } 689 sm.replHealthy = true 690 } 691 } 692 return lag, err 693 } 694 695 // EnterLameduck causes tabletserver to enter the lameduck state. This 696 // state causes health checks to fail, but the behavior of tabletserver 697 // otherwise remains the same. Any subsequent calls to SetServingType will 698 // cause the tabletserver to exit this mode. 699 func (sm *stateManager) EnterLameduck() { 700 log.Info("State: entering lameduck") 701 sm.mu.Lock() 702 defer sm.mu.Unlock() 703 sm.lameduck = true 704 } 705 706 // ExitLameduck causes the tabletserver to exit the lameduck mode. 707 func (sm *stateManager) ExitLameduck() { 708 sm.mu.Lock() 709 defer sm.mu.Unlock() 710 sm.lameduck = false 711 log.Info("State: exiting lameduck") 712 } 713 714 // IsServing returns true if TabletServer is in SERVING state. 
715 func (sm *stateManager) IsServing() bool { 716 sm.mu.Lock() 717 defer sm.mu.Unlock() 718 return sm.isServingLocked() 719 } 720 721 func (sm *stateManager) isServingLocked() bool { 722 return sm.state == StateServing && sm.wantState == StateServing && sm.replHealthy && !sm.lameduck 723 } 724 725 func (sm *stateManager) AppendDetails(details []*kv) []*kv { 726 sm.mu.Lock() 727 defer sm.mu.Unlock() 728 729 stateClass := func(state servingState) string { 730 switch state { 731 case StateServing: 732 return healthyClass 733 case StateNotServing: 734 return unhappyClass 735 } 736 return unhealthyClass 737 } 738 739 details = append(details, &kv{ 740 Key: "Current State", 741 Class: stateClass(sm.state), 742 Value: sm.stateStringLocked(sm.target.TabletType, sm.state), 743 }) 744 if sm.target.TabletType != sm.wantTabletType && sm.state != sm.wantState { 745 details = append(details, &kv{ 746 Key: "Desired State", 747 Class: stateClass(sm.wantState), 748 Value: sm.stateStringLocked(sm.wantTabletType, sm.wantState), 749 }) 750 } 751 if sm.reason != "" { 752 details = append(details, &kv{ 753 Key: "Reason", 754 Class: unhappyClass, 755 Value: sm.reason, 756 }) 757 } 758 if sm.transitionErr != nil { 759 details = append(details, &kv{ 760 Key: "Transition Error", 761 Class: unhealthyClass, 762 Value: sm.transitionErr.Error(), 763 }) 764 } 765 if sm.lameduck { 766 details = append(details, &kv{ 767 Key: "Lameduck", 768 Class: unhealthyClass, 769 Value: "ON", 770 }) 771 } 772 if len(sm.alsoAllow) != 0 { 773 details = append(details, &kv{ 774 Key: "Also Serving", 775 Class: healthyClass, 776 Value: sm.alsoAllow[0].String(), 777 }) 778 } 779 return details 780 } 781 782 func (sm *stateManager) State() servingState { 783 sm.mu.Lock() 784 defer sm.mu.Unlock() 785 // We should not change these state numbers without 786 // an announcement. Even though this is not perfect, 787 // this behavior keeps things backward compatible. 
788 if !sm.replHealthy { 789 return StateNotConnected 790 } 791 return sm.state 792 } 793 794 func (sm *stateManager) Target() *querypb.Target { 795 sm.mu.Lock() 796 defer sm.mu.Unlock() 797 return proto.Clone(sm.target).(*querypb.Target) 798 } 799 800 // IsServingString returns the name of the current TabletServer state. 801 func (sm *stateManager) IsServingString() string { 802 if sm.IsServing() { 803 return "SERVING" 804 } 805 return "NOT_SERVING" 806 } 807 808 func (sm *stateManager) SetUnhealthyThreshold(v time.Duration) { 809 sm.unhealthyThreshold.Set(v) 810 }