vitess.io/vitess@v0.16.2/go/vt/vtorc/logic/orchestrator.go (about) 1 /* 2 Copyright 2014 Outbrain Inc. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package logic 18 19 import ( 20 "os" 21 "os/signal" 22 "sync" 23 "sync/atomic" 24 "syscall" 25 "time" 26 27 "github.com/patrickmn/go-cache" 28 "github.com/rcrowley/go-metrics" 29 "github.com/sjmudd/stopwatch" 30 31 "vitess.io/vitess/go/vt/log" 32 "vitess.io/vitess/go/vt/servenv" 33 "vitess.io/vitess/go/vt/vtorc/collection" 34 "vitess.io/vitess/go/vt/vtorc/config" 35 "vitess.io/vitess/go/vt/vtorc/discovery" 36 "vitess.io/vitess/go/vt/vtorc/inst" 37 ometrics "vitess.io/vitess/go/vt/vtorc/metrics" 38 "vitess.io/vitess/go/vt/vtorc/process" 39 "vitess.io/vitess/go/vt/vtorc/util" 40 ) 41 42 const ( 43 discoveryMetricsName = "DISCOVERY_METRICS" 44 ) 45 46 // discoveryQueue is a channel of deduplicated instanceKey-s 47 // that were requested for discovery. It can be continuously updated 48 // as discovery process progresses. 49 var discoveryQueue *discovery.Queue 50 var snapshotDiscoveryKeys chan inst.InstanceKey 51 var snapshotDiscoveryKeysMutex sync.Mutex 52 var hasReceivedSIGTERM int32 53 54 var discoveriesCounter = metrics.NewCounter() 55 var failedDiscoveriesCounter = metrics.NewCounter() 56 var instancePollSecondsExceededCounter = metrics.NewCounter() 57 var discoveryQueueLengthGauge = metrics.NewGauge() 58 var discoveryRecentCountGauge = metrics.NewGauge() 59 var isElectedGauge = metrics.NewGauge() 60 var isHealthyGauge = metrics.NewGauge() 61 var discoveryMetrics = collection.CreateOrReturnCollection(discoveryMetricsName) 62 63 var isElectedNode int64 64 65 var recentDiscoveryOperationKeys *cache.Cache 66 67 func init() { 68 snapshotDiscoveryKeys = make(chan inst.InstanceKey, 10) 69 70 _ = metrics.Register("discoveries.attempt", discoveriesCounter) 71 _ = metrics.Register("discoveries.fail", failedDiscoveriesCounter) 72 _ = metrics.Register("discoveries.instance_poll_seconds_exceeded", instancePollSecondsExceededCounter) 73 _ = metrics.Register("discoveries.queue_length", discoveryQueueLengthGauge) 74 _ = metrics.Register("discoveries.recent_count", discoveryRecentCountGauge) 75 _ = metrics.Register("elect.is_elected", isElectedGauge) 76 _ = metrics.Register("health.is_healthy", isHealthyGauge) 77 78 ometrics.OnMetricsTick(func() { 79 discoveryQueueLengthGauge.Update(int64(discoveryQueue.QueueLen())) 80 }) 81 ometrics.OnMetricsTick(func() { 82 if recentDiscoveryOperationKeys == nil { 83 return 84 } 85 discoveryRecentCountGauge.Update(int64(recentDiscoveryOperationKeys.ItemCount())) 86 }) 87 ometrics.OnMetricsTick(func() { 88 isElectedGauge.Update(atomic.LoadInt64(&isElectedNode)) 89 }) 90 ometrics.OnMetricsTick(func() { 91 isHealthyGauge.Update(atomic.LoadInt64(&process.LastContinousCheckHealthy)) 92 }) 93 } 94 95 func IsLeader() bool { 96 return atomic.LoadInt64(&isElectedNode) == 1 97 } 98 99 func IsLeaderOrActive() bool { 100 return atomic.LoadInt64(&isElectedNode) == 1 101 } 102 103 // used in several places 104 func instancePollSecondsDuration() time.Duration { 105 return time.Duration(config.Config.InstancePollSeconds) * time.Second 106 } 107 108 // acceptSighupSignal registers for SIGHUP signal from the OS to reload the configuration files. 109 func acceptSighupSignal() { 110 c := make(chan os.Signal, 1) 111 112 signal.Notify(c, syscall.SIGHUP) 113 go func() { 114 for range c { 115 log.Infof("Received SIGHUP. Reloading configuration") 116 _ = inst.AuditOperation("reload-configuration", nil, "Triggered via SIGHUP") 117 config.Reload() 118 discoveryMetrics.SetExpirePeriod(time.Duration(config.DiscoveryCollectionRetentionSeconds) * time.Second) 119 } 120 }() 121 } 122 123 // closeVTOrc runs all the operations required to cleanly shutdown VTOrc 124 func closeVTOrc() { 125 log.Infof("Starting VTOrc shutdown") 126 atomic.StoreInt32(&hasReceivedSIGTERM, 1) 127 discoveryMetrics.StopAutoExpiration() 128 // Poke other go routines to stop cleanly here ... 129 _ = inst.AuditOperation("shutdown", nil, "Triggered via SIGTERM") 130 // wait for the locks to be released 131 waitForLocksRelease() 132 log.Infof("VTOrc closed") 133 } 134 135 // waitForLocksRelease is used to wait for release of locks 136 func waitForLocksRelease() { 137 timeout := time.After(shutdownWaitTime) 138 for { 139 count := atomic.LoadInt32(&shardsLockCounter) 140 if count == 0 { 141 break 142 } 143 select { 144 case <-timeout: 145 log.Infof("wait for lock release timed out. Some locks might not have been released.") 146 default: 147 time.Sleep(50 * time.Millisecond) 148 continue 149 } 150 break 151 } 152 } 153 154 // handleDiscoveryRequests iterates the discoveryQueue channel and calls upon 155 // instance discovery per entry. 156 func handleDiscoveryRequests() { 157 discoveryQueue = discovery.CreateOrReturnQueue("DEFAULT") 158 159 // create a pool of discovery workers 160 for i := uint(0); i < config.DiscoveryMaxConcurrency; i++ { 161 go func() { 162 for { 163 instanceKey := discoveryQueue.Consume() 164 // Possibly this used to be the elected node, but has 165 // been demoted, while still the queue is full. 166 if !IsLeaderOrActive() { 167 log.Infof("Node apparently demoted. Skipping discovery of %+v. "+ 168 "Remaining queue size: %+v", instanceKey, discoveryQueue.QueueLen()) 169 discoveryQueue.Release(instanceKey) 170 continue 171 } 172 173 DiscoverInstance(instanceKey, false /* forceDiscovery */) 174 discoveryQueue.Release(instanceKey) 175 } 176 }() 177 } 178 } 179 180 // DiscoverInstance will attempt to discover (poll) an instance (unless 181 // it is already up to date) and will also ensure that its primary and 182 // replicas (if any) are also checked. 183 func DiscoverInstance(instanceKey inst.InstanceKey, forceDiscovery bool) { 184 if inst.InstanceIsForgotten(&instanceKey) { 185 log.Infof("discoverInstance: skipping discovery of %+v because it is set to be forgotten", instanceKey) 186 return 187 } 188 189 // create stopwatch entries 190 latency := stopwatch.NewNamedStopwatch() 191 _ = latency.AddMany([]string{ 192 "backend", 193 "instance", 194 "total"}) 195 latency.Start("total") // start the total stopwatch (not changed anywhere else) 196 197 defer func() { 198 latency.Stop("total") 199 discoveryTime := latency.Elapsed("total") 200 if discoveryTime > instancePollSecondsDuration() { 201 instancePollSecondsExceededCounter.Inc(1) 202 log.Warningf("discoverInstance exceeded InstancePollSeconds for %+v, took %.4fs", instanceKey, discoveryTime.Seconds()) 203 } 204 }() 205 206 _, _ = instanceKey.ResolveHostname() 207 if !instanceKey.IsValid() { 208 return 209 } 210 211 // Calculate the expiry period each time as InstancePollSeconds 212 // _may_ change during the run of the process (via SIGHUP) and 213 // it is not possible to change the cache's default expiry.. 214 if existsInCacheError := recentDiscoveryOperationKeys.Add(instanceKey.DisplayString(), true, instancePollSecondsDuration()); existsInCacheError != nil && !forceDiscovery { 215 // Just recently attempted 216 return 217 } 218 219 latency.Start("backend") 220 instance, found, _ := inst.ReadInstance(&instanceKey) 221 latency.Stop("backend") 222 if !forceDiscovery && found && instance.IsUpToDate && instance.IsLastCheckValid { 223 // we've already discovered this one. Skip! 224 return 225 } 226 227 discoveriesCounter.Inc(1) 228 229 // First we've ever heard of this instance. Continue investigation: 230 instance, err := inst.ReadTopologyInstanceBufferable(&instanceKey, latency) 231 // panic can occur (IO stuff). Therefore it may happen 232 // that instance is nil. Check it, but first get the timing metrics. 233 totalLatency := latency.Elapsed("total") 234 backendLatency := latency.Elapsed("backend") 235 instanceLatency := latency.Elapsed("instance") 236 237 if forceDiscovery { 238 log.Infof("Force discovered - %+v, err - %v", instance, err) 239 } 240 241 if instance == nil { 242 failedDiscoveriesCounter.Inc(1) 243 _ = discoveryMetrics.Append(&discovery.Metric{ 244 Timestamp: time.Now(), 245 InstanceKey: instanceKey, 246 TotalLatency: totalLatency, 247 BackendLatency: backendLatency, 248 InstanceLatency: instanceLatency, 249 Err: err, 250 }) 251 if util.ClearToLog("discoverInstance", instanceKey.StringCode()) { 252 log.Warningf(" DiscoverInstance(%+v) instance is nil in %.3fs (Backend: %.3fs, Instance: %.3fs), error=%+v", 253 instanceKey, 254 totalLatency.Seconds(), 255 backendLatency.Seconds(), 256 instanceLatency.Seconds(), 257 err) 258 } 259 return 260 } 261 262 _ = discoveryMetrics.Append(&discovery.Metric{ 263 Timestamp: time.Now(), 264 InstanceKey: instanceKey, 265 TotalLatency: totalLatency, 266 BackendLatency: backendLatency, 267 InstanceLatency: instanceLatency, 268 Err: nil, 269 }) 270 } 271 272 // onHealthTick handles the actions to take to discover/poll instances 273 func onHealthTick() { 274 wasAlreadyElected := IsLeader() 275 276 { 277 myIsElectedNode, err := process.AttemptElection() 278 if err != nil { 279 log.Error(err) 280 } 281 if myIsElectedNode { 282 atomic.StoreInt64(&isElectedNode, 1) 283 } else { 284 atomic.StoreInt64(&isElectedNode, 0) 285 } 286 if !myIsElectedNode { 287 if electedNode, _, err := process.ElectedNode(); err == nil { 288 log.Infof("Not elected as active node; active node: %v; polling", electedNode.Hostname) 289 } else { 290 log.Infof("Not elected as active node; active node: Unable to determine: %v; polling", err) 291 } 292 } 293 } 294 if !IsLeaderOrActive() { 295 return 296 } 297 instanceKeys, err := inst.ReadOutdatedInstanceKeys() 298 if err != nil { 299 log.Error(err) 300 } 301 302 if !wasAlreadyElected { 303 // Just turned to be leader! 304 go func() { 305 _, _ = process.RegisterNode(process.ThisNodeHealth) 306 }() 307 go func() { 308 _ = inst.ExpireMaintenance() 309 }() 310 } 311 312 func() { 313 // Normally onHealthTick() shouldn't run concurrently. It is kicked by a ticker. 314 // However it _is_ invoked inside a goroutine. I like to be safe here. 315 snapshotDiscoveryKeysMutex.Lock() 316 defer snapshotDiscoveryKeysMutex.Unlock() 317 318 countSnapshotKeys := len(snapshotDiscoveryKeys) 319 for i := 0; i < countSnapshotKeys; i++ { 320 instanceKeys = append(instanceKeys, <-snapshotDiscoveryKeys) 321 } 322 }() 323 // avoid any logging unless there's something to be done 324 if len(instanceKeys) > 0 { 325 for _, instanceKey := range instanceKeys { 326 if instanceKey.IsValid() { 327 discoveryQueue.Push(instanceKey) 328 } 329 } 330 } 331 } 332 333 // ContinuousDiscovery starts an asynchronuous infinite discovery process where instances are 334 // periodically investigated and their status captured, and long since unseen instances are 335 // purged and forgotten. 336 // nolint SA1015: using time.Tick leaks the underlying ticker 337 func ContinuousDiscovery() { 338 log.Infof("continuous discovery: setting up") 339 continuousDiscoveryStartTime := time.Now() 340 checkAndRecoverWaitPeriod := 3 * instancePollSecondsDuration() 341 recentDiscoveryOperationKeys = cache.New(instancePollSecondsDuration(), time.Second) 342 343 _ = inst.LoadHostnameResolveCache() 344 go handleDiscoveryRequests() 345 346 healthTick := time.Tick(config.HealthPollSeconds * time.Second) 347 instancePollTick := time.Tick(instancePollSecondsDuration()) 348 caretakingTick := time.Tick(time.Minute) 349 recoveryTick := time.Tick(time.Duration(config.Config.RecoveryPollSeconds) * time.Second) 350 tabletTopoTick := OpenTabletDiscovery() 351 var recoveryEntrance int64 352 var snapshotTopologiesTick <-chan time.Time 353 if config.Config.SnapshotTopologiesIntervalHours > 0 { 354 snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour) 355 } 356 357 runCheckAndRecoverOperationsTimeRipe := func() bool { 358 return time.Since(continuousDiscoveryStartTime) >= checkAndRecoverWaitPeriod 359 } 360 361 go func() { 362 _ = ometrics.InitMetrics() 363 }() 364 go acceptSighupSignal() 365 // On termination of the server, we should close VTOrc cleanly 366 servenv.OnTermSync(closeVTOrc) 367 368 log.Infof("continuous discovery: starting") 369 for { 370 select { 371 case <-healthTick: 372 go func() { 373 onHealthTick() 374 }() 375 case <-instancePollTick: 376 go func() { 377 // This tick does NOT do instance poll (these are handled by the oversampling discoveryTick) 378 // But rather should invoke such routinely operations that need to be as (or roughly as) frequent 379 // as instance poll 380 if IsLeaderOrActive() { 381 go inst.ExpireDowntime() 382 } 383 }() 384 case <-caretakingTick: 385 // Various periodic internal maintenance tasks 386 go func() { 387 if IsLeaderOrActive() { 388 389 go inst.ForgetLongUnseenInstances() 390 go inst.ForgetUnseenInstancesDifferentlyResolved() 391 go inst.ForgetExpiredHostnameResolves() 392 go inst.DeleteInvalidHostnameResolves() 393 go inst.ResolveUnknownPrimaryHostnameResolves() 394 go inst.ExpireMaintenance() 395 go inst.ExpireCandidateInstances() 396 go inst.ExpireHostnameUnresolve() 397 go inst.ExpireAudit() 398 go inst.FlushNontrivialResolveCacheToDatabase() 399 go inst.ExpireStaleInstanceBinlogCoordinates() 400 go process.ExpireNodesHistory() 401 go process.ExpireAvailableNodes() 402 go ExpireFailureDetectionHistory() 403 go ExpireTopologyRecoveryHistory() 404 go ExpireTopologyRecoveryStepsHistory() 405 } else { 406 // Take this opportunity to refresh yourself 407 go inst.LoadHostnameResolveCache() 408 } 409 }() 410 case <-recoveryTick: 411 go func() { 412 if IsLeaderOrActive() { 413 go ClearActiveFailureDetections() 414 go ClearActiveRecoveries() 415 go ExpireBlockedRecoveries() 416 go AcknowledgeCrashedRecoveries() 417 go inst.ExpireInstanceAnalysisChangelog() 418 419 go func() { 420 // This function is non re-entrant (it can only be running once at any point in time) 421 if atomic.CompareAndSwapInt64(&recoveryEntrance, 0, 1) { 422 defer atomic.StoreInt64(&recoveryEntrance, 0) 423 } else { 424 return 425 } 426 if runCheckAndRecoverOperationsTimeRipe() { 427 CheckAndRecover(nil, nil, false) 428 } else { 429 log.Infof("Waiting for %+v seconds to pass before running failure detection/recovery", checkAndRecoverWaitPeriod.Seconds()) 430 } 431 }() 432 } 433 }() 434 case <-snapshotTopologiesTick: 435 go func() { 436 if IsLeaderOrActive() { 437 go inst.SnapshotTopologies() 438 } 439 }() 440 case <-tabletTopoTick: 441 go RefreshAllKeyspaces() 442 go refreshAllTablets() 443 } 444 } 445 }