vitess.io/vitess@v0.16.2/go/vt/vtgr/controller/diagnose.go

/*
Copyright 2021 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controller

import (
	"context"
	"errors"
	"fmt"
	"math/rand"
	"os"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/spf13/pflag"

	"vitess.io/vitess/go/mysql"
	"vitess.io/vitess/go/vt/concurrency"
	"vitess.io/vitess/go/vt/servenv"
	"vitess.io/vitess/go/vt/topo"
	"vitess.io/vitess/go/vt/vterrors"
	"vitess.io/vitess/go/vt/vtgr/db"
)

var pingTabletTimeout = 2 * time.Second

func init() {
	servenv.OnParseFor("vtgr", func(fs *pflag.FlagSet) {
		fs.DurationVar(&pingTabletTimeout, "ping_tablet_timeout", 2*time.Second, "time to wait when we ping a tablet")
	})
}

// DiagnoseType is the type of a Diagnose result
type DiagnoseType string

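// instanceGTIDSet associates a GTID set with the instance that reported it.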
type instanceGTIDSet struct {
	gtids    mysql.GTIDSet
	instance *grInstance
}

// groupGTIDRecorder is used to help us query all the instances in parallel and record the results.
// It takes care of the consistency / synchronization among goroutines.
type groupGTIDRecorder struct {
	name              string
	gtidWithInstances []*instanceGTIDSet
	hasActive         bool
	sync.Mutex
}

const (
	// DiagnoseTypeError represents a DiagnoseTypeError status
	DiagnoseTypeError DiagnoseType = "error"
	// DiagnoseTypeHealthy represents that everything is healthy
	DiagnoseTypeHealthy = "Healthy"
	// DiagnoseTypeShardHasNoGroup represents that the cluster has not been initialized yet
	DiagnoseTypeShardHasNoGroup = "ShardHasNoGroup"
	// DiagnoseTypeShardHasInactiveGroup represents the status where we have a group name but no member in it
	DiagnoseTypeShardHasInactiveGroup = "ShardHasInactiveGroup"
	// DiagnoseTypeInsufficientGroupSize represents that the cluster has insufficient group members
	DiagnoseTypeInsufficientGroupSize = "InsufficientGroupSize"
	// DiagnoseTypeReadOnlyShard represents a cluster that has a read only node
	DiagnoseTypeReadOnlyShard = "ReadOnlyShard"
	// DiagnoseTypeUnreachablePrimary represents that the primary tablet is unreachable
	DiagnoseTypeUnreachablePrimary = "UnreachablePrimary"
	// DiagnoseTypeWrongPrimaryTablet represents that the primary tablet is incorrect based on the mysql group
	DiagnoseTypeWrongPrimaryTablet = "WrongPrimaryTablet"
	// DiagnoseTypeUnconnectedReplica represents a cluster with a primary tablet, but a node is not connected to it
	DiagnoseTypeUnconnectedReplica = "UnconnectedReplica"
	// DiagnoseTypeBackoffError represents a transient error, e.g., the primary is unreachable
	DiagnoseTypeBackoffError = "BackoffError"
	// DiagnoseTypeBootstrapBackoff represents an ongoing bootstrap
	DiagnoseTypeBootstrapBackoff = "BootstrapBackoff"

	// diagnoseTypeUnknown represents an unclear intermediate diagnose state
	diagnoseTypeUnknown = "Unknown"
)

// ScanAndRepairShard scans a particular shard by first diagnosing the shard with info from grShard
// and then repairing the problem if the shard is unhealthy
func (shard *GRShard) ScanAndRepairShard(ctx context.Context) {
	status, err := shard.Diagnose(ctx)
	if err != nil {
		shard.logger.Errorf("fail to scanAndRepairShard %v/%v because of Diagnose error: %v", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard, err)
		return
	}
	// We are able to get Diagnose without error
	//
	// Note: all the recovery functions should first try to grab a shard level lock
	// and check the trigger conditions before doing anything. This is to avoid
	// other VTGR instances trying to do the same thing
	shard.logger.Infof("%v status is %v", formatKeyspaceShard(shard.KeyspaceShard), status)
	if _, err := shard.Repair(ctx, status); err != nil {
		shard.logger.Errorf("failed to repair %v: %v", status, err)
	}
}

// Diagnose the shard in the following order:
// TODO: use FSM to make sure the status transition is correct
// 1. if the shard has a group that every node agreed on
// 2. if the group has any active (online / recovering) member
// 3. if the shard has initialized a Vitess primary
// 4. if primary tablet is reachable
// 5. if Vitess primary and mysql primary reconciled
// 6. if we have enough group members
// 7. if the primary node has read_only=OFF
// 8. if there is a node that is not in Mysql group
func (shard *GRShard) Diagnose(ctx context.Context) (DiagnoseType, error) {
	shard.Lock()
	defer shard.Unlock()
	diagnoseResult, err := shard.diagnoseLocked(ctx)
	shard.shardStatusCollector.recordDiagnoseResult(diagnoseResult)
	shard.populateVTGRStatusLocked()
	if diagnoseResult != DiagnoseTypeHealthy {
		shard.logger.Warningf(`VTGR diagnose shard as unhealthy for %s/%s: result=%v, last_result=%v, instances=%v, primary=%v, primary_tablet=%v, problematics=%v, unreachables=%v,\n%v`,
			shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard,
			shard.shardStatusCollector.status.DiagnoseResult,
			shard.lastDiagnoseResult,
			shard.shardStatusCollector.status.Instances,
			shard.shardStatusCollector.status.Primary,
			shard.primaryTabletAlias(),
			shard.shardStatusCollector.status.Problematics,
			shard.shardStatusCollector.status.Unreachables,
			shard.sqlGroup.ToString())
	}
	if diagnoseResult != shard.lastDiagnoseResult {
		shard.lastDiagnoseResult = diagnoseResult
		shard.lastDiagnoseSince = time.Now()
	}
	return diagnoseResult, err
}

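// diagnoseLocked implements the diagnosis sequence described on Diagnose.
// Callers must hold the shard lock.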
func (shard *GRShard) diagnoseLocked(ctx context.Context) (DiagnoseType, error) {
	// fast path only diagnoses a problem with the Vitess primary,
	// which is not needed if the shard is inactive
	if shard.localDbPort != 0 && shard.isActive.Get() {
		localView := shard.getLocalView()
		if localView != nil {
			fastDiagnose := shard.fastPathDiagnose(ctx, localView)
			if fastDiagnose != diagnoseTypeUnknown {
				// If we can use local sql group info to diagnose,
				// we should record the view as well. This view is all we need;
				// later VTGR needs to find the group name, primary etc. from
				// SQLGroup for repairing instead of getting nil
				shard.sqlGroup.overrideView([]*db.GroupView{localView})
				shard.logger.Infof("Diagnose %v from fast path", fastDiagnose)
				return fastDiagnose, nil
			}
		}
	}
	// fast path is disabled or cannot diagnose the shard
	// fall back to the normal strategy where we fetch info from all the nodes
	err := shard.refreshSQLGroup()
	if err != nil {
		if errors.Is(err, db.ErrGroupBackoffError) {
			return DiagnoseTypeBackoffError, nil
		}
		if errors.Is(err, db.ErrGroupOngoingBootstrap) {
			return DiagnoseTypeBootstrapBackoff, nil
		}
		return DiagnoseTypeError, vterrors.Wrap(err, "fail to refreshSQLGroup")
	}
	// First, we check if there is any group in the shard
	// if not, we should bootstrap one
	mysqlGroup := shard.shardAgreedGroupName()
	if mysqlGroup == "" {
		if len(shard.sqlGroup.views) != shard.sqlGroup.expectedBootstrapSize {
			return DiagnoseTypeError, fmt.Errorf("fail to diagnose ShardHasNoGroup with %v nodes", len(shard.sqlGroup.views))
		}
		return DiagnoseTypeShardHasNoGroup, nil
	}
	// We handle the case where the shard has an agreed group name but all nodes are offline
	// In this situation, instead of bootstrapping a group, we should re-build the
	// old group for the shard
	if shard.isAllOfflineOrError() {
		shard.logger.Info("Found all members are OFFLINE or ERROR")
		// On rebootstrap, we always want to make sure _all_ the nodes in topo are reachable
		// unless we override the rebootstrap size
		desiredRebootstrapSize := len(shard.instances)
		if shard.sqlGroup.rebootstrapSize != 0 {
			desiredRebootstrapSize = shard.sqlGroup.rebootstrapSize
		}
		if len(shard.sqlGroup.views) != desiredRebootstrapSize {
			return DiagnoseTypeError, fmt.Errorf("fail to diagnose ShardHasInactiveGroup with %v nodes expecting %v", len(shard.sqlGroup.views), desiredRebootstrapSize)
		}
		return DiagnoseTypeShardHasInactiveGroup, nil
	}

	// We only check the Vitess primary iff the shard is active.
	// Otherwise VTGR will only make sure there is a mysql group in the shard.
	if shard.isActive.Get() {
		// Secondly, we check if there is a primary tablet.
		// If there is a group but we cannot find a primary tablet
		// we should set it based on the mysql group
		hasWrongPrimary, err := shard.hasWrongPrimaryTablet(ctx)
		if err != nil {
			// errMissingGroup means we cannot find a mysql group for the shard
			// we are in DiagnoseTypeShardHasNoGroup state
			if err == errMissingGroup {
				shard.logger.Warning("Missing mysql group")
				return DiagnoseTypeShardHasNoGroup, nil
			}
			// errMissingPrimaryTablet means we cannot find a tablet based on the mysql primary,
			// which means the tablet disconnected from the topo server and we cannot find it
			if err == errMissingPrimaryTablet {
				return DiagnoseTypeUnreachablePrimary, nil
			}
			return DiagnoseTypeError, vterrors.Wrap(err, "fail to diagnose shardNeedsInitialized")
		}
		if hasWrongPrimary {
			return DiagnoseTypeWrongPrimaryTablet, nil
		}

		// Thirdly, we check if the primary tablet is reachable
		isPrimaryReachable, err := shard.isPrimaryReachable(ctx)
		if err != nil {
			return DiagnoseTypeError, vterrors.Wrap(err, "fail to diagnose isPrimaryReachable")
		}
		if !isPrimaryReachable {
			return DiagnoseTypeUnreachablePrimary, nil
		}
	}

	// At this point, the primary tablet should be consistent with the mysql primary,
	// so the view from the primary tablet should be accurate
	onlineMembers, isReadOnly := shard.getOnlineGroupInfo()
	// If we found a writable shard in the inactive shard
	// we should consider the shard as InsufficientGroupSize to set read only
	if !isReadOnly && !shard.isActive.Get() {
		return DiagnoseTypeInsufficientGroupSize, nil
	}
	// Then we check if we satisfy the minimum replica requirement
	if shard.minNumReplicas > 0 {
		if onlineMembers >= shard.minNumReplicas && isReadOnly && shard.isActive.Get() {
			return DiagnoseTypeReadOnlyShard, nil
		}
		// If we disable readonly protection and still find we have a read only shard,
		// we should return DiagnoseTypeReadOnlyShard so that VTGR can turn off read only
		if shard.disableReadOnlyProtection && isReadOnly && shard.isActive.Get() {
			return DiagnoseTypeReadOnlyShard, nil
		}
		// We don't check isActive here since if it is inactive, VTGR should already return InsufficientGroupSize
		if !shard.disableReadOnlyProtection && onlineMembers < shard.minNumReplicas && !isReadOnly {
			return DiagnoseTypeInsufficientGroupSize, nil
		}
	}

	// Lastly, we check if there is a replica that is not connected to the primary node
	disconnectedInstance, err := shard.disconnectedInstance()
	if err != nil {
		return DiagnoseTypeError, vterrors.Wrap(err, "fail to diagnose disconnectedInstance")
	}
	if disconnectedInstance != nil {
		return DiagnoseTypeUnconnectedReplica, nil
	}

	// If we get here, shard is DiagnoseTypeHealthy
	return DiagnoseTypeHealthy, nil
}

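// getLocalView fetches the group view from the mysqld running on the local host.
// It returns nil if no tablet in the shard matches the local hostname and db port.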
func (shard *GRShard) getLocalView() *db.GroupView {
	localHostname, _ := os.Hostname()
	localInst := shard.findTabletByHostAndPort(localHostname, shard.localDbPort)
	if localInst == nil {
		return nil
	}
	// TODO: consider using -db_socket to read local info
	view, err := shard.dbAgent.FetchGroupView(localInst.alias, localInst.instanceKey)
	// We still have the fallback logic if this failed, therefore we don't raise an error
	// but try to get the local view with best effort
	if err != nil {
		shard.logger.Errorf("failed to fetch local group view: %v", err)
	}
	return view
}

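// fastPathDiagnose diagnoses the shard using only the locally fetched group view
// and the primary tablet from topo. It returns diagnoseTypeUnknown when the local
// information is not sufficient to reach a conclusion.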
func (shard *GRShard) fastPathDiagnose(ctx context.Context, view *db.GroupView) DiagnoseType {
	pHost, pPort, isOnline := view.GetPrimaryView()
	primaryTablet := shard.findShardPrimaryTablet()
	if !isOnline || pHost == "" || pPort == 0 || primaryTablet == nil {
		return diagnoseTypeUnknown
	}
	// VTGR will only bootstrap a group when it observes the same number of views as group_size.
	// It means if we can find an ONLINE primary, we should be able to trust the view reported locally;
	// together with the primary tablet from the topo server, we can determine:
	// - if we need to failover vitess
	// - if we need to failover mysql
	if primaryTablet.instanceKey.Hostname != pHost || primaryTablet.instanceKey.Port != pPort {
		// we find a mismatch but if the reported mysql primary is not in
		// topology we should consider it as unreachable.
		if shard.findTabletByHostAndPort(pHost, pPort) == nil {
			return DiagnoseTypeUnreachablePrimary
		}
		return DiagnoseTypeWrongPrimaryTablet
	}
	if !shard.instanceReachable(ctx, primaryTablet) {
		return DiagnoseTypeUnreachablePrimary
	}
	return diagnoseTypeUnknown
}

func (shard *GRShard) shardAgreedGroupName() string {
	if len(shard.instances) == 0 {
		return ""
	}
	return shard.sqlGroup.GetGroupName()
}

func (shard *GRShard) isAllOfflineOrError() bool {
	return shard.sqlGroup.IsAllOfflineOrError()
}

func (shard *GRShard) getOnlineGroupInfo() (int, bool) {
	return shard.sqlGroup.GetOnlineGroupInfo()
}

func (shard *GRShard) hasWrongPrimaryTablet(ctx context.Context) (bool, error) {
	// Find out the hostname and port of the primary in the mysql group.
	// We try to use the local instance and then fall back to a random instance to check mysqld
	// in case the primary is unreachable
	host, port, _ := shard.sqlGroup.GetPrimary()
	if !isHostPortValid(host, port) {
		shard.logger.Warningf("Invalid address for primary %v:%v", host, port)
		return false, errMissingGroup
	}
	// Make sure we have a tablet available.
	// findTabletByHostAndPort returns nil when we cannot find a tablet
	// that is running on host:port, which means the tablet got stuck
	// or the tablet is not reachable.
	// We return errMissingPrimaryTablet so that VTGR will trigger a failover
	tablet := shard.findTabletByHostAndPort(host, port)
	if tablet == nil || !shard.instanceReachable(ctx, tablet) {
		shard.logger.Errorf("Failed to find tablet that is running with mysql on %v:%v", host, port)
		return false, errMissingPrimaryTablet
	}
	// Now we know we have a valid mysql primary in the group
	// we should make sure tablets are aligned with it
	primary := shard.findShardPrimaryTablet()
	// If we failed to find the primary for the shard, it mostly means we are initializing the shard.
	// Return true directly so that VTGR will set the primary tablet according to the MySQL group
	if primary == nil {
		shard.logger.Infof("unable to find primary tablet for %v", formatKeyspaceShard(shard.KeyspaceShard))
		return true, nil
	}
	return (host != primary.instanceKey.Hostname) || (port != primary.instanceKey.Port), nil
}

func (shard *GRShard) isPrimaryReachable(ctx context.Context) (bool, error) {
	primaryTablet := shard.findShardPrimaryTablet()
	if primaryTablet == nil {
		return false, fmt.Errorf("unable to find primary for %v", formatKeyspaceShard(shard.KeyspaceShard))
	}
	return shard.instanceReachable(ctx, primaryTablet), nil
}

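// instanceReachable pings the tablet through the tablet manager client with
// pingTabletTimeout and reports whether it responded in time.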
func (shard *GRShard) instanceReachable(ctx context.Context, instance *grInstance) bool {
	pingCtx, cancel := context.WithTimeout(context.Background(), pingTabletTimeout)
	defer cancel()
	c := make(chan error, 1)
	// tmc.Ping creates the grpc client connection first without a timeout via dial,
	// then calls the grpc endpoint using the context with timeout.
	// This is problematic if the host is really unreachable: we have to wait for
	// all the retries inside grpc.dial with exponential backoff
	go func() { c <- shard.tmc.Ping(pingCtx, instance.tablet) }()
	select {
	case <-pingCtx.Done():
		shard.logger.Errorf("Ping abort timeout %v", pingTabletTimeout)
		return false
	case err := <-c:
		if err != nil {
			shard.logger.Errorf("Ping error host=%v: %v", instance.instanceKey.Hostname, err)
		}
		return err == nil
	}
}

// findShardPrimaryTablet returns the primary for the shard
// it is either based on shard info from global topo or based on tablet types
// from local topo
func (shard *GRShard) findShardPrimaryTablet() *grInstance {
	var primaryInstance *grInstance
	for _, instance := range shard.instances {
		if shard.primaryAlias == instance.alias {
			return instance
		}
	}
	return primaryInstance
}

func (shard *GRShard) primaryTabletAlias() string {
	primary := shard.findShardPrimaryTablet()
	if primary == nil {
		return "UNKNOWN"
	}
	return primary.alias
}

// disconnectedInstance iterates over all known replica records
// and checks mysql to see if group replication is set up on each of them
func (shard *GRShard) disconnectedInstance() (*grInstance, error) {
	primaryInstance := shard.findShardPrimaryTablet()
	// if there is no primary, we should recover from DiagnoseTypeWrongPrimaryTablet
	if primaryInstance == nil {
		return nil, fmt.Errorf("%v does not have primary", formatKeyspaceShard(shard.KeyspaceShard))
	}
	// Up to this check, we know:
	// - shard has an agreed group
	// - shard has a primary tablet
	// - shard primary tablet is running on the same node as mysql
	rand.Shuffle(len(shard.instances), func(i, j int) {
		shard.instances[i], shard.instances[j] = shard.instances[j], shard.instances[i]
	})
	for _, instance := range shard.instances {
		// Skip instances without a hostname because they are not up and running;
		// also skip instances that raised unrecoverable errors
		if shard.shardStatusCollector.isUnreachable(instance) {
			shard.logger.Infof("Skip %v to check disconnectedInstance because it is unhealthy", instance.alias)
			continue
		}
		isUnconnected := shard.sqlGroup.IsUnconnectedReplica(instance.instanceKey)
		if isUnconnected {
			return instance, nil
		}
	}
	return nil, nil
}

func (recorder *groupGTIDRecorder) recordGroupStatus(name string, isActive bool) error {
	recorder.Lock()
	defer recorder.Unlock()
	if recorder.name != "" && recorder.name != name {
		return fmt.Errorf("group has more than one group name")
	}
	recorder.name = name
	// hasActive records true if any node finds an active member
	if isActive {
		recorder.hasActive = true
	}
	return nil
}

func (recorder *groupGTIDRecorder) recordGroupGTIDs(gtids mysql.GTIDSet, instance *grInstance) {
	recorder.Lock()
	defer recorder.Unlock()
	recorder.gtidWithInstances = append(recorder.gtidWithInstances, &instanceGTIDSet{gtids: gtids, instance: instance})
}

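// sort orders the recorded GTID sets by instance alias so that iteration order
// is deterministic.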
func (recorder *groupGTIDRecorder) sort() {
	sort.SliceStable(recorder.gtidWithInstances, func(i, j int) bool {
		return recorder.gtidWithInstances[i].instance.alias < recorder.gtidWithInstances[j].instance.alias
	})
}

func (collector *shardStatusCollector) recordDiagnoseResult(result DiagnoseType) {
	collector.Lock()
	defer collector.Unlock()
	collector.status.DiagnoseResult = result
}

func (collector *shardStatusCollector) recordUnreachables(instance *grInstance) {
	collector.Lock()
	defer collector.Unlock()
	// dedup
	// the list size is at most the number of instances in a shard, so iterating to dedup is not terrible
	for _, alias := range collector.status.Unreachables {
		if alias == instance.alias {
			return
		}
	}
	collector.status.Unreachables = append(collector.status.Unreachables, instance.alias)
}

func (collector *shardStatusCollector) clear() {
	collector.Lock()
	defer collector.Unlock()
	collector.status.Unreachables = nil
	collector.status.Problematics = nil
}

func (collector *shardStatusCollector) recordProblematics(instance *grInstance) {
	collector.Lock()
	defer collector.Unlock()
	// dedup
	// the list size is at most the number of instances in a shard, so iterating to dedup is not terrible
	for _, alias := range collector.status.Problematics {
		if alias == instance.alias {
			return
		}
	}
	collector.status.Problematics = append(collector.status.Problematics, instance.alias)
}

func formatKeyspaceShard(keyspaceShard *topo.KeyspaceShard) string {
	return fmt.Sprintf("%v/%v", keyspaceShard.Keyspace, keyspaceShard.Shard)
}

func isHostPortValid(host string, port int) bool {
	return host != "" && port != 0
}

// We use forAllInstances in two cases:
// 1. FetchGroupView GTIDs to find a candidate for failover.
// If a node is not healthy it should not be considered as a failover candidate
//
// 2. FetchGroupView group member status to see if we need to bootstrap a group,
// either for the first time or to rebuild a group after all the nodes have died.
//
// The caller is responsible for deciding whether to tolerate errors from the forAllInstances call
func (shard *GRShard) forAllInstances(task func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder)) *concurrency.AllErrorRecorder {
	errorRecord := concurrency.AllErrorRecorder{}
	shard.shardStatusCollector.clear()
	var wg sync.WaitGroup
	for _, instance := range shard.instances {
		wg.Add(1)
		go task(instance, &wg, &errorRecord)
	}
	wg.Wait()
	if len(errorRecord.Errors) > 0 {
		shard.logger.Errorf("get errors in forAllInstances call: %v", errorRecord.Error())
	}
	return &errorRecord
}

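// unreachableError reports whether err looks like a connectivity problem
// (unreachable host, refused connection or missing mysql instance key)
// rather than a MySQL-level error.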
548 "invalid mysql instance key", 549 } 550 for _, k := range contains { 551 if strings.Contains(err.Error(), k) { 552 return true 553 } 554 } 555 return false 556 } 557 558 // refreshSQLGroup hits all instances and renders a SQL group locally for later diagnoses 559 // the SQL group contains a list of "views" for the group from all the available nodes 560 func (shard *GRShard) refreshSQLGroup() error { 561 // reset views in sql group 562 shard.sqlGroup.clear() 563 er := shard.forAllInstances(func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder) { 564 defer wg.Done() 565 view, err := shard.dbAgent.FetchGroupView(instance.alias, instance.instanceKey) 566 // We just log error here because we rely on mysql tells us if it is happy or not 567 // If the node is unreachable 568 if err != nil { 569 er.RecordError(err) 570 shard.shardStatusCollector.recordProblematics(instance) 571 if unreachableError(err) { 572 shard.shardStatusCollector.recordUnreachables(instance) 573 } 574 shard.logger.Errorf("%v get error while fetch group info: %v", instance.alias, err) 575 return 576 } 577 shard.sqlGroup.recordView(view) 578 }) 579 // Only raise error if we failed to get any data from mysql 580 // otherwise, we will use what we get from mysql directly 581 if len(er.Errors) == len(shard.instances) { 582 shard.logger.Errorf("fail to fetch any data for mysql") 583 return db.ErrGroupBackoffError 584 } 585 return shard.sqlGroup.Resolve() 586 }