vitess.io/vitess@v0.16.2/go/vt/vtgr/controller/refresh.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package controller 18 19 import ( 20 "fmt" 21 "strconv" 22 "sync" 23 "time" 24 25 "vitess.io/vitess/go/vt/topo/topoproto" 26 27 "golang.org/x/net/context" 28 29 "vitess.io/vitess/go/stats" 30 "vitess.io/vitess/go/sync2" 31 "vitess.io/vitess/go/vt/logutil" 32 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 33 "vitess.io/vitess/go/vt/topo" 34 "vitess.io/vitess/go/vt/vtgr/config" 35 "vitess.io/vitess/go/vt/vtgr/db" 36 "vitess.io/vitess/go/vt/vtgr/inst" 37 "vitess.io/vitess/go/vt/vtgr/log" 38 ) 39 40 var ( 41 lockShardTimingsMs = stats.NewMultiTimings("lockShard", "time vtgr takes to lock the shard", []string{"operation", "success"}) 42 ) 43 44 // grInstance represents an instance that's running MySQL GR 45 // it wraps a InstanceKey plus some tablet related information 46 type grInstance struct { 47 instanceKey *inst.InstanceKey 48 tablet *topodatapb.Tablet 49 primaryTimeStamp time.Time 50 alias string 51 } 52 53 // GRTopo is VTGR wrapper for topo server 54 type GRTopo interface { 55 GetShardNames(ctx context.Context, keyspace string) ([]string, error) 56 GetShard(ctx context.Context, keyspace, shard string) (*topo.ShardInfo, error) 57 GetTabletMapForShardByCell(ctx context.Context, keyspace, shard string, cells []string) (map[string]*topo.TabletInfo, error) 58 LockShard(ctx context.Context, keyspace, shard, action string) (context.Context, func(*error), error) 59 } 60 61 // GRTmcClient is VTGR wrapper for tmc client 62 type GRTmcClient interface { 63 ChangeType(ctx context.Context, tablet *topodatapb.Tablet, dbType topodatapb.TabletType, semiSync bool) error 64 Ping(ctx context.Context, tablet *topodatapb.Tablet) error 65 } 66 67 // GRShard stores the information about a Vitess shard that's running MySQL GR 68 type GRShard struct { 69 KeyspaceShard *topo.KeyspaceShard 70 cells []string 71 instances []*grInstance 72 primaryAlias string 73 shardStatusCollector *shardStatusCollector 74 sqlGroup *SQLGroup 75 ts GRTopo 76 tmc GRTmcClient 77 dbAgent db.Agent 78 79 // Every GRShard tracks a unlock function after it grab a topo lock for the shard 80 // VTGR needs to release the topo lock before gracefully shutdown 81 unlock func(*error) 82 // mutex to protect unlock function access 83 unlockMu sync.Mutex 84 85 // configuration 86 minNumReplicas int 87 localDbPort int 88 disableReadOnlyProtection bool 89 90 transientErrorWaitTime time.Duration 91 bootstrapWaitTime time.Duration 92 93 lastDiagnoseResult DiagnoseType 94 lastDiagnoseSince time.Time 95 96 isActive sync2.AtomicBool 97 98 logger *log.Logger 99 100 // lock prevents multiple go routine fights with each other 101 sync.Mutex 102 } 103 104 // shardStatusCollector is used for collecting shard status 105 type shardStatusCollector struct { 106 status *ShardStatus 107 sync.Mutex 108 } 109 110 // ShardStatus is used for debugging purpose to get current status of a shard 111 type ShardStatus struct { 112 Keyspace string 113 Shard string 114 Instances []string 115 Unreachables []string 116 Problematics []string 117 Primary string 118 DiagnoseResult DiagnoseType 119 } 120 121 func newShardStatusCollector(keyspace, shard string) *shardStatusCollector { 122 return &shardStatusCollector{ 123 status: &ShardStatus{Keyspace: keyspace, Shard: shard}, 124 } 125 } 126 127 // NewGRShard creates a new GRShard 128 func NewGRShard( 129 keyspace, shard string, 130 cells []string, 131 tmc GRTmcClient, 132 ts GRTopo, 133 dbAgent db.Agent, 134 config *config.VTGRConfig, 135 localDbPort int, 136 isActive bool) *GRShard { 137 grShard := &GRShard{ 138 KeyspaceShard: &topo.KeyspaceShard{Keyspace: keyspace, Shard: shard}, 139 cells: cells, 140 shardStatusCollector: newShardStatusCollector(keyspace, shard), 141 tmc: tmc, 142 ts: ts, 143 dbAgent: dbAgent, 144 unlock: nil, 145 sqlGroup: NewSQLGroup(config.BootstrapGroupSize, true, keyspace, shard), 146 minNumReplicas: config.MinNumReplica, 147 disableReadOnlyProtection: config.DisableReadOnlyProtection, 148 localDbPort: localDbPort, 149 logger: log.NewVTGRLogger(keyspace, shard), 150 transientErrorWaitTime: time.Duration(config.BackoffErrorWaitTimeSeconds) * time.Second, 151 bootstrapWaitTime: time.Duration(config.BootstrapWaitTimeSeconds) * time.Second, 152 } 153 grShard.isActive.Set(isActive) 154 return grShard 155 } 156 157 // refreshTabletsInShardLocked is called by repair to get a fresh view of the shard 158 // The caller is responsible to make sure the lock on GRShard 159 func (shard *GRShard) refreshTabletsInShardLocked(ctx context.Context) { 160 instances, err := shard.refreshTabletsInShardInternal(ctx) 161 if err == nil { 162 shard.instances = instances 163 } 164 primary, err := shard.refreshPrimaryShard(ctx) 165 if err == nil { 166 shard.primaryAlias = primary 167 return 168 } 169 // If we failed to refreshPrimaryShard, use primary from local tablets 170 shard.primaryAlias = shard.findPrimaryFromLocalCell() 171 } 172 173 // UpdateTabletsInShardWithLock updates the shard instances with a lock 174 func (shard *GRShard) UpdateTabletsInShardWithLock(ctx context.Context) { 175 instances, err := shard.refreshTabletsInShardInternal(ctx) 176 if err == nil { 177 // Take a per shard lock here when we actually refresh the data to avoid 178 // race conditions bewteen controller and repair tasks 179 shard.Lock() 180 shard.instances = instances 181 shard.Unlock() 182 } 183 primary, err := shard.refreshPrimaryShard(ctx) 184 // We set primary separately from instances so that if global topo is not available 185 // VTGR can still discover the new tablets from local cell 186 shard.Lock() 187 defer shard.Unlock() 188 if err == nil { 189 shard.primaryAlias = primary 190 return 191 } 192 shard.primaryAlias = shard.findPrimaryFromLocalCell() 193 } 194 195 func (shard *GRShard) refreshTabletsInShardInternal(ctx context.Context) ([]*grInstance, error) { 196 keyspace, shardName := shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard 197 tablets, err := shard.ts.GetTabletMapForShardByCell(ctx, keyspace, shardName, shard.cells) 198 if err != nil { 199 shard.logger.Errorf("Error fetching tablets for keyspace/shardName %v/%v: %v", keyspace, shardName, err) 200 return nil, err 201 } 202 return parseTabletInfos(tablets), nil 203 } 204 205 func (shard *GRShard) refreshPrimaryShard(ctx context.Context) (string, error) { 206 keyspace, shardName := shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard 207 si, err := shard.ts.GetShard(ctx, keyspace, shardName) 208 if err != nil { 209 shard.logger.Errorf("Error calling GetShard: %v", err) 210 return "", err 211 } 212 return topoproto.TabletAliasString(si.PrimaryAlias), nil 213 } 214 215 // findPrimaryFromLocalCell iterates through the replicas stored in grShard and returns 216 // the one that's marked as primary 217 func (shard *GRShard) findPrimaryFromLocalCell() string { 218 var latestPrimaryTimestamp time.Time 219 var primaryInstance *grInstance 220 for _, instance := range shard.instances { 221 if instance.tablet.Type == topodatapb.TabletType_PRIMARY { 222 // It is possible that there are more than one master in topo server 223 // we should compare timestamp to pick the latest one 224 if latestPrimaryTimestamp.Before(instance.primaryTimeStamp) { 225 latestPrimaryTimestamp = instance.primaryTimeStamp 226 primaryInstance = instance 227 } 228 } 229 } 230 if primaryInstance != nil { 231 return primaryInstance.alias 232 } 233 return "" 234 } 235 236 // parseTabletInfos replaces the replica reports for the shard key 237 // Note: this is not thread-safe 238 func parseTabletInfos(tablets map[string]*topo.TabletInfo) []*grInstance { 239 // collect all replicas 240 var newReplicas []*grInstance 241 for alias, tabletInfo := range tablets { 242 tablet := tabletInfo.Tablet 243 // Only monitor primary, replica and ronly tablet types 244 switch tablet.Type { 245 case topodatapb.TabletType_PRIMARY, topodatapb.TabletType_REPLICA, topodatapb.TabletType_RDONLY: 246 // mysql hostname and port might be empty here if tablet is not running 247 // we will treat them as unreachable 248 instanceKey := inst.InstanceKey{ 249 Hostname: tablet.MysqlHostname, 250 Port: int(tablet.MysqlPort), 251 } 252 grInstance := grInstance{ 253 instanceKey: &instanceKey, 254 tablet: tablet, 255 primaryTimeStamp: logutil.ProtoToTime(tablet.PrimaryTermStartTime), 256 alias: alias, 257 } 258 newReplicas = append(newReplicas, &grInstance) 259 } 260 } 261 return newReplicas 262 } 263 264 // LockShard locks the keyspace-shard on topo server to prevent others from executing conflicting actions. 265 func (shard *GRShard) LockShard(ctx context.Context, action string) (context.Context, error) { 266 if shard.KeyspaceShard.Keyspace == "" || shard.KeyspaceShard.Shard == "" { 267 return nil, fmt.Errorf("try to grab lock with incomplete information: %v/%v", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard) 268 } 269 shard.unlockMu.Lock() 270 defer shard.unlockMu.Unlock() 271 if shard.unlock != nil { 272 return nil, fmt.Errorf("try to grab lock for %s/%s while the shard holds an unlock function", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard) 273 } 274 start := time.Now() 275 ctx, unlock, err := shard.ts.LockShard(ctx, shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard, fmt.Sprintf("VTGR repairing %s", action)) 276 lockShardTimingsMs.Record([]string{action, strconv.FormatBool(err == nil)}, start) 277 if err != nil { 278 return nil, err 279 } 280 shard.unlock = unlock 281 return ctx, nil 282 } 283 284 // UnlockShard unlocks the keyspace-shard on topo server 285 // and set the unlock function to nil in the container 286 func (shard *GRShard) UnlockShard() { 287 shard.unlockMu.Lock() 288 defer shard.unlockMu.Unlock() 289 if shard.unlock == nil { 290 shard.logger.Warningf("Shard %s/%s does not hold a lock", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard) 291 return 292 } 293 var err error 294 shard.unlock(&err) 295 shard.unlock = nil 296 } 297 298 func (shard *GRShard) findTabletByHostAndPort(host string, port int) *grInstance { 299 for _, instance := range shard.instances { 300 if instance.instanceKey.Hostname == host && instance.instanceKey.Port == port { 301 return instance 302 } 303 } 304 return nil 305 } 306 307 func (shard *GRShard) getToleratedNumError() int { 308 quorum := len(shard.instances)/2 + 1 309 return len(shard.instances) - quorum 310 } 311 312 func (shard *GRShard) populateVTGRStatusLocked() { 313 var instanceList []string 314 for _, instance := range shard.instances { 315 instanceList = append(instanceList, instance.alias) 316 } 317 shard.shardStatusCollector.status.Instances = instanceList 318 if primary := shard.findShardPrimaryTablet(); primary != nil { 319 shard.shardStatusCollector.status.Primary = primary.alias 320 } 321 } 322 323 // GetCurrentShardStatuses returns the status collector has 324 func (shard *GRShard) GetCurrentShardStatuses() ShardStatus { 325 shard.Lock() 326 collector := shard.shardStatusCollector 327 // dereference status so that we return a copy of the struct 328 status := *collector.status 329 shard.Unlock() 330 return status 331 } 332 333 // OverrideRebootstrapGroupSize force override the group expectedBootstrapSize used in safety check for rebootstrap 334 func (shard *GRShard) OverrideRebootstrapGroupSize(groupSize int) error { 335 shard.Lock() 336 defer shard.Unlock() 337 shard.logger.Infof("Override rebootstrap group size=%v", groupSize) 338 shard.sqlGroup.rebootstrapSize = groupSize 339 return nil 340 } 341 342 // GetUnlock returns the unlock function for the shard for testing 343 func (shard *GRShard) GetUnlock() func(*error) { 344 shard.unlockMu.Lock() 345 defer shard.unlockMu.Unlock() 346 return shard.unlock 347 } 348 349 // SetIsActive sets isActive for the shard 350 func (shard *GRShard) SetIsActive(isActive bool) { 351 shard.logger.Infof("Setting is active to %v", isActive) 352 shard.isActive.Set(isActive) 353 } 354 355 func (collector *shardStatusCollector) isUnreachable(instance *grInstance) bool { 356 if instance.instanceKey == nil || instance.instanceKey.Hostname == "" { 357 return true 358 } 359 for _, alias := range collector.status.Unreachables { 360 if instance.alias == alias { 361 return true 362 } 363 } 364 return false 365 }