vitess.io/vitess@v0.16.2/go/vt/vtgr/vtgr.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package vtgr 18 19 import ( 20 "context" 21 "errors" 22 "os" 23 "os/signal" 24 "strings" 25 "sync" 26 "syscall" 27 "time" 28 29 "github.com/spf13/pflag" 30 31 "vitess.io/vitess/go/sync2" 32 "vitess.io/vitess/go/vt/concurrency" 33 "vitess.io/vitess/go/vt/log" 34 "vitess.io/vitess/go/vt/servenv" 35 "vitess.io/vitess/go/vt/topo" 36 "vitess.io/vitess/go/vt/vtgr/config" 37 "vitess.io/vitess/go/vt/vtgr/controller" 38 "vitess.io/vitess/go/vt/vtgr/db" 39 "vitess.io/vitess/go/vt/vttablet/tmclient" 40 ) 41 42 var ( 43 refreshInterval = 10 * time.Second 44 scanInterval = 3 * time.Second 45 scanAndRepairTimeout = 3 * time.Second 46 vtgrConfigFile string 47 48 localDbPort int 49 ) 50 51 func init() { 52 servenv.OnParseFor("vtgr", func(fs *pflag.FlagSet) { 53 fs.DurationVar(&refreshInterval, "refresh_interval", 10*time.Second, "Refresh interval to load tablets.") 54 fs.DurationVar(&scanInterval, "scan_interval", 3*time.Second, "Scan interval to diagnose and repair.") 55 fs.DurationVar(&scanAndRepairTimeout, "scan_repair_timeout", 3*time.Second, "Time to wait for a Diagnose and repair operation.") 56 fs.StringVar(&vtgrConfigFile, "vtgr_config", "", "Config file for vtgr.") 57 fs.IntVar(&localDbPort, "db_port", 0, "Local mysql port, set this to enable local fast check.") 58 }) 59 } 60 61 // VTGR is the interface to manage the component to set up group replication with Vitess. 62 // The main goal of it is to reconcile MySQL group and the Vitess topology. 63 // Caller should use OpenTabletDiscovery to create the VTGR instance. 64 type VTGR struct { 65 // Shards are all the shards that a VTGR is monitoring. 66 // Caller can choose to iterate the shards to scan and repair for more granular control (e.g., stats report) 67 // instead of calling ScanAndRepair() directly. 68 Shards []*controller.GRShard 69 topo controller.GRTopo 70 tmc tmclient.TabletManagerClient 71 ctx context.Context 72 73 stopped sync2.AtomicBool 74 } 75 76 func newVTGR(ctx context.Context, ts controller.GRTopo, tmc tmclient.TabletManagerClient) *VTGR { 77 return &VTGR{ 78 topo: ts, 79 tmc: tmc, 80 ctx: ctx, 81 } 82 } 83 84 // OpenTabletDiscovery calls OpenTabletDiscoveryWithAcitve and set the shard to be active 85 // it opens connection with topo server 86 // and triggers the first round of controller based on specified cells and keyspace/shards. 87 func OpenTabletDiscovery(ctx context.Context, cellsToWatch, clustersToWatch []string) *VTGR { 88 return OpenTabletDiscoveryWithAcitve(ctx, cellsToWatch, clustersToWatch, true) 89 } 90 91 // OpenTabletDiscoveryWithAcitve opens connection with topo server 92 // and triggers the first round of controller based on parameter 93 func OpenTabletDiscoveryWithAcitve(ctx context.Context, cellsToWatch, clustersToWatch []string, active bool) *VTGR { 94 if vtgrConfigFile == "" { 95 log.Fatal("vtgr_config is required") 96 } 97 config, err := config.ReadVTGRConfig(vtgrConfigFile) 98 if err != nil { 99 log.Fatalf("Cannot load vtgr config file: %v", err) 100 } 101 vtgr := newVTGR( 102 ctx, 103 topo.Open(), 104 tmclient.NewTabletManagerClient(), 105 ) 106 var shards []*controller.GRShard 107 ctx, cancel := context.WithTimeout(vtgr.ctx, topo.RemoteOperationTimeout) 108 defer cancel() 109 for _, ks := range clustersToWatch { 110 if strings.Contains(ks, "/") { 111 // This is a keyspace/shard specification 112 input := strings.Split(ks, "/") 113 shards = append(shards, controller.NewGRShard(input[0], input[1], cellsToWatch, vtgr.tmc, vtgr.topo, db.NewVTGRSqlAgent(), config, localDbPort, active)) 114 } else { 115 // Assume this is a keyspace and find all shards in keyspace 116 shardNames, err := vtgr.topo.GetShardNames(ctx, ks) 117 if err != nil { 118 // Log the error and continue 119 log.Errorf("Error fetching shards for keyspace %v: %v", ks, err) 120 continue 121 } 122 if len(shardNames) == 0 { 123 log.Errorf("Topo has no shards for ks: %v", ks) 124 continue 125 } 126 for _, s := range shardNames { 127 shards = append(shards, controller.NewGRShard(ks, s, cellsToWatch, vtgr.tmc, vtgr.topo, db.NewVTGRSqlAgent(), config, localDbPort, active)) 128 } 129 } 130 } 131 vtgr.handleSignal(os.Exit) 132 vtgr.Shards = shards 133 log.Infof("Monitoring shards size %v", len(vtgr.Shards)) 134 // Force refresh all tablet here to populate data for vtgr 135 var wg sync.WaitGroup 136 for _, shard := range vtgr.Shards { 137 wg.Add(1) 138 go func(shard *controller.GRShard) { 139 defer wg.Done() 140 shard.UpdateTabletsInShardWithLock(ctx) 141 }(shard) 142 } 143 wg.Wait() 144 log.Info("Ready to start VTGR") 145 return vtgr 146 } 147 148 // RefreshCluster get the latest tablets from topo server 149 func (vtgr *VTGR) RefreshCluster() { 150 for _, shard := range vtgr.Shards { 151 go func(shard *controller.GRShard) { 152 ticker := time.Tick(refreshInterval) 153 for range ticker { 154 ctx, cancel := context.WithTimeout(vtgr.ctx, refreshInterval) 155 shard.UpdateTabletsInShardWithLock(ctx) 156 cancel() 157 } 158 }(shard) 159 } 160 } 161 162 // ScanAndRepair starts the scanAndFix routine 163 func (vtgr *VTGR) ScanAndRepair() { 164 for _, shard := range vtgr.Shards { 165 go func(shard *controller.GRShard) { 166 ticker := time.Tick(scanInterval) 167 for range ticker { 168 func() { 169 ctx, cancel := context.WithTimeout(vtgr.ctx, scanAndRepairTimeout) 170 defer cancel() 171 if !vtgr.stopped.Get() { 172 log.Infof("Start scan and repair %v/%v", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard) 173 shard.ScanAndRepairShard(ctx) 174 log.Infof("Finished scan and repair %v/%v", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard) 175 } 176 }() 177 } 178 }(shard) 179 } 180 } 181 182 // Diagnose exposes the endpoint to diagnose a particular shard 183 func (vtgr *VTGR) Diagnose(ctx context.Context, shard *controller.GRShard) (controller.DiagnoseType, error) { 184 return shard.Diagnose(ctx) 185 } 186 187 // Repair exposes the endpoint to repair a particular shard 188 func (vtgr *VTGR) Repair(ctx context.Context, shard *controller.GRShard, diagnose controller.DiagnoseType) (controller.RepairResultCode, error) { 189 if vtgr.stopped.Get() { 190 return controller.Fail, errors.New("VTGR is stopped") 191 } 192 return shard.Repair(ctx, diagnose) 193 } 194 195 // GetCurrentShardStatuses is used when we want to know what VTGR observes 196 // it contains information about a list of instances and primary tablet 197 func (vtgr *VTGR) GetCurrentShardStatuses() []controller.ShardStatus { 198 var result []controller.ShardStatus 199 for _, shard := range vtgr.Shards { 200 status := shard.GetCurrentShardStatuses() 201 result = append(result, status) 202 } 203 return result 204 } 205 206 // OverrideRebootstrapGroupSize forces an override the group size used in safety check for rebootstrap 207 func (vtgr *VTGR) OverrideRebootstrapGroupSize(groupSize int) error { 208 errorRecord := concurrency.AllErrorRecorder{} 209 for _, shard := range vtgr.Shards { 210 err := shard.OverrideRebootstrapGroupSize(groupSize) 211 if err != nil { 212 errorRecord.RecordError(err) 213 } 214 } 215 return errorRecord.Error() 216 } 217 218 func (vtgr *VTGR) handleSignal(action func(int)) { 219 sigChan := make(chan os.Signal, 1) 220 signal.Notify(sigChan, syscall.SIGHUP) 221 go func() { 222 // block until the signal is received 223 <-sigChan 224 log.Infof("Handling SIGHUP") 225 // Set stopped to true so that following repair call won't do anything 226 // For the ongoing repairs, checkShardLocked will abort if needed 227 vtgr.stopped.Set(true) 228 for _, shard := range vtgr.Shards { 229 shard.UnlockShard() 230 } 231 action(1) 232 }() 233 }