vitess.io/vitess@v0.16.2/go/vt/vtgr/vtgr.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package vtgr
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"os"
    23  	"os/signal"
    24  	"strings"
    25  	"sync"
    26  	"syscall"
    27  	"time"
    28  
    29  	"github.com/spf13/pflag"
    30  
    31  	"vitess.io/vitess/go/sync2"
    32  	"vitess.io/vitess/go/vt/concurrency"
    33  	"vitess.io/vitess/go/vt/log"
    34  	"vitess.io/vitess/go/vt/servenv"
    35  	"vitess.io/vitess/go/vt/topo"
    36  	"vitess.io/vitess/go/vt/vtgr/config"
    37  	"vitess.io/vitess/go/vt/vtgr/controller"
    38  	"vitess.io/vitess/go/vt/vtgr/db"
    39  	"vitess.io/vitess/go/vt/vttablet/tmclient"
    40  )
    41  
    42  var (
    43  	refreshInterval      = 10 * time.Second
    44  	scanInterval         = 3 * time.Second
    45  	scanAndRepairTimeout = 3 * time.Second
    46  	vtgrConfigFile       string
    47  
    48  	localDbPort int
    49  )
    50  
    51  func init() {
    52  	servenv.OnParseFor("vtgr", func(fs *pflag.FlagSet) {
    53  		fs.DurationVar(&refreshInterval, "refresh_interval", 10*time.Second, "Refresh interval to load tablets.")
    54  		fs.DurationVar(&scanInterval, "scan_interval", 3*time.Second, "Scan interval to diagnose and repair.")
    55  		fs.DurationVar(&scanAndRepairTimeout, "scan_repair_timeout", 3*time.Second, "Time to wait for a Diagnose and repair operation.")
    56  		fs.StringVar(&vtgrConfigFile, "vtgr_config", "", "Config file for vtgr.")
    57  		fs.IntVar(&localDbPort, "db_port", 0, "Local mysql port, set this to enable local fast check.")
    58  	})
    59  }
    60  
// VTGR is the interface to manage the component to set up group replication with Vitess.
// The main goal of it is to reconcile MySQL group and the Vitess topology.
// Caller should use OpenTabletDiscovery to create the VTGR instance.
type VTGR struct {
	// Shards are all the shards that a VTGR is monitoring.
	// Caller can choose to iterate the shards to scan and repair for more granular control (e.g., stats report)
	// instead of calling ScanAndRepair() directly.
	Shards []*controller.GRShard
	// topo is the topology handle; it is used to list the shards of a
	// keyspace and is passed to every GRShard that gets created.
	topo   controller.GRTopo
	// tmc is the tablet manager client passed to every GRShard.
	tmc    tmclient.TabletManagerClient
	// ctx is the parent of the per-operation timeout contexts created by
	// tablet discovery, RefreshCluster and ScanAndRepair.
	ctx    context.Context

	// stopped is set to true by the SIGHUP handler. Once set, ScanAndRepair
	// skips its rounds and Repair returns an error.
	stopped sync2.AtomicBool
}
    75  
    76  func newVTGR(ctx context.Context, ts controller.GRTopo, tmc tmclient.TabletManagerClient) *VTGR {
    77  	return &VTGR{
    78  		topo: ts,
    79  		tmc:  tmc,
    80  		ctx:  ctx,
    81  	}
    82  }
    83  
    84  // OpenTabletDiscovery calls OpenTabletDiscoveryWithAcitve and set the shard to be active
    85  // it opens connection with topo server
    86  // and triggers the first round of controller based on specified cells and keyspace/shards.
    87  func OpenTabletDiscovery(ctx context.Context, cellsToWatch, clustersToWatch []string) *VTGR {
    88  	return OpenTabletDiscoveryWithAcitve(ctx, cellsToWatch, clustersToWatch, true)
    89  }
    90  
    91  // OpenTabletDiscoveryWithAcitve opens connection with topo server
    92  // and triggers the first round of controller based on parameter
    93  func OpenTabletDiscoveryWithAcitve(ctx context.Context, cellsToWatch, clustersToWatch []string, active bool) *VTGR {
    94  	if vtgrConfigFile == "" {
    95  		log.Fatal("vtgr_config is required")
    96  	}
    97  	config, err := config.ReadVTGRConfig(vtgrConfigFile)
    98  	if err != nil {
    99  		log.Fatalf("Cannot load vtgr config file: %v", err)
   100  	}
   101  	vtgr := newVTGR(
   102  		ctx,
   103  		topo.Open(),
   104  		tmclient.NewTabletManagerClient(),
   105  	)
   106  	var shards []*controller.GRShard
   107  	ctx, cancel := context.WithTimeout(vtgr.ctx, topo.RemoteOperationTimeout)
   108  	defer cancel()
   109  	for _, ks := range clustersToWatch {
   110  		if strings.Contains(ks, "/") {
   111  			// This is a keyspace/shard specification
   112  			input := strings.Split(ks, "/")
   113  			shards = append(shards, controller.NewGRShard(input[0], input[1], cellsToWatch, vtgr.tmc, vtgr.topo, db.NewVTGRSqlAgent(), config, localDbPort, active))
   114  		} else {
   115  			// Assume this is a keyspace and find all shards in keyspace
   116  			shardNames, err := vtgr.topo.GetShardNames(ctx, ks)
   117  			if err != nil {
   118  				// Log the error and continue
   119  				log.Errorf("Error fetching shards for keyspace %v: %v", ks, err)
   120  				continue
   121  			}
   122  			if len(shardNames) == 0 {
   123  				log.Errorf("Topo has no shards for ks: %v", ks)
   124  				continue
   125  			}
   126  			for _, s := range shardNames {
   127  				shards = append(shards, controller.NewGRShard(ks, s, cellsToWatch, vtgr.tmc, vtgr.topo, db.NewVTGRSqlAgent(), config, localDbPort, active))
   128  			}
   129  		}
   130  	}
   131  	vtgr.handleSignal(os.Exit)
   132  	vtgr.Shards = shards
   133  	log.Infof("Monitoring shards size %v", len(vtgr.Shards))
   134  	// Force refresh all tablet here to populate data for vtgr
   135  	var wg sync.WaitGroup
   136  	for _, shard := range vtgr.Shards {
   137  		wg.Add(1)
   138  		go func(shard *controller.GRShard) {
   139  			defer wg.Done()
   140  			shard.UpdateTabletsInShardWithLock(ctx)
   141  		}(shard)
   142  	}
   143  	wg.Wait()
   144  	log.Info("Ready to start VTGR")
   145  	return vtgr
   146  }
   147  
   148  // RefreshCluster get the latest tablets from topo server
   149  func (vtgr *VTGR) RefreshCluster() {
   150  	for _, shard := range vtgr.Shards {
   151  		go func(shard *controller.GRShard) {
   152  			ticker := time.Tick(refreshInterval)
   153  			for range ticker {
   154  				ctx, cancel := context.WithTimeout(vtgr.ctx, refreshInterval)
   155  				shard.UpdateTabletsInShardWithLock(ctx)
   156  				cancel()
   157  			}
   158  		}(shard)
   159  	}
   160  }
   161  
   162  // ScanAndRepair starts the scanAndFix routine
   163  func (vtgr *VTGR) ScanAndRepair() {
   164  	for _, shard := range vtgr.Shards {
   165  		go func(shard *controller.GRShard) {
   166  			ticker := time.Tick(scanInterval)
   167  			for range ticker {
   168  				func() {
   169  					ctx, cancel := context.WithTimeout(vtgr.ctx, scanAndRepairTimeout)
   170  					defer cancel()
   171  					if !vtgr.stopped.Get() {
   172  						log.Infof("Start scan and repair %v/%v", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard)
   173  						shard.ScanAndRepairShard(ctx)
   174  						log.Infof("Finished scan and repair %v/%v", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard)
   175  					}
   176  				}()
   177  			}
   178  		}(shard)
   179  	}
   180  }
   181  
   182  // Diagnose exposes the endpoint to diagnose a particular shard
   183  func (vtgr *VTGR) Diagnose(ctx context.Context, shard *controller.GRShard) (controller.DiagnoseType, error) {
   184  	return shard.Diagnose(ctx)
   185  }
   186  
   187  // Repair exposes the endpoint to repair a particular shard
   188  func (vtgr *VTGR) Repair(ctx context.Context, shard *controller.GRShard, diagnose controller.DiagnoseType) (controller.RepairResultCode, error) {
   189  	if vtgr.stopped.Get() {
   190  		return controller.Fail, errors.New("VTGR is stopped")
   191  	}
   192  	return shard.Repair(ctx, diagnose)
   193  }
   194  
   195  // GetCurrentShardStatuses is used when we want to know what VTGR observes
   196  // it contains information about a list of instances and primary tablet
   197  func (vtgr *VTGR) GetCurrentShardStatuses() []controller.ShardStatus {
   198  	var result []controller.ShardStatus
   199  	for _, shard := range vtgr.Shards {
   200  		status := shard.GetCurrentShardStatuses()
   201  		result = append(result, status)
   202  	}
   203  	return result
   204  }
   205  
   206  // OverrideRebootstrapGroupSize forces an override the group size used in safety check for rebootstrap
   207  func (vtgr *VTGR) OverrideRebootstrapGroupSize(groupSize int) error {
   208  	errorRecord := concurrency.AllErrorRecorder{}
   209  	for _, shard := range vtgr.Shards {
   210  		err := shard.OverrideRebootstrapGroupSize(groupSize)
   211  		if err != nil {
   212  			errorRecord.RecordError(err)
   213  		}
   214  	}
   215  	return errorRecord.Error()
   216  }
   217  
   218  func (vtgr *VTGR) handleSignal(action func(int)) {
   219  	sigChan := make(chan os.Signal, 1)
   220  	signal.Notify(sigChan, syscall.SIGHUP)
   221  	go func() {
   222  		// block until the signal is received
   223  		<-sigChan
   224  		log.Infof("Handling SIGHUP")
   225  		// Set stopped to true so that following repair call won't do anything
   226  		// For the ongoing repairs, checkShardLocked will abort if needed
   227  		vtgr.stopped.Set(true)
   228  		for _, shard := range vtgr.Shards {
   229  			shard.UnlockShard()
   230  		}
   231  		action(1)
   232  	}()
   233  }