vitess.io/vitess@v0.16.2/go/vt/vtorc/logic/tablet_discovery.go (about)

     1  /*
     2  Copyright 2020 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package logic
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"strings"
    23  	"sync"
    24  	"sync/atomic"
    25  	"time"
    26  
    27  	"github.com/spf13/pflag"
    28  
    29  	"vitess.io/vitess/go/vt/external/golib/sqlutils"
    30  
    31  	"google.golang.org/protobuf/encoding/prototext"
    32  	"google.golang.org/protobuf/proto"
    33  
    34  	"vitess.io/vitess/go/vt/log"
    35  	"vitess.io/vitess/go/vt/topo/topoproto"
    36  
    37  	"vitess.io/vitess/go/vt/vtorc/config"
    38  
    39  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    40  	"vitess.io/vitess/go/vt/topo"
    41  	"vitess.io/vitess/go/vt/topotools"
    42  	"vitess.io/vitess/go/vt/vtorc/db"
    43  	"vitess.io/vitess/go/vt/vtorc/inst"
    44  	"vitess.io/vitess/go/vt/vttablet/tmclient"
    45  )
    46  
// Package-level state shared by the tablet discovery and recovery helpers in this file.
var (
	// ts is the topology server handle; it is assigned by OpenTabletDiscovery.
	ts                *topo.Server
	// tmc is the tablet manager client used for the tablet RPCs issued from this file;
	// it is assigned by OpenTabletDiscovery.
	tmc               tmclient.TabletManagerClient
	// clustersToWatch holds the value of the --clusters_to_watch flag: keyspaces or
	// keyspace/shard entries to monitor. When empty, all known cells are refreshed.
	clustersToWatch   []string
	// shutdownWaitTime holds the value of the --shutdown_wait_time flag (default 30s).
	shutdownWaitTime  = 30 * time.Second
	// shardsLockCounter is updated atomically by LockShard: incremented before a shard
	// lock is attempted and decremented when the attempt fails or the lock is released.
	shardsLockCounter int32
	// ErrNoPrimaryTablet is a fixed error message.
	ErrNoPrimaryTablet = errors.New("no primary tablet found")
)
    56  
    57  // RegisterFlags registers the flags required by VTOrc
    58  func RegisterFlags(fs *pflag.FlagSet) {
    59  	fs.StringSliceVar(&clustersToWatch, "clusters_to_watch", clustersToWatch, "Comma-separated list of keyspaces or keyspace/shards that this instance will monitor and repair. Defaults to all clusters in the topology. Example: \"ks1,ks2/-80\"")
    60  	fs.DurationVar(&shutdownWaitTime, "shutdown_wait_time", shutdownWaitTime, "Maximum time to wait for VTOrc to release all the locks that it is holding before shutting down on SIGTERM")
    61  }
    62  
    63  // OpenTabletDiscovery opens the vitess topo if enables and returns a ticker
    64  // channel for polling.
    65  func OpenTabletDiscovery() <-chan time.Time {
    66  	// TODO(sougou): If there's a shutdown signal, we have to close the topo.
    67  	ts = topo.Open()
    68  	// TODO(sougou): remove ts and push some functions into inst.
    69  	inst.TopoServ = ts
    70  	tmc = tmclient.NewTabletManagerClient()
    71  	// Clear existing cache and perform a new refresh.
    72  	if _, err := db.ExecVTOrc("delete from vitess_tablet"); err != nil {
    73  		log.Error(err)
    74  	}
    75  	return time.Tick(time.Second * time.Duration(config.Config.TopoInformationRefreshSeconds)) //nolint SA1015: using time.Tick leaks the underlying ticker
    76  }
    77  
    78  // refreshAllTablets reloads the tablets from topo and discovers the ones which haven't been refreshed in a while
    79  func refreshAllTablets() {
    80  	refreshTabletsUsing(func(instanceKey *inst.InstanceKey) {
    81  		DiscoverInstance(*instanceKey, false /* forceDiscovery */)
    82  	}, false /* forceRefresh */)
    83  }
    84  
    85  func refreshTabletsUsing(loader func(instanceKey *inst.InstanceKey), forceRefresh bool) {
    86  	if !IsLeaderOrActive() {
    87  		return
    88  	}
    89  	if len(clustersToWatch) == 0 { // all known clusters
    90  		ctx, cancel := context.WithTimeout(context.Background(), topo.RemoteOperationTimeout)
    91  		defer cancel()
    92  		cells, err := ts.GetKnownCells(ctx)
    93  		if err != nil {
    94  			log.Error(err)
    95  			return
    96  		}
    97  
    98  		refreshCtx, refreshCancel := context.WithTimeout(context.Background(), topo.RemoteOperationTimeout)
    99  		defer refreshCancel()
   100  		var wg sync.WaitGroup
   101  		for _, cell := range cells {
   102  			wg.Add(1)
   103  			go func(cell string) {
   104  				defer wg.Done()
   105  				refreshTabletsInCell(refreshCtx, cell, loader, forceRefresh)
   106  			}(cell)
   107  		}
   108  		wg.Wait()
   109  	} else {
   110  		// Parse input and build list of keyspaces / shards
   111  		var keyspaceShards []*topo.KeyspaceShard
   112  		for _, ks := range clustersToWatch {
   113  			if strings.Contains(ks, "/") {
   114  				// This is a keyspace/shard specification
   115  				input := strings.Split(ks, "/")
   116  				keyspaceShards = append(keyspaceShards, &topo.KeyspaceShard{Keyspace: input[0], Shard: input[1]})
   117  			} else {
   118  				// Assume this is a keyspace and find all shards in keyspace
   119  				ctx, cancel := context.WithTimeout(context.Background(), topo.RemoteOperationTimeout)
   120  				defer cancel()
   121  				shards, err := ts.GetShardNames(ctx, ks)
   122  				if err != nil {
   123  					// Log the errr and continue
   124  					log.Errorf("Error fetching shards for keyspace: %v", ks)
   125  					continue
   126  				}
   127  				if len(shards) == 0 {
   128  					log.Errorf("Topo has no shards for ks: %v", ks)
   129  					continue
   130  				}
   131  				for _, s := range shards {
   132  					keyspaceShards = append(keyspaceShards, &topo.KeyspaceShard{Keyspace: ks, Shard: s})
   133  				}
   134  			}
   135  		}
   136  		if len(keyspaceShards) == 0 {
   137  			log.Errorf("Found no keyspaceShards for input: %+v", clustersToWatch)
   138  			return
   139  		}
   140  		refreshCtx, refreshCancel := context.WithTimeout(context.Background(), topo.RemoteOperationTimeout)
   141  		defer refreshCancel()
   142  		var wg sync.WaitGroup
   143  		for _, ks := range keyspaceShards {
   144  			wg.Add(1)
   145  			go func(ks *topo.KeyspaceShard) {
   146  				defer wg.Done()
   147  				refreshTabletsInKeyspaceShard(refreshCtx, ks.Keyspace, ks.Shard, loader, forceRefresh)
   148  			}(ks)
   149  		}
   150  		wg.Wait()
   151  	}
   152  }
   153  
   154  func refreshTabletsInCell(ctx context.Context, cell string, loader func(instanceKey *inst.InstanceKey), forceRefresh bool) {
   155  	tablets, err := topotools.GetTabletMapForCell(ctx, ts, cell)
   156  	if err != nil {
   157  		log.Errorf("Error fetching topo info for cell %v: %v", cell, err)
   158  		return
   159  	}
   160  	query := "select hostname, port, info from vitess_tablet where cell = ?"
   161  	args := sqlutils.Args(cell)
   162  	refreshTablets(tablets, query, args, loader, forceRefresh)
   163  }
   164  
   165  // forceRefreshAllTabletsInShard is used to refresh all the tablet's information (both MySQL information and topo records)
   166  // for a given shard. This function is meant to be called before or after a cluster-wide operation that we know will
   167  // change the replication information for the entire cluster drastically enough to warrant a full forceful refresh
   168  func forceRefreshAllTabletsInShard(ctx context.Context, keyspace, shard string) {
   169  	log.Infof("force refresh of all tablets in shard - %v/%v", keyspace, shard)
   170  	refreshCtx, refreshCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
   171  	defer refreshCancel()
   172  	refreshTabletsInKeyspaceShard(refreshCtx, keyspace, shard, func(instanceKey *inst.InstanceKey) {
   173  		DiscoverInstance(*instanceKey, true)
   174  	}, true)
   175  }
   176  
   177  // refreshTabletInfoOfShard only refreshes the tablet records from the topo-server for all the tablets
   178  // of the given keyspace-shard.
   179  func refreshTabletInfoOfShard(ctx context.Context, keyspace, shard string) {
   180  	log.Infof("refresh of tablet records of shard - %v/%v", keyspace, shard)
   181  	refreshTabletsInKeyspaceShard(ctx, keyspace, shard, func(instanceKey *inst.InstanceKey) {
   182  		// No-op
   183  		// We only want to refresh the tablet information for the given shard
   184  	}, false)
   185  }
   186  
   187  func refreshTabletsInKeyspaceShard(ctx context.Context, keyspace, shard string, loader func(instanceKey *inst.InstanceKey), forceRefresh bool) {
   188  	tablets, err := ts.GetTabletMapForShard(ctx, keyspace, shard)
   189  	if err != nil {
   190  		log.Errorf("Error fetching tablets for keyspace/shard %v/%v: %v", keyspace, shard, err)
   191  		return
   192  	}
   193  	query := "select hostname, port, info from vitess_tablet where keyspace = ? and shard = ?"
   194  	args := sqlutils.Args(keyspace, shard)
   195  	refreshTablets(tablets, query, args, loader, forceRefresh)
   196  }
   197  
   198  func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []any, loader func(instanceKey *inst.InstanceKey), forceRefresh bool) {
   199  	// Discover new tablets.
   200  	// TODO(sougou): enhance this to work with multi-schema,
   201  	// where each instanceKey can have multiple tablets.
   202  	latestInstances := make(map[string]bool)
   203  	var wg sync.WaitGroup
   204  	for _, tabletInfo := range tablets {
   205  		tablet := tabletInfo.Tablet
   206  		if tablet.Type != topodatapb.TabletType_PRIMARY && !topo.IsReplicaType(tablet.Type) {
   207  			continue
   208  		}
   209  		latestInstances[topoproto.TabletAliasString(tablet.Alias)] = true
   210  		if tablet.MysqlHostname == "" {
   211  			continue
   212  		}
   213  		instanceKey := inst.InstanceKey{
   214  			Hostname: tablet.MysqlHostname,
   215  			Port:     int(tablet.MysqlPort),
   216  		}
   217  		old, err := inst.ReadTablet(instanceKey)
   218  		if err != nil && err != inst.ErrTabletAliasNil {
   219  			log.Error(err)
   220  			continue
   221  		}
   222  		if !forceRefresh && proto.Equal(tablet, old) {
   223  			continue
   224  		}
   225  		if err := inst.SaveTablet(tablet); err != nil {
   226  			log.Error(err)
   227  			continue
   228  		}
   229  		wg.Add(1)
   230  		go func() {
   231  			defer wg.Done()
   232  			loader(&instanceKey)
   233  		}()
   234  		log.Infof("Discovered: %v", tablet)
   235  	}
   236  	wg.Wait()
   237  
   238  	// Forget tablets that were removed.
   239  	toForget := make(map[inst.InstanceKey]*topodatapb.Tablet)
   240  	err := db.QueryVTOrc(query, args, func(row sqlutils.RowMap) error {
   241  		curKey := inst.InstanceKey{
   242  			Hostname: row.GetString("hostname"),
   243  			Port:     row.GetInt("port"),
   244  		}
   245  		tablet := &topodatapb.Tablet{}
   246  		if err := prototext.Unmarshal([]byte(row.GetString("info")), tablet); err != nil {
   247  			log.Error(err)
   248  			return nil
   249  		}
   250  		if !latestInstances[topoproto.TabletAliasString(tablet.Alias)] {
   251  			toForget[curKey] = tablet
   252  		}
   253  		return nil
   254  	})
   255  	if err != nil {
   256  		log.Error(err)
   257  	}
   258  	for instanceKey, tablet := range toForget {
   259  		log.Infof("Forgetting: %v", tablet)
   260  		_, err := db.ExecVTOrc(`
   261  					delete
   262  						from vitess_tablet
   263  					where
   264  						hostname=? and port=?`,
   265  			instanceKey.Hostname,
   266  			instanceKey.Port,
   267  		)
   268  		if err != nil {
   269  			log.Error(err)
   270  		}
   271  		if err := inst.ForgetInstance(&instanceKey); err != nil {
   272  			log.Error(err)
   273  		}
   274  	}
   275  }
   276  
   277  // LockShard locks the keyspace-shard preventing others from performing conflicting actions.
   278  func LockShard(ctx context.Context, instanceKey inst.InstanceKey) (context.Context, func(*error), error) {
   279  	if instanceKey.Hostname == "" {
   280  		return nil, nil, errors.New("Can't lock shard: instance is unspecified")
   281  	}
   282  	val := atomic.LoadInt32(&hasReceivedSIGTERM)
   283  	if val > 0 {
   284  		return nil, nil, errors.New("Can't lock shard: SIGTERM received")
   285  	}
   286  
   287  	tablet, err := inst.ReadTablet(instanceKey)
   288  	if err != nil {
   289  		return nil, nil, err
   290  	}
   291  
   292  	atomic.AddInt32(&shardsLockCounter, 1)
   293  	ctx, unlock, err := ts.TryLockShard(ctx, tablet.Keyspace, tablet.Shard, "Orc Recovery")
   294  	if err != nil {
   295  		atomic.AddInt32(&shardsLockCounter, -1)
   296  		return nil, nil, err
   297  	}
   298  	return ctx, func(e *error) {
   299  		defer atomic.AddInt32(&shardsLockCounter, -1)
   300  		unlock(e)
   301  	}, nil
   302  }
   303  
// tabletUndoDemotePrimary issues the UndoDemotePrimary RPC to the given tablet through
// the package-level tablet manager client, forwarding semiSync unchanged, and returns
// the RPC's error as-is.
func tabletUndoDemotePrimary(ctx context.Context, tablet *topodatapb.Tablet, semiSync bool) error {
	return tmc.UndoDemotePrimary(ctx, tablet, semiSync)
}
   308  
// setReadOnly issues the SetReadOnly RPC to the given tablet through the package-level
// tablet manager client and returns the RPC's error as-is.
func setReadOnly(ctx context.Context, tablet *topodatapb.Tablet) error {
	return tmc.SetReadOnly(ctx, tablet)
}
   313  
// setReplicationSource issues the SetReplicationSource RPC to the replica tablet through
// the package-level tablet manager client, pointing it at the alias of the given primary
// and forwarding semiSync unchanged. The RPC's error is returned as-is.
func setReplicationSource(ctx context.Context, replica *topodatapb.Tablet, primary *topodatapb.Tablet, semiSync bool) error {
	// NOTE(review): the literal arguments (0, "", true) are presumably the time-created
	// value, the wait position and the force-start-replication flag — confirm against the
	// tmclient.TabletManagerClient interface.
	return tmc.SetReplicationSource(ctx, replica, primary.Alias, 0, "", true, semiSync)
}
   318  
   319  // shardPrimary finds the primary of the given keyspace-shard by reading the vtorc backend
   320  func shardPrimary(keyspace string, shard string) (primary *topodatapb.Tablet, err error) {
   321  	query := `SELECT
   322  		info,
   323  		hostname,
   324  		port,
   325  		tablet_type,
   326  		primary_timestamp
   327  	FROM 
   328  		vitess_tablet
   329  	WHERE
   330  		keyspace = ? AND shard = ?
   331  		AND tablet_type = ?
   332  	ORDER BY
   333  		primary_timestamp DESC
   334  	LIMIT 1
   335  `
   336  	err = db.Db.QueryVTOrc(query, sqlutils.Args(keyspace, shard, topodatapb.TabletType_PRIMARY), func(m sqlutils.RowMap) error {
   337  		if primary == nil {
   338  			primary = &topodatapb.Tablet{}
   339  			return prototext.Unmarshal([]byte(m.GetString("info")), primary)
   340  		}
   341  		return nil
   342  	})
   343  	if primary == nil && err == nil {
   344  		err = ErrNoPrimaryTablet
   345  	}
   346  	return primary, err
   347  }
   348  
   349  // restartsReplication restarts the replication on the provided replicaKey. It also sets the correct semi-sync settings when it starts replication
   350  func restartReplication(replicaKey *inst.InstanceKey) error {
   351  	replicaTablet, err := inst.ReadTablet(*replicaKey)
   352  	if err != nil {
   353  		log.Info("Could not read tablet - %+v", replicaKey)
   354  		return err
   355  	}
   356  
   357  	primaryTablet, err := shardPrimary(replicaTablet.Keyspace, replicaTablet.Shard)
   358  	if err != nil {
   359  		log.Info("Could not compute primary for %v/%v", replicaTablet.Keyspace, replicaTablet.Shard)
   360  		return err
   361  	}
   362  
   363  	durabilityPolicy, err := inst.GetDurabilityPolicy(replicaTablet)
   364  	if err != nil {
   365  		log.Info("Could not read the durability policy for %v/%v", replicaTablet.Keyspace, replicaTablet.Shard)
   366  		return err
   367  	}
   368  
   369  	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(config.Config.WaitReplicasTimeoutSeconds)*time.Second)
   370  	defer cancel()
   371  	err = tmc.StopReplication(ctx, replicaTablet)
   372  	if err != nil {
   373  		log.Info("Could not stop replication on %v", topoproto.TabletAliasString(replicaTablet.Alias))
   374  		return err
   375  	}
   376  	err = tmc.StartReplication(ctx, replicaTablet, inst.IsReplicaSemiSync(durabilityPolicy, primaryTablet, replicaTablet))
   377  	if err != nil {
   378  		log.Info("Could not start replication on %v", topoproto.TabletAliasString(replicaTablet.Alias))
   379  		return err
   380  	}
   381  	return nil
   382  }