vitess.io/vitess@v0.16.2/go/vt/vtctl/grpcvtctldserver/topo.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8  	http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package grpcvtctldserver
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"time"
    23  
    24  	"vitess.io/vitess/go/trace"
    25  	"vitess.io/vitess/go/vt/log"
    26  	"vitess.io/vitess/go/vt/topo"
    27  	"vitess.io/vitess/go/vt/topo/topoproto"
    28  	"vitess.io/vitess/go/vt/topotools"
    29  	"vitess.io/vitess/go/vt/vterrors"
    30  
    31  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    32  	"vitess.io/vitess/go/vt/proto/vtrpc"
    33  )
    34  
    35  func deleteShard(ctx context.Context, ts *topo.Server, keyspace string, shard string, recursive bool, evenIfServing bool, force bool) (err error) {
    36  	span, ctx := trace.NewSpan(ctx, "VtctldServer.deleteShard")
    37  	defer span.Finish()
    38  
    39  	span.Annotate("keyspace", keyspace)
    40  	span.Annotate("shard", shard)
    41  	span.Annotate("recursive", recursive)
    42  	span.Annotate("even_if_serving", evenIfServing)
    43  	span.Annotate("force", force)
    44  
    45  	lctx, unlock, lerr := ts.LockShard(ctx, keyspace, shard, "DeleteShard")
    46  	switch {
    47  	case lerr == nil:
    48  		// We locked the shard, all good
    49  		ctx = lctx
    50  	case !force:
    51  		return fmt.Errorf("failed to lock %s/%s; if you really want to delete this shard, re-run with Force=true: %w", keyspace, shard, lerr)
    52  	default:
    53  		// Failed to lock, but force=true. Warn and continue
    54  		log.Warningf("%s: failed to lock shard %s/%s for deletion, but force=true, proceeding anyway ...", lerr, keyspace, shard)
    55  	}
    56  
    57  	if unlock != nil {
    58  		defer func() {
    59  			// Attempting to unlock a shard we successfully deleted results in
    60  			// ts.unlockShard returning an error, which can make the overall
    61  			// RPC _seem_ like it failed.
    62  			//
    63  			// So, we do this extra checking to allow for specifically this
    64  			// scenario to result in "success."
    65  			origErr := err
    66  			unlock(&err)
    67  			if origErr == nil && topo.IsErrType(err, topo.NoNode) {
    68  				err = nil
    69  			}
    70  		}()
    71  	}
    72  
    73  	// Read the Shard object. If it's not in the topo, try to clean up the topo
    74  	// anyway.
    75  	shardInfo, err := ts.GetShard(ctx, keyspace, shard)
    76  	if err != nil {
    77  		if topo.IsErrType(err, topo.NoNode) {
    78  			log.Infof("Shard %v/%v doesn't seem to exist; cleaning up any potential leftover topo data", keyspace, shard)
    79  
    80  			_ = ts.DeleteShard(ctx, keyspace, shard)
    81  			return nil
    82  		}
    83  
    84  		return err
    85  	}
    86  
    87  	servingCells, err := ts.GetShardServingCells(ctx, shardInfo)
    88  	if err != nil {
    89  		return err
    90  	}
    91  
    92  	// We never want to remove a potentially serving shard unless someone
    93  	// explicitly requested it.
    94  	if len(servingCells) > 0 && !evenIfServing {
    95  		return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "shard %v/%v is still serving; cannot delete it; use EvenIfServing = true to delete anyway", keyspace, shard)
    96  	}
    97  
    98  	cells, err := ts.GetCellInfoNames(ctx)
    99  	if err != nil {
   100  		return err
   101  	}
   102  
   103  	for _, cell := range cells {
   104  		err = deleteShardCell(ctx, ts, keyspace, shard, cell, recursive)
   105  		if err != nil {
   106  			return err
   107  		}
   108  	}
   109  
   110  	// Try to remove the replication and serving graphs from each cell,
   111  	// regardless of whether they exist.
   112  	for _, cell := range cells {
   113  		if err := ts.DeleteShardReplication(ctx, cell, keyspace, shard); err != nil && !topo.IsErrType(err, topo.NoNode) {
   114  			log.Warningf("Cannot delete ShardReplication in cell %v for %v/%v: %w", cell, keyspace, shard, err)
   115  		}
   116  	}
   117  
   118  	err = ts.DeleteShard(ctx, keyspace, shard)
   119  	return err
   120  }
   121  
   122  // deleteShardCell is the per-cell helper function for deleteShard, and is
   123  // distinct from the RemoveShardCell rpc. Despite having similar names, they are
   124  // **not** the same!
   125  func deleteShardCell(ctx context.Context, ts *topo.Server, keyspace string, shard string, cell string, recursive bool) error {
   126  	span, ctx := trace.NewSpan(ctx, "VtctldServer.deleteShardCell")
   127  	defer span.Finish()
   128  
   129  	span.Annotate("keyspace", keyspace)
   130  	span.Annotate("shard", shard)
   131  	span.Annotate("cell", cell)
   132  	span.Annotate("recursive", recursive)
   133  
   134  	var aliases []*topodatapb.TabletAlias
   135  
   136  	// Get the ShardReplication object for the cell. Collect all the tablets
   137  	// that belong to the shard.
   138  	sri, err := ts.GetShardReplication(ctx, cell, keyspace, shard)
   139  	switch {
   140  	case topo.IsErrType(err, topo.NoNode):
   141  		// No ShardReplication object means that the topo is inconsistent.
   142  		// Therefore we read all the tablets for that cell, and if we find any
   143  		// in our shard, we'll either abort or try to delete them, depending on
   144  		// whether recursive=true.
   145  		aliases, err = ts.GetTabletAliasesByCell(ctx, cell)
   146  		if err != nil {
   147  			return fmt.Errorf("GetTabletAliasesByCell(%v) failed: %w", cell, err)
   148  		}
   149  	case err == nil:
   150  		// If a ShardReplication object exists, we trust it to have all the
   151  		// tablet records for the shard in that cell.
   152  		aliases = make([]*topodatapb.TabletAlias, len(sri.Nodes))
   153  
   154  		for i, node := range sri.Nodes {
   155  			aliases[i] = node.TabletAlias
   156  		}
   157  	default:
   158  		return fmt.Errorf("GetShardReplication(%v, %v, %v) failed: %w", cell, keyspace, shard, err)
   159  	}
   160  
   161  	// Get all the tablet records for the aliases we've collected. Note that
   162  	// GetTabletMap ignores ErrNoNode, which is convenient for our purpose; it
   163  	// means a tablet was deleted but is still referenced.
   164  	tabletMap, err := ts.GetTabletMap(ctx, aliases)
   165  	if err != nil {
   166  		return fmt.Errorf("GetTabletMap() failed: %w", err)
   167  	}
   168  
   169  	// In the case where no ShardReplication object exists, we collect the
   170  	// aliases of every tablet in the cell, so we'll need to filter
   171  	// out anything not in our shard.
   172  	for alias, ti := range tabletMap {
   173  		if !(ti.Keyspace == keyspace && ti.Shard == shard) {
   174  			delete(tabletMap, alias)
   175  		}
   176  	}
   177  
   178  	// If there are any tablets in the shard in the cell, delete them.
   179  	if len(tabletMap) > 0 {
   180  		if !recursive {
   181  			return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "Shard %v/%v still hase %v tablets in cell %v; use Recursive = true or remove them manually", keyspace, shard, len(tabletMap), cell)
   182  		}
   183  
   184  		log.Infof("Deleting all %d tablets in shard %v/%v cell %v", len(tabletMap), keyspace, shard, cell)
   185  		for alias, tablet := range tabletMap {
   186  			// We don't care about updating the ShardReplication object, because
   187  			// later we're going to delete the entire object.
   188  			log.Infof("Deleting tablet %v", alias)
   189  			if err := ts.DeleteTablet(ctx, tablet.Alias); err != nil && !topo.IsErrType(err, topo.NoNode) {
   190  				// We don't want to continue if a DeleteTablet fails for any
   191  				// reason other than a missing tablet (in which case it's just
   192  				// topo server inconsistency, which we can ignore). If we were
   193  				// to continue and delete the replication graph, the tablet
   194  				// record would become orphaned, since we'd no longer know that
   195  				// it belongs to this shard.
   196  				//
   197  				// If the problem is temporary, or resolved externally,
   198  				// re-running DeleteShard will skip over tablets that were
   199  				// already deleted.
   200  				return fmt.Errorf("cannot delete tablet %v: %w", alias, err)
   201  			}
   202  		}
   203  	}
   204  
   205  	return nil
   206  }
   207  
   208  func deleteTablet(ctx context.Context, ts *topo.Server, alias *topodatapb.TabletAlias, allowPrimary bool) (err error) {
   209  	span, ctx := trace.NewSpan(ctx, "VtctldServer.deleteTablet")
   210  	defer span.Finish()
   211  
   212  	span.Annotate("tablet_alias", topoproto.TabletAliasString(alias))
   213  	span.Annotate("allow_primary", allowPrimary)
   214  
   215  	tablet, err := ts.GetTablet(ctx, alias)
   216  	if err != nil {
   217  		return err
   218  	}
   219  
   220  	isPrimary, err := topotools.IsPrimaryTablet(ctx, ts, tablet)
   221  	if err != nil {
   222  		return err
   223  	}
   224  
   225  	span.Annotate("is_primary", isPrimary)
   226  
   227  	if isPrimary && !allowPrimary {
   228  		return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "cannot delete tablet %v as it is a primary, pass AllowPrimary = true", topoproto.TabletAliasString(alias))
   229  	}
   230  
   231  	// Update the Shard object if the primary was scrapped. We do this before
   232  	// calling DeleteTablet so that the operation can be retried in case of
   233  	// failure.
   234  	if isPrimary {
   235  		lockCtx, unlock, lockErr := ts.LockShard(ctx, tablet.Keyspace, tablet.Shard, fmt.Sprintf("DeleteTablet(%v)", topoproto.TabletAliasString(alias)))
   236  		if lockErr != nil {
   237  			return lockErr
   238  		}
   239  
   240  		defer unlock(&err)
   241  
   242  		if _, err := ts.UpdateShardFields(lockCtx, tablet.Keyspace, tablet.Shard, func(si *topo.ShardInfo) error {
   243  			if !topoproto.TabletAliasEqual(si.PrimaryAlias, alias) {
   244  				log.Warningf(
   245  					"Deleting primary %v from shard %v/%v but primary in Shard object was %v",
   246  					topoproto.TabletAliasString(alias), tablet.Keyspace, tablet.Shard, topoproto.TabletAliasString(si.PrimaryAlias),
   247  				)
   248  
   249  				return topo.NewError(topo.NoUpdateNeeded, si.Keyspace()+"/"+si.ShardName())
   250  			}
   251  			si.PrimaryAlias = nil
   252  			si.SetPrimaryTermStartTime(time.Now())
   253  			return nil
   254  		}); err != nil {
   255  			return err
   256  		}
   257  	}
   258  
   259  	// Remove the tablet record and its replication graph entry.
   260  	if err := topotools.DeleteTablet(ctx, ts, tablet.Tablet); err != nil {
   261  		return err
   262  	}
   263  
   264  	// Return any error from unlocking the keyspace.
   265  	return err
   266  }
   267  
   268  func removeShardCell(ctx context.Context, ts *topo.Server, cell string, keyspace string, shardName string, recursive bool, force bool) error {
   269  	span, ctx := trace.NewSpan(ctx, "VtctldServer.removeShardCell")
   270  	defer span.Finish()
   271  
   272  	span.Annotate("keyspace", keyspace)
   273  	span.Annotate("shard", shardName)
   274  	span.Annotate("cell", cell)
   275  	span.Annotate("recursive", recursive)
   276  	span.Annotate("force", force)
   277  
   278  	shard, err := ts.GetShard(ctx, keyspace, shardName)
   279  	if err != nil {
   280  		return err
   281  	}
   282  
   283  	servingCells, err := ts.GetShardServingCells(ctx, shard)
   284  	if err != nil {
   285  		return err
   286  	}
   287  
   288  	if !topo.InCellList(cell, servingCells) {
   289  		return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "shard %v/%v does not have serving cell %v", keyspace, shardName, cell)
   290  	}
   291  
   292  	if shard.PrimaryAlias != nil && shard.PrimaryAlias.Cell == cell {
   293  		return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "cannot remove cell %v; shard primary %v is in cell", cell, topoproto.TabletAliasString(shard.PrimaryAlias))
   294  	}
   295  
   296  	replication, err := ts.GetShardReplication(ctx, cell, keyspace, shardName)
   297  	switch {
   298  	case err == nil:
   299  		// We have tablets in the shard in this cell.
   300  		if recursive {
   301  			log.Infof("Deleting all tablets in cell %v in shard %v/%v", cell, keyspace, shardName)
   302  			for _, node := range replication.Nodes {
   303  				// We don't care about scrapping or updating the replication
   304  				// graph, because we're about to delete the entire replication
   305  				// graph.
   306  				log.Infof("Deleting tablet %v", topoproto.TabletAliasString(node.TabletAlias))
   307  				if err := ts.DeleteTablet(ctx, node.TabletAlias); err != nil && !topo.IsErrType(err, topo.NoNode) {
   308  					return fmt.Errorf("cannot delete tablet %v: %w", topoproto.TabletAliasString(node.TabletAlias), err)
   309  				}
   310  			}
   311  		} else if len(replication.Nodes) > 0 {
   312  			return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "cell %v has %v possible tablets in replication graph", cell, len(replication.Nodes))
   313  		}
   314  
   315  		// Remove the empty replication graph.
   316  		if err := ts.DeleteShardReplication(ctx, cell, keyspace, shardName); err != nil && !topo.IsErrType(err, topo.NoNode) {
   317  			return fmt.Errorf("error deleting ShardReplication object in cell %v: %w", cell, err)
   318  		}
   319  	case topo.IsErrType(err, topo.NoNode):
   320  		// No ShardReplication object. This is the expected path when there are
   321  		// no tablets in the shard in that cell.
   322  		err = nil
   323  	default:
   324  		// If we can't get the replication object out of the local topo, we
   325  		// assume the topo server is down in that cell, so we'll only continue
   326  		// if Force was specified.
   327  		if !force {
   328  			return err
   329  		}
   330  
   331  		log.Warningf("Cannot get ShardReplication from cell %v; assuming cell topo server is down and forcing removal", cell)
   332  	}
   333  
   334  	// Finally, update the shard.
   335  
   336  	log.Infof("Removing cell %v from SrvKeyspace %v/%v", cell, keyspace, shardName)
   337  
   338  	ctx, unlock, lockErr := ts.LockKeyspace(ctx, keyspace, "Locking keyspace to remove shard from SrvKeyspace")
   339  	if lockErr != nil {
   340  		return lockErr
   341  	}
   342  
   343  	defer unlock(&err)
   344  
   345  	if err := ts.DeleteSrvKeyspacePartitions(ctx, keyspace, []*topo.ShardInfo{shard}, topodatapb.TabletType_RDONLY, []string{cell}); err != nil {
   346  		return err
   347  	}
   348  
   349  	if err := ts.DeleteSrvKeyspacePartitions(ctx, keyspace, []*topo.ShardInfo{shard}, topodatapb.TabletType_REPLICA, []string{cell}); err != nil {
   350  		return err
   351  	}
   352  
   353  	if err := ts.DeleteSrvKeyspacePartitions(ctx, keyspace, []*topo.ShardInfo{shard}, topodatapb.TabletType_PRIMARY, []string{cell}); err != nil {
   354  		return err
   355  	}
   356  
   357  	return err
   358  }