vitess.io/vitess@v0.16.2/go/vt/vtctl/grpcvtctldserver/topo.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package grpcvtctldserver 18 19 import ( 20 "context" 21 "fmt" 22 "time" 23 24 "vitess.io/vitess/go/trace" 25 "vitess.io/vitess/go/vt/log" 26 "vitess.io/vitess/go/vt/topo" 27 "vitess.io/vitess/go/vt/topo/topoproto" 28 "vitess.io/vitess/go/vt/topotools" 29 "vitess.io/vitess/go/vt/vterrors" 30 31 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 32 "vitess.io/vitess/go/vt/proto/vtrpc" 33 ) 34 35 func deleteShard(ctx context.Context, ts *topo.Server, keyspace string, shard string, recursive bool, evenIfServing bool, force bool) (err error) { 36 span, ctx := trace.NewSpan(ctx, "VtctldServer.deleteShard") 37 defer span.Finish() 38 39 span.Annotate("keyspace", keyspace) 40 span.Annotate("shard", shard) 41 span.Annotate("recursive", recursive) 42 span.Annotate("even_if_serving", evenIfServing) 43 span.Annotate("force", force) 44 45 lctx, unlock, lerr := ts.LockShard(ctx, keyspace, shard, "DeleteShard") 46 switch { 47 case lerr == nil: 48 // We locked the shard, all good 49 ctx = lctx 50 case !force: 51 return fmt.Errorf("failed to lock %s/%s; if you really want to delete this shard, re-run with Force=true: %w", keyspace, shard, lerr) 52 default: 53 // Failed to lock, but force=true. Warn and continue 54 log.Warningf("%s: failed to lock shard %s/%s for deletion, but force=true, proceeding anyway ...", lerr, keyspace, shard) 55 } 56 57 if unlock != nil { 58 defer func() { 59 // Attempting to unlock a shard we successfully deleted results in 60 // ts.unlockShard returning an error, which can make the overall 61 // RPC _seem_ like it failed. 62 // 63 // So, we do this extra checking to allow for specifically this 64 // scenario to result in "success." 65 origErr := err 66 unlock(&err) 67 if origErr == nil && topo.IsErrType(err, topo.NoNode) { 68 err = nil 69 } 70 }() 71 } 72 73 // Read the Shard object. If it's not in the topo, try to clean up the topo 74 // anyway. 75 shardInfo, err := ts.GetShard(ctx, keyspace, shard) 76 if err != nil { 77 if topo.IsErrType(err, topo.NoNode) { 78 log.Infof("Shard %v/%v doesn't seem to exist; cleaning up any potential leftover topo data", keyspace, shard) 79 80 _ = ts.DeleteShard(ctx, keyspace, shard) 81 return nil 82 } 83 84 return err 85 } 86 87 servingCells, err := ts.GetShardServingCells(ctx, shardInfo) 88 if err != nil { 89 return err 90 } 91 92 // We never want to remove a potentially serving shard unless someone 93 // explicitly requested it. 94 if len(servingCells) > 0 && !evenIfServing { 95 return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "shard %v/%v is still serving; cannot delete it; use EvenIfServing = true to delete anyway", keyspace, shard) 96 } 97 98 cells, err := ts.GetCellInfoNames(ctx) 99 if err != nil { 100 return err 101 } 102 103 for _, cell := range cells { 104 err = deleteShardCell(ctx, ts, keyspace, shard, cell, recursive) 105 if err != nil { 106 return err 107 } 108 } 109 110 // Try to remove the replication and serving graphs from each cell, 111 // regardless of whether they exist. 112 for _, cell := range cells { 113 if err := ts.DeleteShardReplication(ctx, cell, keyspace, shard); err != nil && !topo.IsErrType(err, topo.NoNode) { 114 log.Warningf("Cannot delete ShardReplication in cell %v for %v/%v: %w", cell, keyspace, shard, err) 115 } 116 } 117 118 err = ts.DeleteShard(ctx, keyspace, shard) 119 return err 120 } 121 122 // deleteShardCell is the per-cell helper function for deleteShard, and is 123 // distinct from the RemoveShardCell rpc. Despite having similar names, they are 124 // **not** the same! 125 func deleteShardCell(ctx context.Context, ts *topo.Server, keyspace string, shard string, cell string, recursive bool) error { 126 span, ctx := trace.NewSpan(ctx, "VtctldServer.deleteShardCell") 127 defer span.Finish() 128 129 span.Annotate("keyspace", keyspace) 130 span.Annotate("shard", shard) 131 span.Annotate("cell", cell) 132 span.Annotate("recursive", recursive) 133 134 var aliases []*topodatapb.TabletAlias 135 136 // Get the ShardReplication object for the cell. Collect all the tablets 137 // that belong to the shard. 138 sri, err := ts.GetShardReplication(ctx, cell, keyspace, shard) 139 switch { 140 case topo.IsErrType(err, topo.NoNode): 141 // No ShardReplication object means that the topo is inconsistent. 142 // Therefore we read all the tablets for that cell, and if we find any 143 // in our shard, we'll either abort or try to delete them, depending on 144 // whether recursive=true. 145 aliases, err = ts.GetTabletAliasesByCell(ctx, cell) 146 if err != nil { 147 return fmt.Errorf("GetTabletAliasesByCell(%v) failed: %w", cell, err) 148 } 149 case err == nil: 150 // If a ShardReplication object exists, we trust it to have all the 151 // tablet records for the shard in that cell. 152 aliases = make([]*topodatapb.TabletAlias, len(sri.Nodes)) 153 154 for i, node := range sri.Nodes { 155 aliases[i] = node.TabletAlias 156 } 157 default: 158 return fmt.Errorf("GetShardReplication(%v, %v, %v) failed: %w", cell, keyspace, shard, err) 159 } 160 161 // Get all the tablet records for the aliases we've collected. Note that 162 // GetTabletMap ignores ErrNoNode, which is convenient for our purpose; it 163 // means a tablet was deleted but is still referenced. 164 tabletMap, err := ts.GetTabletMap(ctx, aliases) 165 if err != nil { 166 return fmt.Errorf("GetTabletMap() failed: %w", err) 167 } 168 169 // In the case where no ShardReplication object exists, we collect the 170 // aliases of every tablet in the cell, so we'll need to filter 171 // out anything not in our shard. 172 for alias, ti := range tabletMap { 173 if !(ti.Keyspace == keyspace && ti.Shard == shard) { 174 delete(tabletMap, alias) 175 } 176 } 177 178 // If there are any tablets in the shard in the cell, delete them. 179 if len(tabletMap) > 0 { 180 if !recursive { 181 return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "Shard %v/%v still hase %v tablets in cell %v; use Recursive = true or remove them manually", keyspace, shard, len(tabletMap), cell) 182 } 183 184 log.Infof("Deleting all %d tablets in shard %v/%v cell %v", len(tabletMap), keyspace, shard, cell) 185 for alias, tablet := range tabletMap { 186 // We don't care about updating the ShardReplication object, because 187 // later we're going to delete the entire object. 188 log.Infof("Deleting tablet %v", alias) 189 if err := ts.DeleteTablet(ctx, tablet.Alias); err != nil && !topo.IsErrType(err, topo.NoNode) { 190 // We don't want to continue if a DeleteTablet fails for any 191 // reason other than a missing tablet (in which case it's just 192 // topo server inconsistency, which we can ignore). If we were 193 // to continue and delete the replication graph, the tablet 194 // record would become orphaned, since we'd no longer know that 195 // it belongs to this shard. 196 // 197 // If the problem is temporary, or resolved externally, 198 // re-running DeleteShard will skip over tablets that were 199 // already deleted. 200 return fmt.Errorf("cannot delete tablet %v: %w", alias, err) 201 } 202 } 203 } 204 205 return nil 206 } 207 208 func deleteTablet(ctx context.Context, ts *topo.Server, alias *topodatapb.TabletAlias, allowPrimary bool) (err error) { 209 span, ctx := trace.NewSpan(ctx, "VtctldServer.deleteTablet") 210 defer span.Finish() 211 212 span.Annotate("tablet_alias", topoproto.TabletAliasString(alias)) 213 span.Annotate("allow_primary", allowPrimary) 214 215 tablet, err := ts.GetTablet(ctx, alias) 216 if err != nil { 217 return err 218 } 219 220 isPrimary, err := topotools.IsPrimaryTablet(ctx, ts, tablet) 221 if err != nil { 222 return err 223 } 224 225 span.Annotate("is_primary", isPrimary) 226 227 if isPrimary && !allowPrimary { 228 return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "cannot delete tablet %v as it is a primary, pass AllowPrimary = true", topoproto.TabletAliasString(alias)) 229 } 230 231 // Update the Shard object if the primary was scrapped. We do this before 232 // calling DeleteTablet so that the operation can be retried in case of 233 // failure. 234 if isPrimary { 235 lockCtx, unlock, lockErr := ts.LockShard(ctx, tablet.Keyspace, tablet.Shard, fmt.Sprintf("DeleteTablet(%v)", topoproto.TabletAliasString(alias))) 236 if lockErr != nil { 237 return lockErr 238 } 239 240 defer unlock(&err) 241 242 if _, err := ts.UpdateShardFields(lockCtx, tablet.Keyspace, tablet.Shard, func(si *topo.ShardInfo) error { 243 if !topoproto.TabletAliasEqual(si.PrimaryAlias, alias) { 244 log.Warningf( 245 "Deleting primary %v from shard %v/%v but primary in Shard object was %v", 246 topoproto.TabletAliasString(alias), tablet.Keyspace, tablet.Shard, topoproto.TabletAliasString(si.PrimaryAlias), 247 ) 248 249 return topo.NewError(topo.NoUpdateNeeded, si.Keyspace()+"/"+si.ShardName()) 250 } 251 si.PrimaryAlias = nil 252 si.SetPrimaryTermStartTime(time.Now()) 253 return nil 254 }); err != nil { 255 return err 256 } 257 } 258 259 // Remove the tablet record and its replication graph entry. 260 if err := topotools.DeleteTablet(ctx, ts, tablet.Tablet); err != nil { 261 return err 262 } 263 264 // Return any error from unlocking the keyspace. 265 return err 266 } 267 268 func removeShardCell(ctx context.Context, ts *topo.Server, cell string, keyspace string, shardName string, recursive bool, force bool) error { 269 span, ctx := trace.NewSpan(ctx, "VtctldServer.removeShardCell") 270 defer span.Finish() 271 272 span.Annotate("keyspace", keyspace) 273 span.Annotate("shard", shardName) 274 span.Annotate("cell", cell) 275 span.Annotate("recursive", recursive) 276 span.Annotate("force", force) 277 278 shard, err := ts.GetShard(ctx, keyspace, shardName) 279 if err != nil { 280 return err 281 } 282 283 servingCells, err := ts.GetShardServingCells(ctx, shard) 284 if err != nil { 285 return err 286 } 287 288 if !topo.InCellList(cell, servingCells) { 289 return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "shard %v/%v does not have serving cell %v", keyspace, shardName, cell) 290 } 291 292 if shard.PrimaryAlias != nil && shard.PrimaryAlias.Cell == cell { 293 return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "cannot remove cell %v; shard primary %v is in cell", cell, topoproto.TabletAliasString(shard.PrimaryAlias)) 294 } 295 296 replication, err := ts.GetShardReplication(ctx, cell, keyspace, shardName) 297 switch { 298 case err == nil: 299 // We have tablets in the shard in this cell. 300 if recursive { 301 log.Infof("Deleting all tablets in cell %v in shard %v/%v", cell, keyspace, shardName) 302 for _, node := range replication.Nodes { 303 // We don't care about scrapping or updating the replication 304 // graph, because we're about to delete the entire replication 305 // graph. 306 log.Infof("Deleting tablet %v", topoproto.TabletAliasString(node.TabletAlias)) 307 if err := ts.DeleteTablet(ctx, node.TabletAlias); err != nil && !topo.IsErrType(err, topo.NoNode) { 308 return fmt.Errorf("cannot delete tablet %v: %w", topoproto.TabletAliasString(node.TabletAlias), err) 309 } 310 } 311 } else if len(replication.Nodes) > 0 { 312 return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "cell %v has %v possible tablets in replication graph", cell, len(replication.Nodes)) 313 } 314 315 // Remove the empty replication graph. 316 if err := ts.DeleteShardReplication(ctx, cell, keyspace, shardName); err != nil && !topo.IsErrType(err, topo.NoNode) { 317 return fmt.Errorf("error deleting ShardReplication object in cell %v: %w", cell, err) 318 } 319 case topo.IsErrType(err, topo.NoNode): 320 // No ShardReplication object. This is the expected path when there are 321 // no tablets in the shard in that cell. 322 err = nil 323 default: 324 // If we can't get the replication object out of the local topo, we 325 // assume the topo server is down in that cell, so we'll only continue 326 // if Force was specified. 327 if !force { 328 return err 329 } 330 331 log.Warningf("Cannot get ShardReplication from cell %v; assuming cell topo server is down and forcing removal", cell) 332 } 333 334 // Finally, update the shard. 335 336 log.Infof("Removing cell %v from SrvKeyspace %v/%v", cell, keyspace, shardName) 337 338 ctx, unlock, lockErr := ts.LockKeyspace(ctx, keyspace, "Locking keyspace to remove shard from SrvKeyspace") 339 if lockErr != nil { 340 return lockErr 341 } 342 343 defer unlock(&err) 344 345 if err := ts.DeleteSrvKeyspacePartitions(ctx, keyspace, []*topo.ShardInfo{shard}, topodatapb.TabletType_RDONLY, []string{cell}); err != nil { 346 return err 347 } 348 349 if err := ts.DeleteSrvKeyspacePartitions(ctx, keyspace, []*topo.ShardInfo{shard}, topodatapb.TabletType_REPLICA, []string{cell}); err != nil { 350 return err 351 } 352 353 if err := ts.DeleteSrvKeyspacePartitions(ctx, keyspace, []*topo.ShardInfo{shard}, topodatapb.TabletType_PRIMARY, []string{cell}); err != nil { 354 return err 355 } 356 357 return err 358 }