vitess.io/vitess@v0.16.2/go/vt/topo/replication.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package topo
    18  
    19  import (
    20  	"context"
    21  	"path"
    22  
    23  	"google.golang.org/protobuf/proto"
    24  
    25  	"vitess.io/vitess/go/trace"
    26  	"vitess.io/vitess/go/vt/log"
    27  	"vitess.io/vitess/go/vt/logutil"
    28  	"vitess.io/vitess/go/vt/topo/topoproto"
    29  	"vitess.io/vitess/go/vt/vterrors"
    30  
    31  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    32  )
    33  
    34  // ShardReplicationInfo is the companion structure for ShardReplication.
    35  type ShardReplicationInfo struct {
    36  	*topodatapb.ShardReplication
    37  	cell     string
    38  	keyspace string
    39  	shard    string
    40  }
    41  
    42  // NewShardReplicationInfo is for topo.Server implementations to
    43  // create the structure
    44  func NewShardReplicationInfo(sr *topodatapb.ShardReplication, cell, keyspace, shard string) *ShardReplicationInfo {
    45  	return &ShardReplicationInfo{
    46  		ShardReplication: sr,
    47  		cell:             cell,
    48  		keyspace:         keyspace,
    49  		shard:            shard,
    50  	}
    51  }
    52  
// Cell returns the cell name this ShardReplicationInfo was created with.
func (sri *ShardReplicationInfo) Cell() string {
	return sri.cell
}
    57  
// Keyspace returns the keyspace name this ShardReplicationInfo was created
// with.
func (sri *ShardReplicationInfo) Keyspace() string {
	return sri.keyspace
}
    62  
// Shard returns the shard name this ShardReplicationInfo was created with.
func (sri *ShardReplicationInfo) Shard() string {
	return sri.shard
}
    67  
    68  // GetShardReplicationNode finds a node for a given tablet.
    69  func (sri *ShardReplicationInfo) GetShardReplicationNode(tabletAlias *topodatapb.TabletAlias) (*topodatapb.ShardReplication_Node, error) {
    70  	for _, rl := range sri.Nodes {
    71  		if proto.Equal(rl.TabletAlias, tabletAlias) {
    72  			return rl, nil
    73  		}
    74  	}
    75  	return nil, NewError(NoNode, tabletAlias.String())
    76  }
    77  
    78  // UpdateShardReplicationRecord is a low level function to add / update an
    79  // entry to the ShardReplication object.
    80  func UpdateShardReplicationRecord(ctx context.Context, ts *Server, keyspace, shard string, tabletAlias *topodatapb.TabletAlias) error {
    81  	span, ctx := trace.NewSpan(ctx, "TopoServer.UpdateShardReplicationFields")
    82  	span.Annotate("keyspace", keyspace)
    83  	span.Annotate("shard", shard)
    84  	span.Annotate("tablet", topoproto.TabletAliasString(tabletAlias))
    85  	defer span.Finish()
    86  
    87  	return ts.UpdateShardReplicationFields(ctx, tabletAlias.Cell, keyspace, shard, func(sr *topodatapb.ShardReplication) error {
    88  		// Not very efficient, but easy to read, and allows us
    89  		// to remove duplicate entries if any.
    90  		nodes := make([]*topodatapb.ShardReplication_Node, 0, len((*sr).Nodes)+1)
    91  		found := false
    92  		modified := false
    93  		for _, node := range (*sr).Nodes {
    94  			if proto.Equal(node.TabletAlias, tabletAlias) {
    95  				if found {
    96  					log.Warningf("Found a second ShardReplication_Node for tablet %v, deleting it", tabletAlias)
    97  					modified = true
    98  					continue
    99  				}
   100  				found = true
   101  			}
   102  			nodes = append(nodes, node)
   103  		}
   104  		if !found {
   105  			nodes = append(nodes, &topodatapb.ShardReplication_Node{TabletAlias: tabletAlias})
   106  			modified = true
   107  		}
   108  		if !modified {
   109  			return NewError(NoUpdateNeeded, tabletAlias.String())
   110  		}
   111  		(*sr).Nodes = nodes
   112  		return nil
   113  	})
   114  }
   115  
   116  // RemoveShardReplicationRecord is a low level function to remove an
   117  // entry from the ShardReplication object.
   118  func RemoveShardReplicationRecord(ctx context.Context, ts *Server, cell, keyspace, shard string, tabletAlias *topodatapb.TabletAlias) error {
   119  	err := ts.UpdateShardReplicationFields(ctx, cell, keyspace, shard, func(sr *topodatapb.ShardReplication) error {
   120  		nodes := make([]*topodatapb.ShardReplication_Node, 0, len((*sr).Nodes))
   121  		for _, node := range (*sr).Nodes {
   122  			if !proto.Equal(node.TabletAlias, tabletAlias) {
   123  				nodes = append(nodes, node)
   124  			}
   125  		}
   126  		(*sr).Nodes = nodes
   127  		return nil
   128  	})
   129  	return err
   130  }
   131  
   132  // FixShardReplication will fix the first problem it encounters within
   133  // a ShardReplication object. It returns info about the error being fixed, if
   134  // an error was found.
   135  //
   136  // A return value of (nil, nil) indicates no issues in the replication graph.
   137  func FixShardReplication(ctx context.Context, ts *Server, logger logutil.Logger, cell, keyspace, shard string) (*topodatapb.ShardReplicationError, error) {
   138  	sri, err := ts.GetShardReplication(ctx, cell, keyspace, shard)
   139  	if err != nil {
   140  		return nil, err
   141  	}
   142  
   143  	for _, node := range sri.Nodes {
   144  		problem := &topodatapb.ShardReplicationError{
   145  			TabletAlias: node.TabletAlias,
   146  		}
   147  
   148  		ti, err := ts.GetTablet(ctx, node.TabletAlias)
   149  		if IsErrType(err, NoNode) {
   150  			problem.Type = topodatapb.ShardReplicationError_NOT_FOUND
   151  			logger.Warningf("Tablet %v is in the replication graph, but does not exist, removing it", node.TabletAlias)
   152  			return problem, RemoveShardReplicationRecord(ctx, ts, cell, keyspace, shard, node.TabletAlias)
   153  		}
   154  		if err != nil {
   155  			// unknown error, we probably don't want to continue
   156  			return nil, err
   157  		}
   158  
   159  		if ti.Keyspace != keyspace || ti.Shard != shard || ti.Alias.Cell != cell {
   160  			problem.Type = topodatapb.ShardReplicationError_TOPOLOGY_MISMATCH
   161  			logger.Warningf("Tablet '%v' is in the replication graph, but has wrong keyspace/shard/cell, removing it", ti.Tablet)
   162  			return problem, RemoveShardReplicationRecord(ctx, ts, cell, keyspace, shard, node.TabletAlias)
   163  		}
   164  
   165  		logger.Infof("Keeping tablet %v in the replication graph", node.TabletAlias)
   166  	}
   167  
   168  	logger.Infof("All entries in replication graph are valid")
   169  	return nil, nil
   170  }
   171  
   172  // UpdateShardReplicationFields updates the fields inside a topo.ShardReplication object.
   173  func (ts *Server) UpdateShardReplicationFields(ctx context.Context, cell, keyspace, shard string, update func(*topodatapb.ShardReplication) error) error {
   174  	nodePath := path.Join(KeyspacesPath, keyspace, ShardsPath, shard, ShardReplicationFile)
   175  
   176  	conn, err := ts.ConnForCell(ctx, cell)
   177  	if err != nil {
   178  		return err
   179  	}
   180  
   181  	for {
   182  		data, version, err := conn.Get(ctx, nodePath)
   183  		sr := &topodatapb.ShardReplication{}
   184  		switch {
   185  		case IsErrType(err, NoNode):
   186  			// Empty node, version is nil
   187  		case err == nil:
   188  			// Use any data we got.
   189  			if err = proto.Unmarshal(data, sr); err != nil {
   190  				return vterrors.Wrap(err, "bad ShardReplication data")
   191  			}
   192  		default:
   193  			return err
   194  		}
   195  
   196  		err = update(sr)
   197  		switch {
   198  		case IsErrType(err, NoUpdateNeeded):
   199  			return nil
   200  		case err == nil:
   201  			// keep going
   202  		default:
   203  			return err
   204  		}
   205  
   206  		// marshall and save
   207  		data, err = proto.Marshal(sr)
   208  		if err != nil {
   209  			return err
   210  		}
   211  		if version == nil {
   212  			// We have to create, and we catch NodeExists.
   213  			_, err = conn.Create(ctx, nodePath, data)
   214  			if IsErrType(err, NodeExists) {
   215  				// Node was created by another process, try
   216  				// again.
   217  				continue
   218  			}
   219  			return err
   220  		}
   221  
   222  		// We have to update, and we catch ErrBadVersion.
   223  		_, err = conn.Update(ctx, nodePath, data, version)
   224  		if IsErrType(err, BadVersion) {
   225  			// Node was updated by another process, try again.
   226  			continue
   227  		}
   228  		return err
   229  	}
   230  }
   231  
   232  // GetShardReplication returns the ShardReplicationInfo object.
   233  func (ts *Server) GetShardReplication(ctx context.Context, cell, keyspace, shard string) (*ShardReplicationInfo, error) {
   234  	conn, err := ts.ConnForCell(ctx, cell)
   235  	if err != nil {
   236  		return nil, err
   237  	}
   238  
   239  	nodePath := path.Join(KeyspacesPath, keyspace, ShardsPath, shard, ShardReplicationFile)
   240  	data, _, err := conn.Get(ctx, nodePath)
   241  	if err != nil {
   242  		return nil, err
   243  	}
   244  
   245  	sr := &topodatapb.ShardReplication{}
   246  	if err = proto.Unmarshal(data, sr); err != nil {
   247  		return nil, vterrors.Wrap(err, "bad ShardReplication data")
   248  	}
   249  
   250  	return NewShardReplicationInfo(sr, cell, keyspace, shard), nil
   251  }
   252  
   253  // DeleteShardReplication deletes a ShardReplication object.
   254  func (ts *Server) DeleteShardReplication(ctx context.Context, cell, keyspace, shard string) error {
   255  	conn, err := ts.ConnForCell(ctx, cell)
   256  	if err != nil {
   257  		return err
   258  	}
   259  
   260  	nodePath := path.Join(KeyspacesPath, keyspace, ShardsPath, shard, ShardReplicationFile)
   261  	return conn.Delete(ctx, nodePath, nil)
   262  }
   263  
   264  // DeleteKeyspaceReplication deletes all the ShardReplication objects for a cell/keyspace.
   265  func (ts *Server) DeleteKeyspaceReplication(ctx context.Context, cell, keyspace string) error {
   266  	conn, err := ts.ConnForCell(ctx, cell)
   267  	if err != nil {
   268  		return err
   269  	}
   270  
   271  	nodePath := path.Join(KeyspacesPath, keyspace)
   272  	return conn.Delete(ctx, nodePath, nil)
   273  }