vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletmanager/vdiff/controller.go (about)

     1  /*
     2  Copyright 2022 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package vdiff
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"strings"
    24  	"time"
    25  
    26  	"vitess.io/vitess/go/vt/proto/tabletmanagerdata"
    27  	"vitess.io/vitess/go/vt/vterrors"
    28  
    29  	"google.golang.org/protobuf/encoding/prototext"
    30  
    31  	"vitess.io/vitess/go/mysql"
    32  	"vitess.io/vitess/go/sqltypes"
    33  	"vitess.io/vitess/go/vt/binlog/binlogplayer"
    34  	"vitess.io/vitess/go/vt/log"
    35  	binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
    36  	vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
    37  	"vitess.io/vitess/go/vt/topo"
    38  	"vitess.io/vitess/go/vt/vttablet/tmclient"
    39  )
    40  
    41  /*
    42  vdiff operation states: pending/started/stopped/completed/error/unknown
    43  vdiff table states: pending/started/stopped/completed/error/unknown
    44  */
    45  type VDiffState string //nolint
    46  const (
    47  	PendingState    VDiffState = "pending"
    48  	StartedState    VDiffState = "started"
    49  	StoppedState    VDiffState = "stopped"
    50  	CompletedState  VDiffState = "completed"
    51  	ErrorState      VDiffState = "error"
    52  	UnknownState    VDiffState = ""
    53  	TimestampFormat            = "2006-01-02 15:04:05"
    54  )
    55  
// controller manages a single vdiff on this tablet. newController fills in
// the identity and dependency fields and launches run in a goroutine; start
// fills in the stream-derived fields (sources, workflowFilter,
// sourceKeyspace, filter, time zones, externalCluster).
type controller struct {
	id              int64 // id from row in _vt.vdiff
	// uuid and workflow are copied from the "vdiff_uuid" and "workflow"
	// columns of the row given to newController.
	uuid            string
	workflow        string
	// cancel cancels the context that run was started with; Stop calls it.
	cancel          context.CancelFunc
	// dbClientFactory and ts are stored as passed to newController.
	dbClientFactory func() binlogplayer.DBClient
	ts              *topo.Server
	vde             *Engine // the singleton vdiff engine
	// done is closed when run returns; Stop blocks on it.
	done            chan struct{}

	sources        map[string]*migrationSource // currently picked source tablets for this shard's data
	// workflowFilter is the SQL "where ..." clause built in start from the
	// workflow name and the engine's database name.
	workflowFilter string
	// sourceKeyspace is taken from the first vreplication stream's binlog source.
	sourceKeyspace string
	// tmc is obtained from the engine's tmClientFactory in newController.
	tmc            tmclient.TabletManagerClient

	targetShardStreamer *shardStreamer
	filter              *binlogdatapb.Filter            // vreplication row filter
	options             *tabletmanagerdata.VDiffOptions // options initially from vtctld command and later from _vt.vdiff

	sourceTimeZone, targetTimeZone string // named time zones if conversions are necessary for datetime values

	externalCluster string // for Mount+Migrate
}
    79  
    80  func newController(ctx context.Context, row sqltypes.RowNamedValues, dbClientFactory func() binlogplayer.DBClient,
    81  	ts *topo.Server, vde *Engine, options *tabletmanagerdata.VDiffOptions) (*controller, error) {
    82  
    83  	log.Infof("VDiff controller initializing for %+v", row)
    84  	id, _ := row["id"].ToInt64()
    85  
    86  	ct := &controller{
    87  		id:              id,
    88  		uuid:            row["vdiff_uuid"].ToString(),
    89  		workflow:        row["workflow"].ToString(),
    90  		dbClientFactory: dbClientFactory,
    91  		ts:              ts,
    92  		vde:             vde,
    93  		done:            make(chan struct{}),
    94  		tmc:             vde.tmClientFactory(),
    95  		sources:         make(map[string]*migrationSource),
    96  		options:         options,
    97  	}
    98  	ctx, ct.cancel = context.WithCancel(ctx)
    99  	go ct.run(ctx)
   100  
   101  	return ct, nil
   102  }
   103  
// Stop cancels the context that run was started with and then blocks until
// run has returned, which run signals by closing the done channel.
func (ct *controller) Stop() {
	ct.cancel()
	<-ct.done
}
   108  
   109  func (ct *controller) run(ctx context.Context) {
   110  	defer func() {
   111  		log.Infof("Run finished for vdiff %s", ct.uuid)
   112  		close(ct.done)
   113  	}()
   114  
   115  	dbClient := ct.vde.dbClientFactoryFiltered()
   116  	if err := dbClient.Connect(); err != nil {
   117  		log.Errorf("Encountered an error connecting to database for vdiff %s: %v", ct.uuid, err)
   118  		return
   119  	}
   120  	defer dbClient.Close()
   121  
   122  	qr, err := ct.vde.getVDiffByID(ctx, dbClient, ct.id)
   123  	if err != nil {
   124  		log.Errorf("Encountered an error getting vdiff record for %s: %v", ct.uuid, err)
   125  		return
   126  	}
   127  
   128  	row := qr.Named().Row()
   129  	state := VDiffState(strings.ToLower(row["state"].ToString()))
   130  	switch state {
   131  	case PendingState, StartedState:
   132  		action := "Starting"
   133  		if state == StartedState {
   134  			action = "Restarting"
   135  		}
   136  		log.Infof("%s vdiff %s", action, ct.uuid)
   137  		if err := ct.start(ctx, dbClient); err != nil {
   138  			log.Errorf("Encountered an error for vdiff %s: %s", ct.uuid, err)
   139  			if err := ct.saveErrorState(ctx, err); err != nil {
   140  				log.Errorf("Unable to save error state for vdiff %s; giving up because %s", ct.uuid, err.Error())
   141  			}
   142  		}
   143  	default:
   144  		log.Infof("VDiff %s was not marked as runnable (state: %s), doing nothing", ct.uuid, state)
   145  	}
   146  }
   147  
// migrationSource is one source shard of the workflow being diffed: an
// embedded shardStreamer plus details of the shard's vreplication stream.
type migrationSource struct {
	*shardStreamer

	// vrID is the "id" column of the vreplication entry row this source was
	// built from (set in controller.start).
	vrID     int64
	// position is not assigned anywhere in this file.
	// NOTE(review): presumably the replication position for this source -- confirm where it is set.
	position mysql.Position
}
   154  
   155  func (ct *controller) updateState(dbClient binlogplayer.DBClient, state VDiffState, err error) error {
   156  	extraCols := ""
   157  	switch state {
   158  	case StartedState:
   159  		extraCols = ", started_at = utc_timestamp()"
   160  	case CompletedState:
   161  		extraCols = ", completed_at = utc_timestamp()"
   162  	default:
   163  	}
   164  	if err == nil {
   165  		// Clear out any previous error for the vdiff on this shard
   166  		err = errors.New("")
   167  	}
   168  	query := fmt.Sprintf(sqlUpdateVDiffState, encodeString(string(state)), encodeString(err.Error()), extraCols, ct.id)
   169  	if _, err := dbClient.ExecuteFetch(query, 1); err != nil {
   170  		return err
   171  	}
   172  	insertVDiffLog(ct.vde.ctx, dbClient, ct.id, fmt.Sprintf("State changed to: %s", state))
   173  	return nil
   174  }
   175  
   176  func (ct *controller) start(ctx context.Context, dbClient binlogplayer.DBClient) error {
   177  	select {
   178  	case <-ctx.Done():
   179  		return vterrors.Errorf(vtrpcpb.Code_CANCELED, "context has expired")
   180  	default:
   181  	}
   182  	ct.workflowFilter = fmt.Sprintf("where workflow = %s and db_name = %s", encodeString(ct.workflow), encodeString(ct.vde.dbName))
   183  	query := fmt.Sprintf(sqlGetVReplicationEntry, ct.workflowFilter)
   184  	qr, err := dbClient.ExecuteFetch(query, -1)
   185  	if err != nil {
   186  		return err
   187  	}
   188  	log.Infof("Found %d vreplication streams for %s", len(qr.Rows), ct.workflow)
   189  	for i, row := range qr.Named().Rows {
   190  		select {
   191  		case <-ctx.Done():
   192  			return vterrors.Errorf(vtrpcpb.Code_CANCELED, "context has expired")
   193  		default:
   194  		}
   195  		source := newMigrationSource()
   196  		sourceBytes, err := row["source"].ToBytes()
   197  		if err != nil {
   198  			return err
   199  		}
   200  		var bls binlogdatapb.BinlogSource
   201  		if err := prototext.Unmarshal(sourceBytes, &bls); err != nil {
   202  			log.Errorf("Encountered an error unmarshalling vdiff binlog source for %s: %v", ct.uuid, err)
   203  			return err
   204  		}
   205  		source.shard = bls.Shard
   206  		source.vrID, _ = row["id"].ToInt64()
   207  		ct.sourceTimeZone = bls.SourceTimeZone
   208  		ct.targetTimeZone = bls.TargetTimeZone
   209  
   210  		if bls.ExternalCluster != "" {
   211  			ct.externalCluster = bls.ExternalCluster
   212  		}
   213  
   214  		ct.sources[source.shard] = source
   215  		if i == 0 {
   216  			ct.sourceKeyspace = bls.Keyspace
   217  			ct.filter = bls.Filter
   218  		}
   219  	}
   220  
   221  	if err := ct.validate(); err != nil {
   222  		return err
   223  	}
   224  
   225  	wd, err := newWorkflowDiffer(ct, ct.options)
   226  	if err != nil {
   227  		return err
   228  	}
   229  	if err := ct.updateState(dbClient, StartedState, nil); err != nil {
   230  		return err
   231  	}
   232  	if err := wd.diff(ctx); err != nil {
   233  		log.Errorf("Encountered an error performing workflow diff for vdiff %s: %v", ct.uuid, err)
   234  		return err
   235  	}
   236  
   237  	return nil
   238  }
   239  
   240  // markStoppedByRequest records the fact that this VDiff was stopped via user
   241  // request and resets the error generated by cancelling the context to stop it:
   242  //
   243  //	"vttablet: rpc error: code = Canceled desc = context canceled"
   244  //
   245  // This differentiates non-user requested stops that would occur e.g. during
   246  // PlannedReparentShard or tablet restart, in those cases the error will be saved
   247  // and will cause the VDiff to be retried ASAP -- which is NOT what we want here.
   248  func (ct *controller) markStoppedByRequest() error {
   249  	dbClient := ct.vde.dbClientFactoryFiltered()
   250  	if err := dbClient.Connect(); err != nil {
   251  		return fmt.Errorf("encountered an error marking vdiff %s as stopped: %v", ct.uuid, err)
   252  	}
   253  	defer dbClient.Close()
   254  
   255  	query := fmt.Sprintf(sqlUpdateVDiffStopped, ct.id)
   256  	var res *sqltypes.Result
   257  	var err error
   258  	if res, err = dbClient.ExecuteFetch(query, 1); err != nil {
   259  		return fmt.Errorf("encountered an error marking vdiff %s as stopped: %v", ct.uuid, err)
   260  	}
   261  	// We don't mark it as stopped if it's already completed
   262  	if res.RowsAffected > 0 {
   263  		insertVDiffLog(ct.vde.ctx, dbClient, ct.id, fmt.Sprintf("State changed to: %s (by user request)", StoppedState))
   264  	}
   265  
   266  	return nil
   267  }
   268  
   269  func newMigrationSource() *migrationSource {
   270  	return &migrationSource{shardStreamer: &shardStreamer{}}
   271  }
   272  
// validate is called by start after the vreplication streams have been
// loaded and before the diff begins. It is currently a placeholder that
// performs no checks and always returns nil.
func (ct *controller) validate() error {
	// TODO: check if vreplication workflow has errors, what else?
	return nil
}
   277  
   278  // saveErrorState saves the error state for the vdiff in the database.
   279  // It never gives up trying to save the error state, unless the context
   280  // has been cancelled or the done channel has been closed -- indicating
   281  // that the engine is closing or the vdiff has been explicitly stopped.
   282  // Note that when the engine is later opened the started vdiff will be
   283  // restarted even though we were unable to save the error state.
   284  // It uses exponential backoff with a factor of 1.5 to avoid creating
   285  // too many database connections.
   286  func (ct *controller) saveErrorState(ctx context.Context, saveErr error) error {
   287  	retryDelay := 100 * time.Millisecond
   288  	maxRetryDelay := 60 * time.Second
   289  	save := func() error {
   290  		dbClient := ct.vde.dbClientFactoryFiltered()
   291  		if err := dbClient.Connect(); err != nil {
   292  			return err
   293  		}
   294  		defer dbClient.Close()
   295  
   296  		if err := ct.updateState(dbClient, ErrorState, saveErr); err != nil {
   297  			return err
   298  		}
   299  		insertVDiffLog(ctx, dbClient, ct.id, fmt.Sprintf("Error: %s", saveErr))
   300  
   301  		return nil
   302  	}
   303  
   304  	for {
   305  		if err := save(); err != nil {
   306  			log.Warningf("Failed to persist vdiff error state: %v. Will retry in %s", err, retryDelay.String())
   307  			select {
   308  			case <-ctx.Done():
   309  				return fmt.Errorf("engine is shutting down")
   310  			case <-ct.done:
   311  				return fmt.Errorf("vdiff was stopped")
   312  			case <-time.After(retryDelay):
   313  				if retryDelay < maxRetryDelay {
   314  					retryDelay = time.Duration(float64(retryDelay) * 1.5)
   315  					if retryDelay > maxRetryDelay {
   316  						retryDelay = maxRetryDelay
   317  					}
   318  				}
   319  				continue
   320  			}
   321  		}
   322  
   323  		// Success
   324  		return nil
   325  	}
   326  }