vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletmanager/vreplication/controller.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package vreplication
    18  
    19  import (
    20  	"fmt"
    21  	"strconv"
    22  	"strings"
    23  	"time"
    24  
    25  	"google.golang.org/protobuf/encoding/prototext"
    26  
    27  	"vitess.io/vitess/go/vt/discovery"
    28  	"vitess.io/vitess/go/vt/vterrors"
    29  
    30  	"context"
    31  
    32  	"vitess.io/vitess/go/sync2"
    33  	"vitess.io/vitess/go/tb"
    34  	"vitess.io/vitess/go/vt/binlog/binlogplayer"
    35  	"vitess.io/vitess/go/vt/log"
    36  	"vitess.io/vitess/go/vt/mysqlctl"
    37  	"vitess.io/vitess/go/vt/topo"
    38  
    39  	binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
    40  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    41  )
    42  
    43  const (
    44  	// How many times to retry tablet selection before we
    45  	// give up and return an error message that the user
    46  	// can see and act upon if needed.
    47  	tabletPickerRetries = 5
    48  )
    49  
// controller is created by Engine. Members are initialized upfront.
// There is no mutex within a controller because its members are
// either read-only or self-synchronized.
type controller struct {
	// vre is the Engine handed to newController. runBlp uses it to look up
	// external MySQL clients (vre.ec) and passes it on to newVReplicator.
	vre             *Engine
	// dbClientFactory is invoked at the start of every runBlp attempt to
	// obtain a fresh DB client.
	dbClientFactory func() binlogplayer.DBClient
	// mysqld is used to enable binlog playback and to resolve table names.
	mysqld          mysqlctl.MysqlDaemon
	// blpStats collects state, error counts and message history for the
	// stream. newController allocates one if the caller passes nil.
	blpStats        *binlogplayer.Stats

	// id is the stream id parsed from params["id"].
	id           uint32
	// workflow is taken verbatim from params["workflow"].
	workflow     string
	// source is unmarshaled from params["source"]; it stays empty for
	// streams that are created in the stopped or error state.
	source       *binlogdatapb.BinlogSource
	// stopPos is taken verbatim from params["stop_pos"].
	stopPos      string
	// tabletPicker is only set when the source is not an external MySQL.
	tabletPicker *discovery.TabletPicker

	// cancel cancels the context that the run goroutine operates on. It is
	// a no-op for streams that were never started.
	cancel context.CancelFunc
	// done is closed when the run goroutine exits, or immediately by
	// newController for streams that were never started.
	done   chan struct{}

	// The following fields are updated after start. So, they need synchronization.
	// sourceTablet holds the alias of the tablet picked by runBlp and is
	// reset to "" whenever runBlp returns.
	sourceTablet sync2.AtomicString

	// lastWorkflowError records the outcome of each Replicate call so that
	// runBlp can decide whether the stream should keep retrying.
	lastWorkflowError *vterrors.LastError
}
    73  
    74  // newController creates a new controller. Unless a stream is explicitly 'Stopped',
    75  // this function launches a goroutine to perform continuous vreplication.
    76  func newController(ctx context.Context, params map[string]string, dbClientFactory func() binlogplayer.DBClient, mysqld mysqlctl.MysqlDaemon, ts *topo.Server, cell, tabletTypesStr string, blpStats *binlogplayer.Stats, vre *Engine) (*controller, error) {
    77  	if blpStats == nil {
    78  		blpStats = binlogplayer.NewStats()
    79  	}
    80  
    81  	ct := &controller{
    82  		vre:             vre,
    83  		dbClientFactory: dbClientFactory,
    84  		mysqld:          mysqld,
    85  		blpStats:        blpStats,
    86  		done:            make(chan struct{}),
    87  		source:          &binlogdatapb.BinlogSource{},
    88  	}
    89  	log.Infof("creating controller with cell: %v, tabletTypes: %v, and params: %v", cell, tabletTypesStr, params)
    90  
    91  	// id
    92  	id, err := strconv.Atoi(params["id"])
    93  	if err != nil {
    94  		return nil, err
    95  	}
    96  	ct.id = uint32(id)
    97  	ct.workflow = params["workflow"]
    98  	ct.lastWorkflowError = vterrors.NewLastError(fmt.Sprintf("VReplication controller %d for workflow %q", ct.id, ct.workflow), maxTimeToRetryError)
    99  
   100  	state := params["state"]
   101  	blpStats.State.Set(state)
   102  	// Nothing to do if replication is stopped or is known to have an unrecoverable error.
   103  	if state == binlogplayer.BlpStopped || state == binlogplayer.BlpError {
   104  		ct.cancel = func() {}
   105  		close(ct.done)
   106  		return ct, nil
   107  	}
   108  
   109  	// source, stopPos
   110  	if err := prototext.Unmarshal([]byte(params["source"]), ct.source); err != nil {
   111  		return nil, err
   112  	}
   113  	ct.stopPos = params["stop_pos"]
   114  
   115  	if ct.source.GetExternalMysql() == "" {
   116  		// tabletPicker
   117  		if v := params["cell"]; v != "" {
   118  			cell = v
   119  		}
   120  		if v := params["tablet_types"]; v != "" {
   121  			tabletTypesStr = v
   122  		}
   123  		log.Infof("creating tablet picker for source keyspace/shard %v/%v with cell: %v and tabletTypes: %v", ct.source.Keyspace, ct.source.Shard, cell, tabletTypesStr)
   124  		cells := strings.Split(cell, ",")
   125  
   126  		sourceTopo := ts
   127  		if ct.source.ExternalCluster != "" {
   128  			sourceTopo, err = sourceTopo.OpenExternalVitessClusterServer(ctx, ct.source.ExternalCluster)
   129  			if err != nil {
   130  				return nil, err
   131  			}
   132  		}
   133  		tp, err := discovery.NewTabletPicker(sourceTopo, cells, ct.source.Keyspace, ct.source.Shard, tabletTypesStr)
   134  		if err != nil {
   135  			return nil, err
   136  		}
   137  		ct.tabletPicker = tp
   138  	}
   139  
   140  	// cancel
   141  	ctx, ct.cancel = context.WithCancel(ctx)
   142  
   143  	go ct.run(ctx)
   144  
   145  	return ct, nil
   146  }
   147  
   148  func (ct *controller) run(ctx context.Context) {
   149  	defer func() {
   150  		log.Infof("stream %v: stopped", ct.id)
   151  		close(ct.done)
   152  	}()
   153  
   154  	for {
   155  		err := ct.runBlp(ctx)
   156  		if err == nil {
   157  			return
   158  		}
   159  
   160  		// Sometimes, canceled contexts get wrapped as errors.
   161  		select {
   162  		case <-ctx.Done():
   163  			log.Warningf("context canceled: %s", err.Error())
   164  			return
   165  		default:
   166  		}
   167  
   168  		ct.blpStats.ErrorCounts.Add([]string{"Stream Error"}, 1)
   169  		binlogplayer.LogError(fmt.Sprintf("error in stream %v, retrying after %v", ct.id, retryDelay), err)
   170  		timer := time.NewTimer(retryDelay)
   171  		select {
   172  		case <-ctx.Done():
   173  			log.Warningf("context canceled: %s", err.Error())
   174  			timer.Stop()
   175  			return
   176  		case <-timer.C:
   177  		}
   178  	}
   179  }
   180  
   181  func (ct *controller) runBlp(ctx context.Context) (err error) {
   182  	defer func() {
   183  		ct.sourceTablet.Set("")
   184  		if x := recover(); x != nil {
   185  			log.Errorf("stream %v: caught panic: %v\n%s", ct.id, x, tb.Stack(4))
   186  			err = fmt.Errorf("panic: %v", x)
   187  		}
   188  	}()
   189  
   190  	select {
   191  	case <-ctx.Done():
   192  		return nil
   193  	default:
   194  	}
   195  
   196  	// Call this for youtube-specific customization.
   197  	// This should be done every time, in case mysql was restarted.
   198  	if err := ct.mysqld.EnableBinlogPlayback(); err != nil {
   199  		return err
   200  	}
   201  
   202  	dbClient := ct.dbClientFactory()
   203  	if err := dbClient.Connect(); err != nil {
   204  		return vterrors.Wrap(err, "can't connect to database")
   205  	}
   206  	defer dbClient.Close()
   207  
   208  	var tablet *topodatapb.Tablet
   209  	if ct.source.GetExternalMysql() == "" {
   210  		log.Infof("trying to find a tablet eligible for vreplication. stream id: %v", ct.id)
   211  		tpCtx, tpCancel := context.WithTimeout(ctx, discovery.GetTabletPickerRetryDelay()*tabletPickerRetries)
   212  		defer tpCancel()
   213  		tablet, err = ct.tabletPicker.PickForStreaming(tpCtx)
   214  		if err != nil {
   215  			select {
   216  			case <-ctx.Done():
   217  			default:
   218  				ct.blpStats.ErrorCounts.Add([]string{"No Source Tablet Found"}, 1)
   219  				ct.setMessage(dbClient, fmt.Sprintf("Error picking tablet: %s", err.Error()))
   220  			}
   221  			return err
   222  		}
   223  		ct.setMessage(dbClient, fmt.Sprintf("Picked source tablet: %s", tablet.Alias.String()))
   224  		log.Infof("found a tablet eligible for vreplication. stream id: %v  tablet: %s", ct.id, tablet.Alias.String())
   225  		ct.sourceTablet.Set(tablet.Alias.String())
   226  	}
   227  	switch {
   228  	case len(ct.source.Tables) > 0:
   229  		// Table names can have search patterns. Resolve them against the schema.
   230  		tables, err := mysqlctl.ResolveTables(ctx, ct.mysqld, dbClient.DBName(), ct.source.Tables)
   231  		if err != nil {
   232  			ct.blpStats.ErrorCounts.Add([]string{"Invalid Source"}, 1)
   233  			return vterrors.Wrap(err, "failed to resolve table names")
   234  		}
   235  
   236  		player := binlogplayer.NewBinlogPlayerTables(dbClient, tablet, tables, ct.id, ct.blpStats)
   237  		return player.ApplyBinlogEvents(ctx)
   238  	case ct.source.KeyRange != nil:
   239  		player := binlogplayer.NewBinlogPlayerKeyRange(dbClient, tablet, ct.source.KeyRange, ct.id, ct.blpStats)
   240  		return player.ApplyBinlogEvents(ctx)
   241  	case ct.source.Filter != nil:
   242  		// Timestamp fields from binlogs are always sent as UTC.
   243  		// So, we should set the timezone to be UTC for those values to be correctly inserted.
   244  		if _, err := dbClient.ExecuteFetch("set @@session.time_zone = '+00:00'", 10000); err != nil {
   245  			return err
   246  		}
   247  		// Tables may have varying character sets. To ship the bits without interpreting them
   248  		// we set the character set to be binary.
   249  		if _, err := dbClient.ExecuteFetch("set names binary", 10000); err != nil {
   250  			return err
   251  		}
   252  		// We must apply AUTO_INCREMENT values precisely as we got them. This include the 0 value, which is not recommended in AUTO_INCREMENT, and yet is valid.
   253  		if _, err := dbClient.ExecuteFetch("set @@session.sql_mode = CONCAT(@@session.sql_mode, ',NO_AUTO_VALUE_ON_ZERO')", 10000); err != nil {
   254  			return err
   255  		}
   256  
   257  		var vsClient VStreamerClient
   258  		var err error
   259  		if name := ct.source.GetExternalMysql(); name != "" {
   260  			vsClient, err = ct.vre.ec.Get(name)
   261  			if err != nil {
   262  				return err
   263  			}
   264  		} else {
   265  			vsClient = newTabletConnector(tablet)
   266  		}
   267  		if err := vsClient.Open(ctx); err != nil {
   268  			return err
   269  		}
   270  		defer vsClient.Close(ctx)
   271  
   272  		vr := newVReplicator(ct.id, ct.source, vsClient, ct.blpStats, dbClient, ct.mysqld, ct.vre)
   273  		err = vr.Replicate(ctx)
   274  		ct.lastWorkflowError.Record(err)
   275  		// If this is a mysql error that we know needs manual intervention OR
   276  		// we cannot identify this as non-recoverable, but it has persisted beyond the retry limit (maxTimeToRetryError)
   277  		if isUnrecoverableError(err) || !ct.lastWorkflowError.ShouldRetry() {
   278  			log.Errorf("vreplication stream %d going into error state due to %+v", ct.id, err)
   279  			if errSetState := vr.setState(binlogplayer.BlpError, err.Error()); errSetState != nil {
   280  				return err // yes, err and not errSetState.
   281  			}
   282  			return nil // this will cause vreplicate to quit the workflow
   283  		}
   284  		return err
   285  	}
   286  	ct.blpStats.ErrorCounts.Add([]string{"Invalid Source"}, 1)
   287  	return fmt.Errorf("missing source")
   288  }
   289  
   290  func (ct *controller) setMessage(dbClient binlogplayer.DBClient, message string) error {
   291  	ct.blpStats.History.Add(&binlogplayer.StatsHistoryRecord{
   292  		Time:    time.Now(),
   293  		Message: message,
   294  	})
   295  	query := fmt.Sprintf("update _vt.vreplication set message=%v where id=%v", encodeString(binlogplayer.MessageTruncate(message)), ct.id)
   296  	if _, err := dbClient.ExecuteFetch(query, 1); err != nil {
   297  		return fmt.Errorf("could not set message: %v: %v", query, err)
   298  	}
   299  	return nil
   300  }
// Stop cancels the controller's context and then blocks until ct.done is
// closed, which happens when the run goroutine has exited. For a controller
// that never started a goroutine (stopped or error state), cancel is a no-op
// and done is already closed, so Stop returns immediately.
func (ct *controller) Stop() {
	ct.cancel()
	<-ct.done
}