vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletmanager/vdiff/engine.go (about)

     1  /*
     2  Copyright 2022 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package vdiff
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"errors"
    23  	"fmt"
    24  	"sync"
    25  	"time"
    26  
    27  	"vitess.io/vitess/go/mysql"
    28  	"vitess.io/vitess/go/vt/proto/tabletmanagerdata"
    29  	"vitess.io/vitess/go/vt/proto/topodata"
    30  	"vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication"
    31  	"vitess.io/vitess/go/vt/vttablet/tmclient"
    32  
    33  	"vitess.io/vitess/go/sqltypes"
    34  	"vitess.io/vitess/go/sync2"
    35  	"vitess.io/vitess/go/vt/binlog/binlogplayer"
    36  	"vitess.io/vitess/go/vt/dbconfigs"
    37  	"vitess.io/vitess/go/vt/log"
    38  	"vitess.io/vitess/go/vt/topo"
    39  	"vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv"
    40  )
    41  
    42  type Engine struct {
    43  	isOpen bool
    44  
    45  	mu          sync.Mutex // guards controllers
    46  	controllers map[int64]*controller
    47  
    48  	// ctx is the root context for all controllers
    49  	ctx         context.Context
    50  	cancel      context.CancelFunc
    51  	cancelRetry context.CancelFunc
    52  
    53  	ts                      *topo.Server
    54  	tmClientFactory         func() tmclient.TabletManagerClient
    55  	dbClientFactoryFiltered func() binlogplayer.DBClient
    56  	dbClientFactoryDba      func() binlogplayer.DBClient
    57  	dbName                  string
    58  
    59  	vre *vreplication.Engine
    60  
    61  	wg         sync.WaitGroup
    62  	thisTablet *topodata.Tablet
    63  
    64  	// snapshotMu is used to ensure that only one vdiff snapshot cycle is active at a time,
    65  	// because we stop/start vreplication workflows during this process
    66  	snapshotMu sync.Mutex
    67  
    68  	vdiffSchemaCreateOnce sync.Once
    69  
    70  	// This should only be set when the engine is being used in tests. It then provides
    71  	// modified behavior for that env, e.g. not starting the retry goroutine. This should
    72  	// NOT be set in production.
    73  	fortests bool
    74  }
    75  
    76  func NewEngine(config *tabletenv.TabletConfig, ts *topo.Server, tablet *topodata.Tablet) *Engine {
    77  	vde := &Engine{
    78  		controllers:     make(map[int64]*controller),
    79  		ts:              ts,
    80  		thisTablet:      tablet,
    81  		tmClientFactory: func() tmclient.TabletManagerClient { return tmclient.NewTabletManagerClient() },
    82  	}
    83  	return vde
    84  }
    85  
    86  // NewTestEngine creates an Engine for use in tests. It uses the custom db client factory and
    87  // tablet manager client factory, while setting the fortests field to true to modify any engine
    88  // behavior when used in tests (e.g. not starting the retry goroutine).
    89  func NewTestEngine(ts *topo.Server, tablet *topodata.Tablet, dbn string, dbcf func() binlogplayer.DBClient, tmcf func() tmclient.TabletManagerClient) *Engine {
    90  	vde := &Engine{
    91  		controllers:             make(map[int64]*controller),
    92  		ts:                      ts,
    93  		thisTablet:              tablet,
    94  		dbName:                  dbn,
    95  		dbClientFactoryFiltered: dbcf,
    96  		dbClientFactoryDba:      dbcf,
    97  		tmClientFactory:         tmcf,
    98  		fortests:                true,
    99  	}
   100  	return vde
   101  }
   102  
   103  func (vde *Engine) InitDBConfig(dbcfgs *dbconfigs.DBConfigs) {
   104  	// If it's a test engine and we're already initilized then do nothing.
   105  	if vde.fortests && vde.dbClientFactoryFiltered != nil && vde.dbClientFactoryDba != nil {
   106  		return
   107  	}
   108  	vde.dbClientFactoryFiltered = func() binlogplayer.DBClient {
   109  		return binlogplayer.NewDBClient(dbcfgs.FilteredWithDB())
   110  	}
   111  	vde.dbClientFactoryDba = func() binlogplayer.DBClient {
   112  		return binlogplayer.NewDBClient(dbcfgs.DbaWithDB())
   113  	}
   114  	vde.dbName = dbcfgs.DBName
   115  }
   116  
   117  func (vde *Engine) Open(ctx context.Context, vre *vreplication.Engine) {
   118  	vde.mu.Lock()
   119  	defer vde.mu.Unlock()
   120  	if vde.ts == nil || vde.isOpen {
   121  		return
   122  	}
   123  	log.Infof("VDiff Engine: opening...")
   124  
   125  	if vde.cancelRetry != nil {
   126  		vde.cancelRetry()
   127  		vde.cancelRetry = nil
   128  	}
   129  	vde.vre = vre
   130  	if err := vde.openLocked(ctx); err != nil {
   131  		log.Infof("openLocked error: %s", err)
   132  		ctx, cancel := context.WithCancel(ctx)
   133  		vde.cancelRetry = cancel
   134  		go vde.retry(ctx, err)
   135  	}
   136  }
   137  
   138  func (vde *Engine) openLocked(ctx context.Context) error {
   139  	// This should never happen
   140  	if len(vde.controllers) > 0 {
   141  		log.Warningf("VDiff Engine invalid state detected: %d controllers existed when opening; resetting state", len(vde.controllers))
   142  		vde.resetControllers()
   143  	}
   144  
   145  	// At this point the tablet has no controllers running. So
   146  	// we want to start any VDiffs that have not been explicitly
   147  	// stopped or otherwise finished.
   148  	rows, err := vde.getVDiffsToRun(ctx)
   149  	if err != nil {
   150  		return err
   151  	}
   152  	vde.ctx, vde.cancel = context.WithCancel(ctx)
   153  	vde.isOpen = true // now we are open and have things to close
   154  	if err := vde.initControllers(rows); err != nil {
   155  		return err
   156  	}
   157  
   158  	// At this point we've fully and succesfully opened so begin
   159  	// retrying error'd VDiffs until the engine is closed.
   160  	vde.wg.Add(1)
   161  	go func() {
   162  		defer vde.wg.Done()
   163  		if vde.fortests {
   164  			return
   165  		}
   166  		vde.retryErroredVDiffs()
   167  	}()
   168  
   169  	return nil
   170  }
   171  
// openRetryInterval is how long the retry loop waits between attempts to open
// the engine after a failed open. It defaults to one second.
var openRetryInterval = sync2.NewAtomicDuration(1 * time.Second)
   173  
   174  func (vde *Engine) retry(ctx context.Context, err error) {
   175  	log.Errorf("Error starting vdiff engine: %v, will keep retrying.", err)
   176  	for {
   177  		timer := time.NewTimer(openRetryInterval.Get())
   178  		select {
   179  		case <-ctx.Done():
   180  			timer.Stop()
   181  			return
   182  		case <-timer.C:
   183  		}
   184  		vde.mu.Lock()
   185  		// Recheck the context within the lock.
   186  		// This guarantees that we will not retry
   187  		// after the context was canceled. This
   188  		// can almost never happen.
   189  		select {
   190  		case <-ctx.Done():
   191  			vde.mu.Unlock()
   192  			return
   193  		default:
   194  		}
   195  		if err := vde.openLocked(ctx); err == nil {
   196  			log.Infof("VDiff engine: opened successfully")
   197  			// Don't invoke cancelRetry because openLocked
   198  			// will hold on to this context for later cancelation.
   199  			vde.cancelRetry = nil
   200  			vde.mu.Unlock()
   201  			return
   202  		}
   203  		vde.mu.Unlock()
   204  	}
   205  }
   206  
   207  // addController creates a new controller using the given vdiff record and adds it to the engine.
   208  // You must already have the main engine mutex (mu) locked before calling this.
   209  func (vde *Engine) addController(row sqltypes.RowNamedValues, options *tabletmanagerdata.VDiffOptions) error {
   210  	ct, err := newController(vde.ctx, row, vde.dbClientFactoryDba, vde.ts, vde, options)
   211  	if err != nil {
   212  		return fmt.Errorf("controller could not be initialized for stream %+v on tablet %v",
   213  			row, vde.thisTablet.Alias)
   214  	}
   215  	vde.controllers[ct.id] = ct
   216  	return nil
   217  }
   218  
   219  func (vde *Engine) initControllers(qr *sqltypes.Result) error {
   220  	if qr == nil || len(qr.Rows) == 0 {
   221  		return nil
   222  	}
   223  	for _, row := range qr.Named().Rows {
   224  		options := &tabletmanagerdata.VDiffOptions{}
   225  		if err := json.Unmarshal(row.AsBytes("options", []byte("{}")), options); err != nil {
   226  			return err
   227  		}
   228  		if err := vde.addController(row, options); err != nil {
   229  			return err
   230  		}
   231  	}
   232  	return nil
   233  }
   234  
   235  // IsOpen returns true if Engine is open.
   236  func (vde *Engine) IsOpen() bool {
   237  	vde.mu.Lock()
   238  	defer vde.mu.Unlock()
   239  	return vde.isOpen
   240  }
   241  
// Close closes the Engine service.
//
// If an open-retry loop is pending (cancelRetry is set), Close only cancels
// that loop and returns. Otherwise, if the engine is open, it cancels the root
// context, stops and discards all controllers, waits for the goroutines
// tracked by wg, and marks the engine closed. It is a no-op when the engine is
// neither retrying nor open.
//
// NOTE(review): wg.Wait() runs while mu is held, and retryVDiffs (called from
// the goroutine tracked by wg) also acquires mu. That looks like it could
// block forever if a retry tick fires just before Close takes the lock —
// confirm.
func (vde *Engine) Close() {
	vde.mu.Lock()
	defer vde.mu.Unlock()

	// If we're retrying, we're not open.
	// Just cancel the retry loop.
	if vde.cancelRetry != nil {
		vde.cancelRetry()
		vde.cancelRetry = nil
		return
	}

	if !vde.isOpen {
		return
	}

	// Cancel the root context shared by all controllers.
	vde.cancel()

	// We still have to wait for all controllers to stop.
	vde.resetControllers()

	// Wait for long-running functions to exit.
	vde.wg.Wait()

	vde.isOpen = false

	log.Infof("VDiff Engine: closed")
}
   271  
   272  func (vde *Engine) getVDiffsToRun(ctx context.Context) (*sqltypes.Result, error) {
   273  	dbClient := vde.dbClientFactoryFiltered()
   274  	if err := dbClient.Connect(); err != nil {
   275  		return nil, err
   276  	}
   277  	defer dbClient.Close()
   278  
   279  	// We have to use ExecIgnore here so as not to block quick tablet state
   280  	// transitions from primary to non-primary when starting the engine
   281  	qr, err := dbClient.ExecuteFetch(sqlGetVDiffsToRun, -1)
   282  	if err != nil {
   283  		return nil, err
   284  	}
   285  	if len(qr.Rows) == 0 {
   286  		return nil, nil
   287  	}
   288  	return qr, nil
   289  }
   290  
   291  func (vde *Engine) getVDiffsToRetry(ctx context.Context, dbClient binlogplayer.DBClient) (*sqltypes.Result, error) {
   292  	qr, err := dbClient.ExecuteFetch(sqlGetVDiffsToRetry, -1)
   293  	if err != nil {
   294  		return nil, err
   295  	}
   296  	if len(qr.Rows) == 0 {
   297  		return nil, nil
   298  	}
   299  	return qr, nil
   300  }
   301  
   302  func (vde *Engine) getVDiffByID(ctx context.Context, dbClient binlogplayer.DBClient, id int64) (*sqltypes.Result, error) {
   303  	qr, err := dbClient.ExecuteFetch(fmt.Sprintf(sqlGetVDiffByID, id), -1)
   304  	if err != nil {
   305  		return nil, err
   306  	}
   307  	if len(qr.Rows) != 1 {
   308  		return nil, fmt.Errorf("no vdiff found for id %d on tablet %v",
   309  			id, vde.thisTablet.Alias)
   310  	}
   311  	return qr, nil
   312  }
   313  
   314  func (vde *Engine) retryVDiffs(ctx context.Context) error {
   315  	vde.mu.Lock()
   316  	defer vde.mu.Unlock()
   317  	dbClient := vde.dbClientFactoryFiltered()
   318  	if err := dbClient.Connect(); err != nil {
   319  		return err
   320  	}
   321  	defer dbClient.Close()
   322  
   323  	qr, err := vde.getVDiffsToRetry(ctx, dbClient)
   324  	if err != nil {
   325  		return err
   326  	}
   327  	if qr == nil || len(qr.Rows) == 0 {
   328  		return nil
   329  	}
   330  	for _, row := range qr.Named().Rows {
   331  		select {
   332  		case <-ctx.Done():
   333  			return ctx.Err()
   334  		default:
   335  		}
   336  		lastError := mysql.NewSQLErrorFromError(errors.New(row.AsString("last_error", "")))
   337  		if !mysql.IsEphemeralError(lastError) {
   338  			continue
   339  		}
   340  		uuid := row.AsString("vdiff_uuid", "")
   341  		id, err := row.ToInt64("id")
   342  		if err != nil {
   343  			return err
   344  		}
   345  		log.Infof("Retrying vdiff %s that had an ephemeral error of '%v'", uuid, lastError)
   346  		if _, err = dbClient.ExecuteFetch(fmt.Sprintf(sqlRetryVDiff, id), 1); err != nil {
   347  			return err
   348  		}
   349  		options := &tabletmanagerdata.VDiffOptions{}
   350  		if err := json.Unmarshal(row.AsBytes("options", []byte("{}")), options); err != nil {
   351  			return err
   352  		}
   353  		if err := vde.addController(row, options); err != nil {
   354  			return err
   355  		}
   356  	}
   357  	return nil
   358  }
   359  
   360  func (vde *Engine) retryErroredVDiffs() {
   361  	tkr := time.NewTicker(time.Second * 30)
   362  	defer tkr.Stop()
   363  	for {
   364  		select {
   365  		case <-vde.ctx.Done():
   366  			log.Info("VDiff engine: closing...")
   367  			return
   368  		case <-tkr.C:
   369  		}
   370  
   371  		if err := vde.retryVDiffs(vde.ctx); err != nil {
   372  			log.Errorf("Error retrying vdiffs: %v", err)
   373  		}
   374  	}
   375  }
   376  
   377  func (vde *Engine) resetControllers() {
   378  	for _, ct := range vde.controllers {
   379  		ct.Stop()
   380  	}
   381  	vde.controllers = make(map[int64]*controller)
   382  }