vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletserver/vstreamer/vstreamer.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package vstreamer
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"fmt"
    23  	"io"
    24  	"time"
    25  
    26  	"google.golang.org/protobuf/encoding/prototext"
    27  
    28  	"vitess.io/vitess/go/mysql/collations"
    29  	"vitess.io/vitess/go/timer"
    30  	vtschema "vitess.io/vitess/go/vt/schema"
    31  	"vitess.io/vitess/go/vt/vterrors"
    32  
    33  	"vitess.io/vitess/go/mysql"
    34  	"vitess.io/vitess/go/sqltypes"
    35  	"vitess.io/vitess/go/sync2"
    36  	"vitess.io/vitess/go/vt/binlog"
    37  	"vitess.io/vitess/go/vt/dbconfigs"
    38  	"vitess.io/vitess/go/vt/log"
    39  	"vitess.io/vitess/go/vt/sqlparser"
    40  	"vitess.io/vitess/go/vt/vttablet/tabletserver/schema"
    41  
    42  	binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
    43  	querypb "vitess.io/vitess/go/vt/proto/query"
    44  	vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
    45  )
    46  
    47  const (
    48  	trxHistoryLenQuery = `select count as history_len from information_schema.INNODB_METRICS where name = 'trx_rseg_history_len'`
    49  	replicaLagQuery    = `show slave status`
    50  	hostQuery          = `select @@hostname as hostname, @@port as port`
    51  )
    52  
    53  // HeartbeatTime is set to slightly below 1s, compared to idleTimeout
    54  // set by VPlayer at slightly above 1s. This minimizes conflicts
    55  // between the two timeouts.
    56  var HeartbeatTime = 900 * time.Millisecond
    57  
    58  // vschemaUpdateCount is for testing only.
    59  // vstreamer is a mutex free data structure. So, it's not safe to access its members
    60  // from a test. Since VSchema gets updated asynchronously, there's no way for a test
    61  // to wait for it. Instead, the code that updates the vschema increments this atomic
    62  // counter, which will let the tests poll for it to change.
    63  // TODO(sougou): find a better way for this.
    64  var vschemaUpdateCount sync2.AtomicInt64
    65  
    66  // vstreamer is for serving a single vreplication stream on the source side.
    67  type vstreamer struct {
    68  	ctx    context.Context
    69  	cancel func()
    70  
    71  	cp       dbconfigs.Connector
    72  	se       *schema.Engine
    73  	startPos string
    74  	filter   *binlogdatapb.Filter
    75  	send     func([]*binlogdatapb.VEvent) error
    76  
    77  	vevents        chan *localVSchema
    78  	vschema        *localVSchema
    79  	plans          map[uint64]*streamerPlan
    80  	journalTableID uint64
    81  	versionTableID uint64
    82  
    83  	// format and pos are updated by parseEvent.
    84  	format  mysql.BinlogFormat
    85  	pos     mysql.Position
    86  	stopPos string
    87  
    88  	phase string
    89  	vse   *Engine
    90  }
    91  
    92  // streamerPlan extends the original plan to also include
    93  // the TableMap, which comes from the binlog. It's used
    94  // to extract values from the ROW events.
    95  type streamerPlan struct {
    96  	*Plan
    97  	TableMap *mysql.TableMap
    98  }
    99  
   100  // newVStreamer creates a new vstreamer.
   101  // cp: the mysql conn params.
   102  // sh: the schema engine. The vstreamer uses it to convert the TableMap into field info.
   103  // startPos: a flavor compliant position to stream from. This can also contain the special
   104  //
   105  //	value "current", which means start from the current position.
   106  //
   107  // filter: the list of filtering rules. If a rule has a select expression for its filter,
   108  //
   109  //	the select list can only reference direct columns. No other expressions are allowed.
   110  //	The select expression is allowed to contain the special 'keyspace_id()' function which
   111  //	will return the keyspace id of the row. Examples:
   112  //	"select * from t", same as an empty Filter,
   113  //	"select * from t where in_keyrange('-80')", same as "-80",
   114  //	"select * from t where in_keyrange(col1, 'hash', '-80')",
   115  //	"select col1, col2 from t where...",
   116  //	"select col1, keyspace_id() from t where...".
   117  //	Only "in_keyrange" and limited comparison operators (see enum Opcode in planbuilder.go) are supported in the where clause.
   118  //	Other constructs like joins, group by, etc. are not supported.
   119  //
   120  // vschema: the current vschema. This value can later be changed through the SetVSchema method.
   121  // send: callback function to send events.
   122  func newVStreamer(ctx context.Context, cp dbconfigs.Connector, se *schema.Engine, startPos string, stopPos string, filter *binlogdatapb.Filter, vschema *localVSchema, send func([]*binlogdatapb.VEvent) error, phase string, vse *Engine) *vstreamer {
   123  	ctx, cancel := context.WithCancel(ctx)
   124  	return &vstreamer{
   125  		ctx:      ctx,
   126  		cancel:   cancel,
   127  		cp:       cp,
   128  		se:       se,
   129  		startPos: startPos,
   130  		stopPos:  stopPos,
   131  		filter:   filter,
   132  		send:     send,
   133  		vevents:  make(chan *localVSchema, 1),
   134  		vschema:  vschema,
   135  		plans:    make(map[uint64]*streamerPlan),
   136  		phase:    phase,
   137  		vse:      vse,
   138  	}
   139  }
   140  
   141  // SetVSchema updates the vstreamer against the new vschema.
   142  func (vs *vstreamer) SetVSchema(vschema *localVSchema) {
   143  	// Since vs.Stream is a single-threaded loop. We just send an event to
   144  	// that thread, which helps us avoid mutexes to update the plans.
   145  	select {
   146  	case vs.vevents <- vschema:
   147  	case <-vs.ctx.Done():
   148  	default: // if there is a pending vschema in the channel, drain it and update it with the latest one
   149  		select {
   150  		case <-vs.vevents:
   151  			vs.vevents <- vschema
   152  		default:
   153  		}
   154  	}
   155  }
   156  
   157  // Cancel stops the streaming.
   158  func (vs *vstreamer) Cancel() {
   159  	vs.cancel()
   160  }
   161  
   162  // Stream streams binlog events.
   163  func (vs *vstreamer) Stream() error {
   164  	//defer vs.cancel()
   165  	ctx := context.Background()
   166  	vs.vse.vstreamerCount.Add(1)
   167  	defer func() {
   168  		ctx.Done()
   169  		vs.vse.vstreamerCount.Add(-1)
   170  	}()
   171  	vs.vse.vstreamersCreated.Add(1)
   172  	log.Infof("Starting Stream() with startPos %s", vs.startPos)
   173  	pos, err := mysql.DecodePosition(vs.startPos)
   174  	if err != nil {
   175  		vs.vse.errorCounts.Add("StreamRows", 1)
   176  		vs.vse.vstreamersEndedWithErrors.Add(1)
   177  		return err
   178  	}
   179  	vs.pos = pos
   180  	return vs.replicate(ctx)
   181  }
   182  
   183  // Stream streams binlog events.
   184  func (vs *vstreamer) replicate(ctx context.Context) error {
   185  	// Ensure se is Open. If vttablet came up in a non_serving role,
   186  	// the schema engine may not have been initialized.
   187  	if err := vs.se.Open(); err != nil {
   188  		return wrapError(err, vs.pos, vs.vse)
   189  	}
   190  
   191  	conn, err := binlog.NewBinlogConnection(vs.cp)
   192  	if err != nil {
   193  		return wrapError(err, vs.pos, vs.vse)
   194  	}
   195  	defer conn.Close()
   196  
   197  	events, errs, err := conn.StartBinlogDumpFromPosition(vs.ctx, "", vs.pos)
   198  	if err != nil {
   199  		return wrapError(err, vs.pos, vs.vse)
   200  	}
   201  	err = vs.parseEvents(vs.ctx, events, errs)
   202  	return wrapError(err, vs.pos, vs.vse)
   203  }
   204  
   205  // parseEvents parses and sends events.
   206  func (vs *vstreamer) parseEvents(ctx context.Context, events <-chan mysql.BinlogEvent, errs <-chan error) error {
   207  	// bufferAndTransmit uses bufferedEvents and curSize to buffer events.
   208  	var (
   209  		bufferedEvents []*binlogdatapb.VEvent
   210  		curSize        int
   211  	)
   212  
   213  	// Only the following patterns are possible:
   214  	// BEGIN->ROWs or Statements->GTID->COMMIT. In the case of large transactions, this can be broken into chunks.
   215  	// BEGIN->JOURNAL->GTID->COMMIT
   216  	// GTID->DDL
   217  	// GTID->OTHER
   218  	// HEARTBEAT is issued if there's inactivity, which is likely
   219  	// to heppend between one group of events and another.
   220  	//
   221  	// Buffering only takes row or statement lengths into consideration.
   222  	// Length of other events is considered negligible.
   223  	// If a new row event causes the packet size to be exceeded,
   224  	// all existing rows are sent without the new row.
   225  	// If a single row exceeds the packet size, it will be in its own packet.
   226  	bufferAndTransmit := func(vevent *binlogdatapb.VEvent) error {
   227  		vevent.Keyspace = vs.vse.keyspace
   228  		vevent.Shard = vs.vse.shard
   229  
   230  		switch vevent.Type {
   231  		case binlogdatapb.VEventType_GTID, binlogdatapb.VEventType_BEGIN, binlogdatapb.VEventType_FIELD,
   232  			binlogdatapb.VEventType_JOURNAL:
   233  			// We never have to send GTID, BEGIN, FIELD events on their own.
   234  			// A JOURNAL event is always preceded by a BEGIN and followed by a COMMIT.
   235  			// So, we don't have to send it right away.
   236  			bufferedEvents = append(bufferedEvents, vevent)
   237  		case binlogdatapb.VEventType_COMMIT, binlogdatapb.VEventType_DDL, binlogdatapb.VEventType_OTHER,
   238  			binlogdatapb.VEventType_HEARTBEAT, binlogdatapb.VEventType_VERSION:
   239  			// COMMIT, DDL, OTHER and HEARTBEAT must be immediately sent.
   240  			// Although unlikely, it's possible to get a HEARTBEAT in the middle
   241  			// of a transaction. If so, we still send the partial transaction along
   242  			// with the heartbeat.
   243  			bufferedEvents = append(bufferedEvents, vevent)
   244  			vevents := bufferedEvents
   245  			bufferedEvents = nil
   246  			curSize = 0
   247  			return vs.send(vevents)
   248  		case binlogdatapb.VEventType_INSERT, binlogdatapb.VEventType_DELETE, binlogdatapb.VEventType_UPDATE, binlogdatapb.VEventType_REPLACE:
   249  			newSize := len(vevent.GetDml())
   250  			if curSize+newSize > defaultPacketSize {
   251  				vs.vse.vstreamerNumPackets.Add(1)
   252  				vevents := bufferedEvents
   253  				bufferedEvents = []*binlogdatapb.VEvent{vevent}
   254  				curSize = newSize
   255  				return vs.send(vevents)
   256  			}
   257  			curSize += newSize
   258  			bufferedEvents = append(bufferedEvents, vevent)
   259  		case binlogdatapb.VEventType_ROW:
   260  			// ROW events happen inside transactions. So, we can chunk them.
   261  			// Buffer everything until packet size is reached, and then send.
   262  			newSize := 0
   263  			for _, rowChange := range vevent.RowEvent.RowChanges {
   264  				if rowChange.Before != nil {
   265  					newSize += len(rowChange.Before.Values)
   266  				}
   267  				if rowChange.After != nil {
   268  					newSize += len(rowChange.After.Values)
   269  				}
   270  			}
   271  			if curSize+newSize > defaultPacketSize {
   272  				vs.vse.vstreamerNumPackets.Add(1)
   273  				vevents := bufferedEvents
   274  				bufferedEvents = []*binlogdatapb.VEvent{vevent}
   275  				curSize = newSize
   276  				return vs.send(vevents)
   277  			}
   278  			curSize += newSize
   279  			bufferedEvents = append(bufferedEvents, vevent)
   280  		default:
   281  			vs.vse.errorCounts.Add("BufferAndTransmit", 1)
   282  			return fmt.Errorf("unexpected event: %v", vevent)
   283  		}
   284  		return nil
   285  	}
   286  
   287  	// Main loop: calls bufferAndTransmit as events arrive.
   288  	hbTimer := time.NewTimer(HeartbeatTime)
   289  	defer hbTimer.Stop()
   290  
   291  	injectHeartbeat := func(throttled bool) error {
   292  		now := time.Now().UnixNano()
   293  		select {
   294  		case <-ctx.Done():
   295  			return vterrors.Errorf(vtrpcpb.Code_CANCELED, "context has expired")
   296  		default:
   297  			err := bufferAndTransmit(&binlogdatapb.VEvent{
   298  				Type:        binlogdatapb.VEventType_HEARTBEAT,
   299  				Timestamp:   now / 1e9,
   300  				CurrentTime: now,
   301  				Throttled:   throttled,
   302  			})
   303  			return err
   304  		}
   305  	}
   306  
   307  	throttleEvents := func(throttledEvents chan mysql.BinlogEvent) {
   308  		throttledHeartbeatsRateLimiter := timer.NewRateLimiter(HeartbeatTime)
   309  		for {
   310  			// check throttler.
   311  			if !vs.vse.throttlerClient.ThrottleCheckOKOrWait(ctx) {
   312  				// make sure to leave if context is cancelled
   313  				select {
   314  				case <-ctx.Done():
   315  					return
   316  				default:
   317  					// do nothing special
   318  				}
   319  				throttledHeartbeatsRateLimiter.Do(func() error {
   320  					return injectHeartbeat(true)
   321  				})
   322  				// we won't process events, until we're no longer throttling
   323  				continue
   324  			}
   325  			select {
   326  			case ev, ok := <-events:
   327  				if ok {
   328  					select {
   329  					case throttledEvents <- ev:
   330  					case <-ctx.Done():
   331  						return
   332  					}
   333  				} else {
   334  					close(throttledEvents)
   335  					return
   336  				}
   337  			case <-ctx.Done():
   338  				return
   339  			}
   340  		}
   341  	}
   342  	// throttledEvents can be read just like you would read from events
   343  	// throttledEvents pulls data from events, but throttles pulling data,
   344  	// which in turn blocks the BinlogConnection from pushing events to the channel
   345  	throttledEvents := make(chan mysql.BinlogEvent)
   346  	go throttleEvents(throttledEvents)
   347  
   348  	for {
   349  		hbTimer.Reset(HeartbeatTime)
   350  		// Drain event if timer fired before reset.
   351  		select {
   352  		case <-hbTimer.C:
   353  		default:
   354  		}
   355  
   356  		select {
   357  		case ev, ok := <-throttledEvents:
   358  			if !ok {
   359  				select {
   360  				case <-ctx.Done():
   361  					return nil
   362  				default:
   363  				}
   364  				return fmt.Errorf("unexpected server EOF")
   365  			}
   366  			vevents, err := vs.parseEvent(ev)
   367  			if err != nil {
   368  				vs.vse.errorCounts.Add("ParseEvent", 1)
   369  				return err
   370  			}
   371  			for _, vevent := range vevents {
   372  				if err := bufferAndTransmit(vevent); err != nil {
   373  					if err == io.EOF {
   374  						return nil
   375  					}
   376  					vs.vse.errorCounts.Add("BufferAndTransmit", 1)
   377  					return fmt.Errorf("error sending event: %v", err)
   378  				}
   379  			}
   380  		case vs.vschema = <-vs.vevents:
   381  			select {
   382  			case <-ctx.Done():
   383  				return nil
   384  			default:
   385  				if err := vs.rebuildPlans(); err != nil {
   386  					return err
   387  				}
   388  				// Increment this counter for testing.
   389  				vschemaUpdateCount.Add(1)
   390  			}
   391  		case err := <-errs:
   392  			return err
   393  		case <-ctx.Done():
   394  			return nil
   395  		case <-hbTimer.C:
   396  			if err := injectHeartbeat(false); err != nil {
   397  				if err == io.EOF {
   398  					return nil
   399  				}
   400  				vs.vse.errorCounts.Add("Send", 1)
   401  				return fmt.Errorf("error sending event: %v", err)
   402  			}
   403  		}
   404  	}
   405  }
   406  
   407  // parseEvent parses an event from the binlog and converts it to a list of VEvents.
   408  func (vs *vstreamer) parseEvent(ev mysql.BinlogEvent) ([]*binlogdatapb.VEvent, error) {
   409  	if !ev.IsValid() {
   410  		return nil, fmt.Errorf("can't parse binlog event: invalid data: %#v", ev)
   411  	}
   412  
   413  	// We need to keep checking for FORMAT_DESCRIPTION_EVENT even after we've
   414  	// seen one, because another one might come along (e.g. on log rotate due to
   415  	// binlog settings change) that changes the format.
   416  	if ev.IsFormatDescription() {
   417  		var err error
   418  		vs.format, err = ev.Format()
   419  		if err != nil {
   420  			return nil, fmt.Errorf("can't parse FORMAT_DESCRIPTION_EVENT: %v, event data: %#v", err, ev)
   421  		}
   422  		return nil, nil
   423  	}
   424  
   425  	// We can't parse anything until we get a FORMAT_DESCRIPTION_EVENT that
   426  	// tells us the size of the event header.
   427  	if vs.format.IsZero() {
   428  		// The only thing that should come before the FORMAT_DESCRIPTION_EVENT
   429  		// is a fake ROTATE_EVENT, which the primary sends to tell us the name
   430  		// of the current log file.
   431  		if ev.IsRotate() {
   432  			return nil, nil
   433  		}
   434  		return nil, fmt.Errorf("got a real event before FORMAT_DESCRIPTION_EVENT: %#v", ev)
   435  	}
   436  
   437  	// Strip the checksum, if any. We don't actually verify the checksum, so discard it.
   438  	ev, _, err := ev.StripChecksum(vs.format)
   439  	if err != nil {
   440  		return nil, fmt.Errorf("can't strip checksum from binlog event: %v, event data: %#v", err, ev)
   441  	}
   442  
   443  	var vevents []*binlogdatapb.VEvent
   444  	switch {
   445  	case ev.IsGTID():
   446  		gtid, hasBegin, err := ev.GTID(vs.format)
   447  		if err != nil {
   448  			return nil, fmt.Errorf("can't get GTID from binlog event: %v, event data: %#v", err, ev)
   449  		}
   450  		if hasBegin {
   451  			vevents = append(vevents, &binlogdatapb.VEvent{
   452  				Type: binlogdatapb.VEventType_BEGIN,
   453  			})
   454  		}
   455  		vs.pos = mysql.AppendGTID(vs.pos, gtid)
   456  	case ev.IsXID():
   457  		vevents = append(vevents, &binlogdatapb.VEvent{
   458  			Type: binlogdatapb.VEventType_GTID,
   459  			Gtid: mysql.EncodePosition(vs.pos),
   460  		}, &binlogdatapb.VEvent{
   461  			Type: binlogdatapb.VEventType_COMMIT,
   462  		})
   463  	case ev.IsQuery():
   464  		q, err := ev.Query(vs.format)
   465  		if err != nil {
   466  			return nil, fmt.Errorf("can't get query from binlog event: %v, event data: %#v", err, ev)
   467  		}
   468  		// Insert/Delete/Update are supported only to be used in the context of external mysql streams where source databases
   469  		// could be using SBR. Vitess itself will never run into cases where it needs to consume non rbr statements.
   470  
   471  		switch cat := sqlparser.Preview(q.SQL); cat {
   472  		case sqlparser.StmtInsert:
   473  			mustSend := mustSendStmt(q, vs.cp.DBName())
   474  			if mustSend {
   475  				vevents = append(vevents, &binlogdatapb.VEvent{
   476  					Type: binlogdatapb.VEventType_INSERT,
   477  					Dml:  q.SQL,
   478  				})
   479  			}
   480  		case sqlparser.StmtUpdate:
   481  			mustSend := mustSendStmt(q, vs.cp.DBName())
   482  			if mustSend {
   483  				vevents = append(vevents, &binlogdatapb.VEvent{
   484  					Type: binlogdatapb.VEventType_UPDATE,
   485  					Dml:  q.SQL,
   486  				})
   487  			}
   488  		case sqlparser.StmtDelete:
   489  			mustSend := mustSendStmt(q, vs.cp.DBName())
   490  			if mustSend {
   491  				vevents = append(vevents, &binlogdatapb.VEvent{
   492  					Type: binlogdatapb.VEventType_DELETE,
   493  					Dml:  q.SQL,
   494  				})
   495  			}
   496  		case sqlparser.StmtReplace:
   497  			mustSend := mustSendStmt(q, vs.cp.DBName())
   498  			if mustSend {
   499  				vevents = append(vevents, &binlogdatapb.VEvent{
   500  					Type: binlogdatapb.VEventType_REPLACE,
   501  					Dml:  q.SQL,
   502  				})
   503  			}
   504  		case sqlparser.StmtBegin:
   505  			vevents = append(vevents, &binlogdatapb.VEvent{
   506  				Type: binlogdatapb.VEventType_BEGIN,
   507  			})
   508  		case sqlparser.StmtCommit:
   509  			vevents = append(vevents, &binlogdatapb.VEvent{
   510  				Type: binlogdatapb.VEventType_COMMIT,
   511  			})
   512  		case sqlparser.StmtDDL:
   513  			if mustSendDDL(q, vs.cp.DBName(), vs.filter) {
   514  				vevents = append(vevents, &binlogdatapb.VEvent{
   515  					Type: binlogdatapb.VEventType_GTID,
   516  					Gtid: mysql.EncodePosition(vs.pos),
   517  				}, &binlogdatapb.VEvent{
   518  					Type:      binlogdatapb.VEventType_DDL,
   519  					Statement: q.SQL,
   520  				})
   521  			} else {
   522  				// If the DDL need not be sent, send a dummy OTHER event.
   523  				vevents = append(vevents, &binlogdatapb.VEvent{
   524  					Type: binlogdatapb.VEventType_GTID,
   525  					Gtid: mysql.EncodePosition(vs.pos),
   526  				}, &binlogdatapb.VEvent{
   527  					Type: binlogdatapb.VEventType_OTHER,
   528  				})
   529  			}
   530  			if schema.MustReloadSchemaOnDDL(q.SQL, vs.cp.DBName()) {
   531  				vs.se.ReloadAt(context.Background(), vs.pos)
   532  			}
   533  		case sqlparser.StmtSavepoint:
   534  			// We currently completely skip `SAVEPOINT ...` statements.
   535  			//
   536  			// MySQL inserts `SAVEPOINT ...` statements into the binlog in row based, statement based
   537  			// and in mixed replication modes, but only ever writes `ROLLBACK TO ...` statements to the
   538  			// binlog in mixed or statement based replication modes. Without `ROLLBACK TO ...` statements,
   539  			// savepoints are side-effect free.
   540  			//
   541  			// Vitess only supports row based replication, so skipping the creation of savepoints
   542  			// reduces the amount of data send over to vplayer.
   543  		case sqlparser.StmtOther, sqlparser.StmtPriv, sqlparser.StmtSet, sqlparser.StmtComment, sqlparser.StmtFlush:
   544  			// These are either:
   545  			// 1) DBA statements like REPAIR that can be ignored.
   546  			// 2) Privilege-altering statements like GRANT/REVOKE
   547  			//    that we want to keep out of the stream for now.
   548  			vevents = append(vevents, &binlogdatapb.VEvent{
   549  				Type: binlogdatapb.VEventType_GTID,
   550  				Gtid: mysql.EncodePosition(vs.pos),
   551  			}, &binlogdatapb.VEvent{
   552  				Type: binlogdatapb.VEventType_OTHER,
   553  			})
   554  		default:
   555  			return nil, fmt.Errorf("unexpected statement type %s in row-based replication: %q", cat, q.SQL)
   556  		}
   557  	case ev.IsTableMap():
   558  		// This is very frequent. It precedes every row event.
   559  		// If it's the first time for a table, we generate a FIELD
   560  		// event, and also cache the plan. Subsequent TableMap events
   561  		// for that table id don't generate VEvents.
   562  		// A schema change will result in a change in table id, which
   563  		// will generate a new plan and FIELD event.
   564  		id := ev.TableID(vs.format)
   565  
   566  		tm, err := ev.TableMap(vs.format)
   567  		if err != nil {
   568  			return nil, err
   569  		}
   570  
   571  		if plan, ok := vs.plans[id]; ok {
   572  			// When the underlying mysql server restarts the table map can change.
   573  			// Usually the vstreamer will also error out when this happens, and vstreamer re-initializes its table map.
   574  			// But if the vstreamer is not aware of the restart, we could get an id that matches one in the cache, but
   575  			// is for a different table. We then invalidate and recompute the plan for this id.
   576  			if plan == nil || plan.Table.Name == tm.Name {
   577  				return nil, nil
   578  			}
   579  			vs.plans[id] = nil
   580  			log.Infof("table map changed: id %d for %s has changed to %s", id, plan.Table.Name, tm.Name)
   581  		}
   582  
   583  		if tm.Database == "_vt" && tm.Name == "resharding_journal" {
   584  			// A journal is a special case that generates a JOURNAL event.
   585  			return nil, vs.buildJournalPlan(id, tm)
   586  		} else if tm.Database == "_vt" && tm.Name == "schema_version" && !vs.se.SkipMetaCheck {
   587  			// Generates a Version event when it detects that a schema is stored in the schema_version table.
   588  			return nil, vs.buildVersionPlan(id, tm)
   589  		}
   590  		if tm.Database != "" && tm.Database != vs.cp.DBName() {
   591  			vs.plans[id] = nil
   592  			return nil, nil
   593  		}
   594  		if vtschema.IsInternalOperationTableName(tm.Name) { // ignore tables created by onlineddl/gh-ost/pt-osc
   595  			vs.plans[id] = nil
   596  			return nil, nil
   597  		}
   598  		if !ruleMatches(tm.Name, vs.filter) {
   599  			return nil, nil
   600  		}
   601  
   602  		vevent, err := vs.buildTablePlan(id, tm)
   603  		if err != nil {
   604  			vs.vse.errorCounts.Add("TablePlan", 1)
   605  			return nil, err
   606  		}
   607  		if vevent != nil {
   608  			vevents = append(vevents, vevent)
   609  		}
   610  	case ev.IsWriteRows() || ev.IsDeleteRows() || ev.IsUpdateRows():
   611  		// The existence of before and after images can be used to
   612  		// identify statement types. It's also possible that the
   613  		// before and after images end up going to different shards.
   614  		// If so, an update will be treated as delete on one shard
   615  		// and insert on the other.
   616  		id := ev.TableID(vs.format)
   617  		plan := vs.plans[id]
   618  		if plan == nil {
   619  			return nil, nil
   620  		}
   621  		rows, err := ev.Rows(vs.format, plan.TableMap)
   622  		if err != nil {
   623  			return nil, err
   624  		}
   625  		if id == vs.journalTableID {
   626  			vevents, err = vs.processJournalEvent(vevents, plan, rows)
   627  		} else if id == vs.versionTableID {
   628  			vs.se.RegisterVersionEvent()
   629  			vevent := &binlogdatapb.VEvent{
   630  				Type: binlogdatapb.VEventType_VERSION,
   631  			}
   632  			vevents = append(vevents, vevent)
   633  
   634  		} else {
   635  			vevents, err = vs.processRowEvent(vevents, plan, rows)
   636  		}
   637  		if err != nil {
   638  			return nil, err
   639  		}
   640  	case ev.IsCompressed():
   641  		log.Errorf("VReplication does not handle binlog compression")
   642  		return nil, fmt.Errorf("VReplication does not handle binlog compression")
   643  	}
   644  	for _, vevent := range vevents {
   645  		vevent.Timestamp = int64(ev.Timestamp())
   646  		vevent.CurrentTime = time.Now().UnixNano()
   647  	}
   648  	return vevents, nil
   649  }
   650  
   651  func (vs *vstreamer) buildJournalPlan(id uint64, tm *mysql.TableMap) error {
   652  	conn, err := vs.cp.Connect(vs.ctx)
   653  	if err != nil {
   654  		return err
   655  	}
   656  	defer conn.Close()
   657  	qr, err := conn.ExecuteFetch("select * from _vt.resharding_journal where 1 != 1", 1, true)
   658  	if err != nil {
   659  		return err
   660  	}
   661  	fields := qr.Fields
   662  	if len(fields) < len(tm.Types) {
   663  		return fmt.Errorf("cannot determine table columns for %s: event has %v, schema has %v", tm.Name, tm.Types, fields)
   664  	}
   665  	table := &Table{
   666  		Name:   "_vt.resharding_journal",
   667  		Fields: fields[:len(tm.Types)],
   668  	}
   669  	// Build a normal table plan, which means, return all rows
   670  	// and columns as is. Special handling is done when we actually
   671  	// receive the row event. We'll build a JOURNAL event instead.
   672  	plan, err := buildREPlan(table, nil, "")
   673  	if err != nil {
   674  		return err
   675  	}
   676  	vs.plans[id] = &streamerPlan{
   677  		Plan:     plan,
   678  		TableMap: tm,
   679  	}
   680  	vs.journalTableID = id
   681  	return nil
   682  }
   683  
   684  func (vs *vstreamer) buildVersionPlan(id uint64, tm *mysql.TableMap) error {
   685  	conn, err := vs.cp.Connect(vs.ctx)
   686  	if err != nil {
   687  		return err
   688  	}
   689  	defer conn.Close()
   690  	qr, err := conn.ExecuteFetch("select * from _vt.schema_version where 1 != 1", 1, true)
   691  	if err != nil {
   692  		return err
   693  	}
   694  	fields := qr.Fields
   695  	if len(fields) < len(tm.Types) {
   696  		return fmt.Errorf("cannot determine table columns for %s: event has %v, schema has %v", tm.Name, tm.Types, fields)
   697  	}
   698  	table := &Table{
   699  		Name:   "_vt.schema_version",
   700  		Fields: fields[:len(tm.Types)],
   701  	}
   702  	// Build a normal table plan, which means, return all rows
   703  	// and columns as is. Special handling is done when we actually
   704  	// receive the row event. We'll build a JOURNAL event instead.
   705  	plan, err := buildREPlan(table, nil, "")
   706  	if err != nil {
   707  		return err
   708  	}
   709  	vs.plans[id] = &streamerPlan{
   710  		Plan:     plan,
   711  		TableMap: tm,
   712  	}
   713  	vs.versionTableID = id
   714  	return nil
   715  }
   716  
   717  func (vs *vstreamer) buildTablePlan(id uint64, tm *mysql.TableMap) (*binlogdatapb.VEvent, error) {
   718  	cols, err := vs.buildTableColumns(tm)
   719  	if err != nil {
   720  		return nil, err
   721  	}
   722  
   723  	table := &Table{
   724  		Name:   tm.Name,
   725  		Fields: cols,
   726  	}
   727  	plan, err := buildPlan(table, vs.vschema, vs.filter)
   728  	if err != nil {
   729  		return nil, err
   730  	}
   731  	if plan == nil {
   732  		vs.plans[id] = nil
   733  		return nil, nil
   734  	}
   735  	vs.plans[id] = &streamerPlan{
   736  		Plan:     plan,
   737  		TableMap: tm,
   738  	}
   739  	return &binlogdatapb.VEvent{
   740  		Type: binlogdatapb.VEventType_FIELD,
   741  		FieldEvent: &binlogdatapb.FieldEvent{
   742  			TableName: plan.Table.Name,
   743  			Fields:    plan.fields(),
   744  			Keyspace:  vs.vse.keyspace,
   745  			Shard:     vs.vse.shard,
   746  		},
   747  	}, nil
   748  }
   749  
   750  func (vs *vstreamer) buildTableColumns(tm *mysql.TableMap) ([]*querypb.Field, error) {
   751  	var fields []*querypb.Field
   752  	for i, typ := range tm.Types {
   753  		t, err := sqltypes.MySQLToType(int64(typ), 0)
   754  		if err != nil {
   755  			return nil, fmt.Errorf("unsupported type: %d, position: %d", typ, i)
   756  		}
   757  		fields = append(fields, &querypb.Field{
   758  			Name: fmt.Sprintf("@%d", i+1),
   759  			Type: t,
   760  		})
   761  	}
   762  	st, err := vs.se.GetTableForPos(sqlparser.NewIdentifierCS(tm.Name), mysql.EncodePosition(vs.pos))
   763  	if err != nil {
   764  		if vs.filter.FieldEventMode == binlogdatapb.Filter_ERR_ON_MISMATCH {
   765  			log.Infof("No schema found for table %s", tm.Name)
   766  			return nil, fmt.Errorf("unknown table %v in schema", tm.Name)
   767  		}
   768  		return fields, nil
   769  	}
   770  
   771  	if len(st.Fields) < len(tm.Types) {
   772  		if vs.filter.FieldEventMode == binlogdatapb.Filter_ERR_ON_MISMATCH {
   773  			log.Infof("Cannot determine columns for table %s", tm.Name)
   774  			return nil, fmt.Errorf("cannot determine table columns for %s: event has %v, schema has %v", tm.Name, tm.Types, st.Fields)
   775  		}
   776  		return fields, nil
   777  	}
   778  
   779  	// check if the schema returned by schema.Engine matches with row.
   780  	for i := range tm.Types {
   781  		if !sqltypes.AreTypesEquivalent(fields[i].Type, st.Fields[i].Type) {
   782  			return fields, nil
   783  		}
   784  	}
   785  
   786  	// Columns should be truncated to match those in tm.
   787  	fields = st.Fields[:len(tm.Types)]
   788  	extColInfos, err := vs.getExtColInfos(tm.Name, tm.Database)
   789  	if err != nil {
   790  		return nil, err
   791  	}
   792  	for _, field := range fields {
   793  		// we want the MySQL column type info so that we can properly handle
   794  		// ambiguous binlog events and other cases where the internal types
   795  		// don't match the MySQL column type. One example being that in binlog
   796  		// events CHAR columns with a binary collation are indistinguishable
   797  		// from BINARY columns.
   798  		if extColInfo, ok := extColInfos[field.Name]; ok {
   799  			field.ColumnType = extColInfo.columnType
   800  		}
   801  	}
   802  	return fields, nil
   803  }
   804  
   805  // additional column attributes from information_schema.columns. Currently only column_type is used, but
   806  // we expect to add more in the future
   807  type extColInfo struct {
   808  	columnType string
   809  }
   810  
   811  func encodeString(in string) string {
   812  	buf := bytes.NewBuffer(nil)
   813  	sqltypes.NewVarChar(in).EncodeSQL(buf)
   814  	return buf.String()
   815  }
   816  
   817  func (vs *vstreamer) getExtColInfos(table, database string) (map[string]*extColInfo, error) {
   818  	extColInfos := make(map[string]*extColInfo)
   819  	conn, err := vs.cp.Connect(vs.ctx)
   820  	if err != nil {
   821  		return nil, err
   822  	}
   823  	defer conn.Close()
   824  	queryTemplate := "select column_name, column_type from information_schema.columns where table_schema=%s and table_name=%s;"
   825  	query := fmt.Sprintf(queryTemplate, encodeString(database), encodeString(table))
   826  	qr, err := conn.ExecuteFetch(query, 10000, false)
   827  	if err != nil {
   828  		return nil, err
   829  	}
   830  	for _, row := range qr.Rows {
   831  		extColInfo := &extColInfo{
   832  			columnType: row[1].ToString(),
   833  		}
   834  		extColInfos[row[0].ToString()] = extColInfo
   835  	}
   836  	return extColInfos, nil
   837  }
   838  
   839  func (vs *vstreamer) processJournalEvent(vevents []*binlogdatapb.VEvent, plan *streamerPlan, rows mysql.Rows) ([]*binlogdatapb.VEvent, error) {
   840  	// Get DbName
   841  	params, err := vs.cp.MysqlParams()
   842  	if err != nil {
   843  		return nil, err
   844  	}
   845  nextrow:
   846  	for _, row := range rows.Rows {
   847  		afterOK, afterValues, err := vs.extractRowAndFilter(plan, row.Data, rows.DataColumns, row.NullColumns)
   848  		if err != nil {
   849  			return nil, err
   850  		}
   851  		if !afterOK {
   852  			// This can happen if someone manually deleted rows.
   853  			continue
   854  		}
   855  		// Exclude events that don't match the db_name.
   856  		for i, fld := range plan.fields() {
   857  			if fld.Name == "db_name" && afterValues[i].ToString() != params.DbName {
   858  				continue nextrow
   859  			}
   860  		}
   861  		for i, fld := range plan.fields() {
   862  			if fld.Name != "val" {
   863  				continue
   864  			}
   865  			journal := &binlogdatapb.Journal{}
   866  			avBytes, err := afterValues[i].ToBytes()
   867  			if err != nil {
   868  				return nil, err
   869  			}
   870  			if err := prototext.Unmarshal(avBytes, journal); err != nil {
   871  				return nil, err
   872  			}
   873  			vevents = append(vevents, &binlogdatapb.VEvent{
   874  				Type:    binlogdatapb.VEventType_JOURNAL,
   875  				Journal: journal,
   876  			})
   877  		}
   878  	}
   879  	return vevents, nil
   880  }
   881  
   882  func (vs *vstreamer) processRowEvent(vevents []*binlogdatapb.VEvent, plan *streamerPlan, rows mysql.Rows) ([]*binlogdatapb.VEvent, error) {
   883  	rowChanges := make([]*binlogdatapb.RowChange, 0, len(rows.Rows))
   884  	for _, row := range rows.Rows {
   885  		beforeOK, beforeValues, err := vs.extractRowAndFilter(plan, row.Identify, rows.IdentifyColumns, row.NullIdentifyColumns)
   886  		if err != nil {
   887  			return nil, err
   888  		}
   889  		afterOK, afterValues, err := vs.extractRowAndFilter(plan, row.Data, rows.DataColumns, row.NullColumns)
   890  		if err != nil {
   891  			return nil, err
   892  		}
   893  		if !beforeOK && !afterOK {
   894  			continue
   895  		}
   896  		rowChange := &binlogdatapb.RowChange{}
   897  		if beforeOK {
   898  			rowChange.Before = sqltypes.RowToProto3(beforeValues)
   899  		}
   900  		if afterOK {
   901  			rowChange.After = sqltypes.RowToProto3(afterValues)
   902  		}
   903  		rowChanges = append(rowChanges, rowChange)
   904  	}
   905  	if len(rowChanges) != 0 {
   906  		vevents = append(vevents, &binlogdatapb.VEvent{
   907  			Type: binlogdatapb.VEventType_ROW,
   908  			RowEvent: &binlogdatapb.RowEvent{
   909  				TableName:  plan.Table.Name,
   910  				RowChanges: rowChanges,
   911  				Keyspace:   vs.vse.keyspace,
   912  				Shard:      vs.vse.shard,
   913  			},
   914  		})
   915  	}
   916  	return vevents, nil
   917  }
   918  
   919  func (vs *vstreamer) rebuildPlans() error {
   920  	for id, plan := range vs.plans {
   921  		if plan == nil {
   922  			// If a table has no plan, a vschema change will not
   923  			// cause that to change.
   924  			continue
   925  		}
   926  		newPlan, err := buildPlan(plan.Table, vs.vschema, vs.filter)
   927  		if err != nil {
   928  			return err
   929  		}
   930  		if newPlan == nil {
   931  			continue
   932  		}
   933  		vs.plans[id] = &streamerPlan{
   934  			Plan:     newPlan,
   935  			TableMap: plan.TableMap,
   936  		}
   937  	}
   938  	return nil
   939  }
   940  
   941  func (vs *vstreamer) extractRowAndFilter(plan *streamerPlan, data []byte, dataColumns, nullColumns mysql.Bitmap) (bool, []sqltypes.Value, error) {
   942  	if len(data) == 0 {
   943  		return false, nil, nil
   944  	}
   945  	values := make([]sqltypes.Value, dataColumns.Count())
   946  	charsets := make([]collations.ID, len(values))
   947  	valueIndex := 0
   948  	pos := 0
   949  	for colNum := 0; colNum < dataColumns.Count(); colNum++ {
   950  		if !dataColumns.Bit(colNum) {
   951  			return false, nil, fmt.Errorf("partial row image encountered: ensure binlog_row_image is set to 'full'")
   952  		}
   953  		if nullColumns.Bit(valueIndex) {
   954  			valueIndex++
   955  			continue
   956  		}
   957  		value, l, err := mysql.CellValue(data, pos, plan.TableMap.Types[colNum], plan.TableMap.Metadata[colNum], plan.Table.Fields[colNum])
   958  		if err != nil {
   959  			log.Errorf("extractRowAndFilter: %s, table: %s, colNum: %d, fields: %+v, current values: %+v",
   960  				err, plan.Table.Name, colNum, plan.Table.Fields, values)
   961  			return false, nil, err
   962  		}
   963  		pos += l
   964  
   965  		charsets[colNum] = collations.ID(plan.Table.Fields[colNum].Charset)
   966  		values[colNum] = value
   967  		valueIndex++
   968  	}
   969  	filtered := make([]sqltypes.Value, len(plan.ColExprs))
   970  	ok, err := plan.filter(values, filtered, charsets)
   971  	return ok, filtered, err
   972  }
   973  
   974  func wrapError(err error, stopPos mysql.Position, vse *Engine) error {
   975  	if err != nil {
   976  		vse.vstreamersEndedWithErrors.Add(1)
   977  		vse.errorCounts.Add("StreamEnded", 1)
   978  		err = fmt.Errorf("stream (at source tablet) error @ %v: %v", stopPos, err)
   979  		log.Error(err)
   980  		return err
   981  	}
   982  	log.Infof("stream (at source tablet) ended @ %v", stopPos)
   983  	return nil
   984  }