vitess.io/vitess@v0.16.2/go/vt/vtctl/workflow/server.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8  	http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package workflow
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"sort"
    24  	"strings"
    25  	"sync"
    26  	"time"
    27  
    28  	"google.golang.org/protobuf/encoding/prototext"
    29  	"k8s.io/apimachinery/pkg/util/sets"
    30  
    31  	"vitess.io/vitess/go/sqltypes"
    32  	"vitess.io/vitess/go/trace"
    33  	"vitess.io/vitess/go/vt/concurrency"
    34  	"vitess.io/vitess/go/vt/key"
    35  	"vitess.io/vitess/go/vt/log"
    36  	"vitess.io/vitess/go/vt/topo"
    37  	"vitess.io/vitess/go/vt/vtctl/workflow/vexec"
    38  	"vitess.io/vitess/go/vt/vtgate/evalengine"
    39  	"vitess.io/vitess/go/vt/vttablet/tmclient"
    40  
    41  	binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
    42  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    43  	vtctldatapb "vitess.io/vitess/go/vt/proto/vtctldata"
    44  	"vitess.io/vitess/go/vt/proto/vttime"
    45  )
    46  
var (
	// ErrInvalidWorkflow is a catchall error type for conditions that should be
	// impossible when operating on a workflow. GetWorkflows wraps it (via %w)
	// when a scanned workflow is missing its source/target keyspace or shards,
	// or its tracked vreplication lag, so callers can match it with errors.Is.
	ErrInvalidWorkflow = errors.New("invalid workflow")
	// ErrMultipleSourceKeyspaces occurs when a workflow somehow has multiple
	// source keyspaces across different shard primaries. This should be
	// impossible. It is returned wrapped (via %w) with the workflow name and
	// the two conflicting keyspaces.
	ErrMultipleSourceKeyspaces = errors.New("multiple source keyspaces for a single workflow")
	// ErrMultipleTargetKeyspaces occurs when a workflow somehow has multiple
	// target keyspaces across different shard primaries. This should be
	// impossible. It is returned wrapped (via %w) with the workflow name and
	// the two conflicting keyspaces.
	ErrMultipleTargetKeyspaces = errors.New("multiple target keyspaces for a single workflow")
)
    60  
// Server provides an API to work with Vitess workflows, like vreplication
// workflows (MoveTables, Reshard, etc) and schema migration workflows.
//
// NB: This is in alpha, and you probably don't want to depend on it (yet!).
// Currently, it provides only a read-only API to vreplication workflows. Write
// actions on vreplication workflows, and schema migration workflows entirely,
// are not yet supported, but planned.
type Server struct {
	// ts is the topology server handle, used in this file to look up cell
	// names, SrvKeyspace and SrvVSchema records, and shard records.
	ts  *topo.Server
	// tmc is the tablet manager client, used in this file to run
	// VReplicationExec queries against individual tablets.
	tmc tmclient.TabletManagerClient
}
    72  
    73  // NewServer returns a new server instance with the given topo.Server and
    74  // TabletManagerClient.
    75  func NewServer(ts *topo.Server, tmc tmclient.TabletManagerClient) *Server {
    76  	return &Server{
    77  		ts:  ts,
    78  		tmc: tmc,
    79  	}
    80  }
    81  
    82  // CheckReshardingJournalExistsOnTablet returns the journal (or an empty
    83  // journal) and a boolean to indicate if the resharding_journal table exists on
    84  // the given tablet.
    85  //
    86  // (TODO:@ajm188) This should not be part of the final public API, and should
    87  // be un-exported after all places in package wrangler that call this have been
    88  // migrated over.
    89  func (s *Server) CheckReshardingJournalExistsOnTablet(ctx context.Context, tablet *topodatapb.Tablet, migrationID int64) (*binlogdatapb.Journal, bool, error) {
    90  	var (
    91  		journal binlogdatapb.Journal
    92  		exists  bool
    93  	)
    94  
    95  	query := fmt.Sprintf("select val from _vt.resharding_journal where id=%v", migrationID)
    96  	p3qr, err := s.tmc.VReplicationExec(ctx, tablet, query)
    97  	if err != nil {
    98  		return nil, false, err
    99  	}
   100  
   101  	if len(p3qr.Rows) != 0 {
   102  		qr := sqltypes.Proto3ToResult(p3qr)
   103  		qrBytes, err := qr.Rows[0][0].ToBytes()
   104  		if err != nil {
   105  			return nil, false, err
   106  		}
   107  		if err := prototext.Unmarshal(qrBytes, &journal); err != nil {
   108  			return nil, false, err
   109  		}
   110  
   111  		exists = true
   112  	}
   113  
   114  	return &journal, exists, nil
   115  }
   116  
   117  // GetCellsWithShardReadsSwitched returns the topo cells partitioned into two
   118  // slices: one with the cells where shard reads have been switched for the given
   119  // tablet type and one with the cells where shard reads have not been switched
   120  // for the given tablet type.
   121  //
   122  // This function is for use in Reshard, and "switched reads" is defined as if
   123  // any one of the source shards has the query service disabled in its tablet
   124  // control record.
   125  func (s *Server) GetCellsWithShardReadsSwitched(
   126  	ctx context.Context,
   127  	keyspace string,
   128  	si *topo.ShardInfo,
   129  	tabletType topodatapb.TabletType,
   130  ) (cellsSwitched []string, cellsNotSwitched []string, err error) {
   131  	cells, err := s.ts.GetCellInfoNames(ctx)
   132  	if err != nil {
   133  		return nil, nil, err
   134  	}
   135  
   136  	for _, cell := range cells {
   137  		srvks, err := s.ts.GetSrvKeyspace(ctx, cell, keyspace)
   138  		if err != nil {
   139  			return nil, nil, err
   140  		}
   141  
   142  		// Checking one shard is enough.
   143  		var (
   144  			shardServedTypes []string
   145  			found            bool
   146  			noControls       bool
   147  		)
   148  
   149  		for _, partition := range srvks.GetPartitions() {
   150  			if tabletType != partition.GetServedType() {
   151  				continue
   152  			}
   153  
   154  			// If reads and writes are both switched it is possible that the
   155  			// shard is not in the partition table.
   156  			for _, shardReference := range partition.GetShardReferences() {
   157  				if key.KeyRangeEqual(shardReference.GetKeyRange(), si.GetKeyRange()) {
   158  					found = true
   159  					break
   160  				}
   161  			}
   162  
   163  			// It is possible that there are no tablet controls if the target
   164  			// shards are not yet serving, or once reads and writes are both
   165  			// switched.
   166  			if len(partition.GetShardTabletControls()) == 0 {
   167  				noControls = true
   168  				break
   169  			}
   170  
   171  			for _, tabletControl := range partition.GetShardTabletControls() {
   172  				if key.KeyRangeEqual(tabletControl.GetKeyRange(), si.GetKeyRange()) {
   173  					if !tabletControl.GetQueryServiceDisabled() {
   174  						shardServedTypes = append(shardServedTypes, si.ShardName())
   175  					}
   176  
   177  					break
   178  				}
   179  			}
   180  		}
   181  
   182  		if found && (len(shardServedTypes) > 0 || noControls) {
   183  			cellsNotSwitched = append(cellsNotSwitched, cell)
   184  		} else {
   185  			cellsSwitched = append(cellsSwitched, cell)
   186  		}
   187  	}
   188  
   189  	return cellsSwitched, cellsNotSwitched, nil
   190  }
   191  
   192  // GetCellsWithTableReadsSwitched returns the topo cells partitioned into two
   193  // slices: one with the cells where table reads have been switched for the given
   194  // tablet type and one with the cells where table reads have not been switched
   195  // for the given tablet type.
   196  //
   197  // This function is for use in MoveTables, and "switched reads" is defined as if
   198  // the routing rule for a (table, tablet_type) is pointing to the target
   199  // keyspace.
   200  func (s *Server) GetCellsWithTableReadsSwitched(
   201  	ctx context.Context,
   202  	keyspace string,
   203  	table string,
   204  	tabletType topodatapb.TabletType,
   205  ) (cellsSwitched []string, cellsNotSwitched []string, err error) {
   206  	cells, err := s.ts.GetCellInfoNames(ctx)
   207  	if err != nil {
   208  		return nil, nil, err
   209  	}
   210  
   211  	getKeyspace := func(ruleTarget string) (string, error) {
   212  		arr := strings.Split(ruleTarget, ".")
   213  		if len(arr) != 2 {
   214  			return "", fmt.Errorf("rule target is not correctly formatted: %s", ruleTarget)
   215  		}
   216  
   217  		return arr[0], nil
   218  	}
   219  
   220  	for _, cell := range cells {
   221  		srvVSchema, err := s.ts.GetSrvVSchema(ctx, cell)
   222  		if err != nil {
   223  			return nil, nil, err
   224  		}
   225  
   226  		var (
   227  			found    bool
   228  			switched bool
   229  		)
   230  
   231  		for _, rule := range srvVSchema.RoutingRules.Rules {
   232  			ruleName := fmt.Sprintf("%s.%s@%s", keyspace, table, strings.ToLower(tabletType.String()))
   233  			if rule.FromTable == ruleName {
   234  				found = true
   235  
   236  				for _, to := range rule.ToTables {
   237  					ks, err := getKeyspace(to)
   238  					if err != nil {
   239  						log.Errorf(err.Error())
   240  						return nil, nil, err
   241  					}
   242  
   243  					if ks == keyspace {
   244  						switched = true
   245  						break // if one table in the workflow switched, we are done.
   246  					}
   247  				}
   248  			}
   249  
   250  			if found {
   251  				break
   252  			}
   253  		}
   254  
   255  		if switched {
   256  			cellsSwitched = append(cellsSwitched, cell)
   257  		} else {
   258  			cellsNotSwitched = append(cellsNotSwitched, cell)
   259  		}
   260  	}
   261  
   262  	return cellsSwitched, cellsNotSwitched, nil
   263  }
   264  
   265  // GetWorkflows returns a list of all workflows that exist in a given keyspace,
   266  // with some additional filtering depending on the request parameters (for
   267  // example, ActiveOnly=true restricts the search to only workflows that are
   268  // currently running).
   269  //
   270  // It has the same signature as the vtctlservicepb.VtctldServer's GetWorkflows
   271  // rpc, and grpcvtctldserver delegates to this function.
   272  func (s *Server) GetWorkflows(ctx context.Context, req *vtctldatapb.GetWorkflowsRequest) (*vtctldatapb.GetWorkflowsResponse, error) {
   273  	span, ctx := trace.NewSpan(ctx, "workflow.Server.GetWorkflows")
   274  	defer span.Finish()
   275  
   276  	span.Annotate("keyspace", req.Keyspace)
   277  	span.Annotate("active_only", req.ActiveOnly)
   278  
   279  	where := ""
   280  	if req.ActiveOnly {
   281  		where = "WHERE state <> 'Stopped'"
   282  	}
   283  
   284  	query := fmt.Sprintf(`
   285  		SELECT
   286  			id,
   287  			workflow,
   288  			source,
   289  			pos,
   290  			stop_pos,
   291  			max_replication_lag,
   292  			state,
   293  			db_name,
   294  			time_updated,
   295  			transaction_timestamp,
   296  			message,
   297  			tags,
   298  			workflow_type,
   299  			workflow_sub_type
   300  		FROM
   301  			_vt.vreplication
   302  		%s`,
   303  		where,
   304  	)
   305  
   306  	vx := vexec.NewVExec(req.Keyspace, "", s.ts, s.tmc)
   307  	results, err := vx.QueryContext(ctx, query)
   308  	if err != nil {
   309  		return nil, err
   310  	}
   311  
   312  	m := sync.Mutex{} // guards access to the following maps during concurrent calls to scanWorkflow
   313  	workflowsMap := make(map[string]*vtctldatapb.Workflow, len(results))
   314  	sourceKeyspaceByWorkflow := make(map[string]string, len(results))
   315  	sourceShardsByWorkflow := make(map[string]sets.Set[string], len(results))
   316  	targetKeyspaceByWorkflow := make(map[string]string, len(results))
   317  	targetShardsByWorkflow := make(map[string]sets.Set[string], len(results))
   318  	maxVReplicationLagByWorkflow := make(map[string]float64, len(results))
   319  
   320  	// We guarantee the following invariants when this function is called for a
   321  	// given workflow:
   322  	// - workflow.Name != "" (more precisely, ".Name is set 'properly'")
   323  	// - workflowsMap[workflow.Name] == workflow
   324  	// - sourceShardsByWorkflow[workflow.Name] != nil
   325  	// - targetShardsByWorkflow[workflow.Name] != nil
   326  	// - workflow.ShardStatuses != nil
   327  	scanWorkflow := func(ctx context.Context, workflow *vtctldatapb.Workflow, row sqltypes.RowNamedValues, tablet *topo.TabletInfo) error {
   328  		span, ctx := trace.NewSpan(ctx, "workflow.Server.scanWorkflow")
   329  		defer span.Finish()
   330  
   331  		span.Annotate("keyspace", req.Keyspace)
   332  		span.Annotate("shard", tablet.Shard)
   333  		span.Annotate("active_only", req.ActiveOnly)
   334  		span.Annotate("workflow", workflow.Name)
   335  		span.Annotate("tablet_alias", tablet.AliasString())
   336  
   337  		id, err := evalengine.ToInt64(row["id"])
   338  		if err != nil {
   339  			return err
   340  		}
   341  
   342  		var bls binlogdatapb.BinlogSource
   343  		rowBytes, err := row["source"].ToBytes()
   344  		if err != nil {
   345  			return err
   346  		}
   347  		if err := prototext.Unmarshal(rowBytes, &bls); err != nil {
   348  			return err
   349  		}
   350  
   351  		pos := row["pos"].ToString()
   352  		stopPos := row["stop_pos"].ToString()
   353  		state := row["state"].ToString()
   354  		dbName := row["db_name"].ToString()
   355  
   356  		timeUpdatedSeconds, err := evalengine.ToInt64(row["time_updated"])
   357  		if err != nil {
   358  			return err
   359  		}
   360  
   361  		transactionTimeSeconds, err := evalengine.ToInt64(row["transaction_timestamp"])
   362  		if err != nil {
   363  			return err
   364  		}
   365  
   366  		message := row["message"].ToString()
   367  
   368  		tags := row["tags"].ToString()
   369  		var tagArray []string
   370  		if tags != "" {
   371  			tagArray = strings.Split(tags, ",")
   372  		}
   373  		workflowType, _ := row["workflow_type"].ToInt64()
   374  		workflowSubType, _ := row["workflow_sub_type"].ToInt64()
   375  		stream := &vtctldatapb.Workflow_Stream{
   376  			Id:           id,
   377  			Shard:        tablet.Shard,
   378  			Tablet:       tablet.Alias,
   379  			BinlogSource: &bls,
   380  			Position:     pos,
   381  			StopPosition: stopPos,
   382  			State:        state,
   383  			DbName:       dbName,
   384  			TransactionTimestamp: &vttime.Time{
   385  				Seconds: transactionTimeSeconds,
   386  			},
   387  			TimeUpdated: &vttime.Time{
   388  				Seconds: timeUpdatedSeconds,
   389  			},
   390  			Message: message,
   391  			Tags:    tagArray,
   392  		}
   393  		workflow.WorkflowType = binlogdatapb.VReplicationWorkflowType_name[int32(workflowType)]
   394  		workflow.WorkflowSubType = binlogdatapb.VReplicationWorkflowSubType_name[int32(workflowSubType)]
   395  		stream.CopyStates, err = s.getWorkflowCopyStates(ctx, tablet, id)
   396  		if err != nil {
   397  			return err
   398  		}
   399  
   400  		span.Annotate("num_copy_states", len(stream.CopyStates))
   401  
   402  		switch {
   403  		case strings.Contains(strings.ToLower(stream.Message), "error"):
   404  			stream.State = "Error"
   405  		case stream.State == "Running" && len(stream.CopyStates) > 0:
   406  			stream.State = "Copying"
   407  		case stream.State == "Running" && int64(time.Now().Second())-timeUpdatedSeconds > 10:
   408  			stream.State = "Lagging"
   409  		}
   410  
   411  		// At this point, we're going to start modifying the maps defined
   412  		// outside this function, as well as fields on the passed-in Workflow
   413  		// pointer. Since we're running concurrently, take the lock.
   414  		//
   415  		// We've already made the remote call to getCopyStates, so synchronizing
   416  		// here shouldn't hurt too badly, performance-wise.
   417  		m.Lock()
   418  		defer m.Unlock()
   419  
   420  		shardStreamKey := fmt.Sprintf("%s/%s", tablet.Shard, tablet.AliasString())
   421  		shardStream, ok := workflow.ShardStreams[shardStreamKey]
   422  		if !ok {
   423  			ctx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
   424  			defer cancel()
   425  
   426  			si, err := s.ts.GetShard(ctx, req.Keyspace, tablet.Shard)
   427  			if err != nil {
   428  				return err
   429  			}
   430  
   431  			shardStream = &vtctldatapb.Workflow_ShardStream{
   432  				Streams:          nil,
   433  				TabletControls:   si.TabletControls,
   434  				IsPrimaryServing: si.IsPrimaryServing,
   435  			}
   436  
   437  			workflow.ShardStreams[shardStreamKey] = shardStream
   438  		}
   439  
   440  		shardStream.Streams = append(shardStream.Streams, stream)
   441  		sourceShardsByWorkflow[workflow.Name].Insert(stream.BinlogSource.Shard)
   442  		targetShardsByWorkflow[workflow.Name].Insert(tablet.Shard)
   443  
   444  		if ks, ok := sourceKeyspaceByWorkflow[workflow.Name]; ok && ks != stream.BinlogSource.Keyspace {
   445  			return fmt.Errorf("%w: workflow = %v, ks1 = %v, ks2 = %v", ErrMultipleSourceKeyspaces, workflow.Name, ks, stream.BinlogSource.Keyspace)
   446  		}
   447  
   448  		sourceKeyspaceByWorkflow[workflow.Name] = stream.BinlogSource.Keyspace
   449  
   450  		if ks, ok := targetKeyspaceByWorkflow[workflow.Name]; ok && ks != tablet.Keyspace {
   451  			return fmt.Errorf("%w: workflow = %v, ks1 = %v, ks2 = %v", ErrMultipleTargetKeyspaces, workflow.Name, ks, tablet.Keyspace)
   452  		}
   453  
   454  		targetKeyspaceByWorkflow[workflow.Name] = tablet.Keyspace
   455  
   456  		timeUpdated := time.Unix(timeUpdatedSeconds, 0)
   457  		vreplicationLag := time.Since(timeUpdated)
   458  
   459  		if currentMaxLag, ok := maxVReplicationLagByWorkflow[workflow.Name]; ok {
   460  			if vreplicationLag.Seconds() > currentMaxLag {
   461  				maxVReplicationLagByWorkflow[workflow.Name] = vreplicationLag.Seconds()
   462  			}
   463  		} else {
   464  			maxVReplicationLagByWorkflow[workflow.Name] = vreplicationLag.Seconds()
   465  		}
   466  
   467  		return nil
   468  	}
   469  
   470  	var (
   471  		scanWorkflowWg     sync.WaitGroup
   472  		scanWorkflowErrors concurrency.FirstErrorRecorder
   473  	)
   474  
   475  	for tablet, result := range results {
   476  		qr := sqltypes.Proto3ToResult(result)
   477  
   478  		// In the old implementation, we knew we had at most one (0 <= N <= 1)
   479  		// workflow for each shard primary we queried. There might be multiple
   480  		// rows (streams) comprising that workflow, so we would aggregate the
   481  		// rows for a given primary into a single value ("the workflow",
   482  		// ReplicationStatusResult in the old types).
   483  		//
   484  		// In this version, we have many (N >= 0) workflows for each shard
   485  		// primary we queried, so we need to determine if each row corresponds
   486  		// to a workflow we're already aggregating, or if it's a workflow we
   487  		// haven't seen yet for that shard primary. We use the workflow name to
   488  		// dedupe for this.
   489  		for _, row := range qr.Named().Rows {
   490  			workflowName := row["workflow"].ToString()
   491  			workflow, ok := workflowsMap[workflowName]
   492  			if !ok {
   493  				workflow = &vtctldatapb.Workflow{
   494  					Name:         workflowName,
   495  					ShardStreams: map[string]*vtctldatapb.Workflow_ShardStream{},
   496  				}
   497  
   498  				workflowsMap[workflowName] = workflow
   499  				sourceShardsByWorkflow[workflowName] = sets.New[string]()
   500  				targetShardsByWorkflow[workflowName] = sets.New[string]()
   501  			}
   502  
   503  			scanWorkflowWg.Add(1)
   504  			go func(ctx context.Context, workflow *vtctldatapb.Workflow, row sqltypes.RowNamedValues, tablet *topo.TabletInfo) {
   505  				defer scanWorkflowWg.Done()
   506  				if err := scanWorkflow(ctx, workflow, row, tablet); err != nil {
   507  					scanWorkflowErrors.RecordError(err)
   508  				}
   509  			}(ctx, workflow, row, tablet)
   510  		}
   511  	}
   512  
   513  	scanWorkflowWg.Wait()
   514  	if scanWorkflowErrors.HasErrors() {
   515  		return nil, scanWorkflowErrors.Error()
   516  	}
   517  
   518  	var (
   519  		fetchLogsWG  sync.WaitGroup
   520  		vrepLogQuery = strings.TrimSpace(`
   521  SELECT
   522  	id,
   523  	vrepl_id,
   524  	type,
   525  	state,
   526  	message,
   527  	created_at,
   528  	updated_at,
   529  	count
   530  FROM
   531  	_vt.vreplication_log
   532  ORDER BY
   533  	vrepl_id ASC,
   534  	id ASC
   535  `)
   536  	)
   537  
   538  	fetchStreamLogs := func(ctx context.Context, workflow *vtctldatapb.Workflow) {
   539  		span, ctx := trace.NewSpan(ctx, "workflow.Server.scanWorkflow")
   540  		defer span.Finish()
   541  
   542  		span.Annotate("keyspace", req.Keyspace)
   543  		span.Annotate("workflow", workflow.Name)
   544  
   545  		results, err := vx.WithWorkflow(workflow.Name).QueryContext(ctx, vrepLogQuery)
   546  		if err != nil {
   547  			// Note that we do not return here. If there are any query results
   548  			// in the map (i.e. some tablets returned successfully), we will
   549  			// still try to read log rows from them on a best-effort basis. But,
   550  			// we will also pre-emptively record the top-level fetch error on
   551  			// every stream in every shard in the workflow. Further processing
   552  			// below may override the error message for certain streams.
   553  			for _, streams := range workflow.ShardStreams {
   554  				for _, stream := range streams.Streams {
   555  					stream.LogFetchError = err.Error()
   556  				}
   557  			}
   558  		}
   559  
   560  		for target, p3qr := range results {
   561  			qr := sqltypes.Proto3ToResult(p3qr)
   562  			shardStreamKey := fmt.Sprintf("%s/%s", target.Shard, target.AliasString())
   563  
   564  			ss, ok := workflow.ShardStreams[shardStreamKey]
   565  			if !ok || ss == nil {
   566  				continue
   567  			}
   568  
   569  			streams := ss.Streams
   570  			streamIdx := 0
   571  			markErrors := func(err error) {
   572  				if streamIdx >= len(streams) {
   573  					return
   574  				}
   575  
   576  				streams[streamIdx].LogFetchError = err.Error()
   577  			}
   578  
   579  			for _, row := range qr.Rows {
   580  				id, err := evalengine.ToInt64(row[0])
   581  				if err != nil {
   582  					markErrors(err)
   583  					continue
   584  				}
   585  
   586  				streamID, err := evalengine.ToInt64(row[1])
   587  				if err != nil {
   588  					markErrors(err)
   589  					continue
   590  				}
   591  
   592  				typ := row[2].ToString()
   593  				state := row[3].ToString()
   594  				message := row[4].ToString()
   595  
   596  				createdAt, err := time.Parse("2006-01-02 15:04:05", row[5].ToString())
   597  				if err != nil {
   598  					markErrors(err)
   599  					continue
   600  				}
   601  
   602  				updatedAt, err := time.Parse("2006-01-02 15:04:05", row[6].ToString())
   603  				if err != nil {
   604  					markErrors(err)
   605  					continue
   606  				}
   607  
   608  				count, err := evalengine.ToInt64(row[7])
   609  				if err != nil {
   610  					markErrors(err)
   611  					continue
   612  				}
   613  
   614  				streamLog := &vtctldatapb.Workflow_Stream_Log{
   615  					Id:       id,
   616  					StreamId: streamID,
   617  					Type:     typ,
   618  					State:    state,
   619  					CreatedAt: &vttime.Time{
   620  						Seconds: createdAt.Unix(),
   621  					},
   622  					UpdatedAt: &vttime.Time{
   623  						Seconds: updatedAt.Unix(),
   624  					},
   625  					Message: message,
   626  					Count:   count,
   627  				}
   628  
   629  				// Earlier, in the main loop where we called scanWorkflow for
   630  				// each _vt.vreplication row, we also sorted each ShardStreams
   631  				// slice by ascending id, and our _vt.vreplication_log query
   632  				// ordered by (stream_id ASC, id ASC), so we can walk the
   633  				// streams in index order in O(n) amortized over all the rows
   634  				// for this tablet.
   635  				for streamIdx < len(streams) {
   636  					stream := streams[streamIdx]
   637  					if stream.Id < streamLog.StreamId {
   638  						streamIdx++
   639  						continue
   640  					}
   641  
   642  					if stream.Id > streamLog.StreamId {
   643  						log.Warningf("Found stream log for nonexistent stream: %+v", streamLog)
   644  						break
   645  					}
   646  
   647  					// stream.Id == streamLog.StreamId
   648  					stream.Logs = append(stream.Logs, streamLog)
   649  					break
   650  				}
   651  			}
   652  		}
   653  	}
   654  
   655  	workflows := make([]*vtctldatapb.Workflow, 0, len(workflowsMap))
   656  
   657  	for name, workflow := range workflowsMap {
   658  		sourceShards, ok := sourceShardsByWorkflow[name]
   659  		if !ok {
   660  			return nil, fmt.Errorf("%w: %s has no source shards", ErrInvalidWorkflow, name)
   661  		}
   662  
   663  		sourceKeyspace, ok := sourceKeyspaceByWorkflow[name]
   664  		if !ok {
   665  			return nil, fmt.Errorf("%w: %s has no source keyspace", ErrInvalidWorkflow, name)
   666  		}
   667  
   668  		targetShards, ok := targetShardsByWorkflow[name]
   669  		if !ok {
   670  			return nil, fmt.Errorf("%w: %s has no target shards", ErrInvalidWorkflow, name)
   671  		}
   672  
   673  		targetKeyspace, ok := targetKeyspaceByWorkflow[name]
   674  		if !ok {
   675  			return nil, fmt.Errorf("%w: %s has no target keyspace", ErrInvalidWorkflow, name)
   676  		}
   677  
   678  		maxVReplicationLag, ok := maxVReplicationLagByWorkflow[name]
   679  		if !ok {
   680  			return nil, fmt.Errorf("%w: %s has no tracked vreplication lag", ErrInvalidWorkflow, name)
   681  		}
   682  
   683  		workflow.Source = &vtctldatapb.Workflow_ReplicationLocation{
   684  			Keyspace: sourceKeyspace,
   685  			Shards:   sets.List(sourceShards),
   686  		}
   687  
   688  		workflow.Target = &vtctldatapb.Workflow_ReplicationLocation{
   689  			Keyspace: targetKeyspace,
   690  			Shards:   sets.List(targetShards),
   691  		}
   692  
   693  		workflow.MaxVReplicationLag = int64(maxVReplicationLag)
   694  
   695  		// Sort shard streams by stream_id ASC, to support an optimization
   696  		// in fetchStreamLogs below.
   697  		for _, shardStreams := range workflow.ShardStreams {
   698  			sort.Slice(shardStreams.Streams, func(i, j int) bool {
   699  				return shardStreams.Streams[i].Id < shardStreams.Streams[j].Id
   700  			})
   701  		}
   702  
   703  		workflows = append(workflows, workflow)
   704  
   705  		// Fetch logs for all streams associated with this workflow in the background.
   706  		fetchLogsWG.Add(1)
   707  		go func(ctx context.Context, workflow *vtctldatapb.Workflow) {
   708  			defer fetchLogsWG.Done()
   709  			fetchStreamLogs(ctx, workflow)
   710  		}(ctx, workflow)
   711  	}
   712  
   713  	// Wait for all the log fetchers to finish.
   714  	fetchLogsWG.Wait()
   715  
   716  	return &vtctldatapb.GetWorkflowsResponse{
   717  		Workflows: workflows,
   718  	}, nil
   719  }
   720  
   721  func (s *Server) getWorkflowCopyStates(ctx context.Context, tablet *topo.TabletInfo, id int64) ([]*vtctldatapb.Workflow_Stream_CopyState, error) {
   722  	span, ctx := trace.NewSpan(ctx, "workflow.Server.getWorkflowCopyStates")
   723  	defer span.Finish()
   724  
   725  	span.Annotate("keyspace", tablet.Keyspace)
   726  	span.Annotate("shard", tablet.Shard)
   727  	span.Annotate("tablet_alias", tablet.AliasString())
   728  	span.Annotate("vrepl_id", id)
   729  
   730  	query := fmt.Sprintf("select table_name, lastpk from _vt.copy_state where vrepl_id = %d and id in (select max(id) from _vt.copy_state where vrepl_id = %d group by vrepl_id, table_name)", id, id)
   731  	qr, err := s.tmc.VReplicationExec(ctx, tablet.Tablet, query)
   732  	if err != nil {
   733  		return nil, err
   734  	}
   735  
   736  	result := sqltypes.Proto3ToResult(qr)
   737  	if result == nil {
   738  		return nil, nil
   739  	}
   740  
   741  	copyStates := make([]*vtctldatapb.Workflow_Stream_CopyState, len(result.Rows))
   742  	for i, row := range result.Rows {
   743  		// These fields are technically varbinary, but this is close enough.
   744  		copyStates[i] = &vtctldatapb.Workflow_Stream_CopyState{
   745  			Table:  row[0].ToString(),
   746  			LastPk: row[1].ToString(),
   747  		}
   748  	}
   749  
   750  	return copyStates, nil
   751  }