github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/framework/statusutil/writer.go (about)

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package statusutil
    15  
    16  import (
    17  	"context"
    18  	"time"
    19  
    20  	"github.com/pingcap/log"
    21  	"github.com/pingcap/tiflow/engine/framework/internal/worker"
    22  	frameModel "github.com/pingcap/tiflow/engine/framework/model"
    23  	pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm"
    24  	"github.com/pingcap/tiflow/engine/pkg/p2p"
    25  	"github.com/pingcap/tiflow/pkg/errors"
    26  	"github.com/pingcap/tiflow/pkg/retry"
    27  	"go.uber.org/zap"
    28  	"golang.org/x/time/rate"
    29  )
    30  
    31  // Writer is used to persist WorkerStatus changes and send notifications
    32  // to the Master.
    33  type Writer struct {
    34  	metaclient    pkgOrm.Client
    35  	messageSender p2p.MessageSender
    36  	lastStatus    frameModel.WorkerStatus
    37  
    38  	workerID   frameModel.WorkerID
    39  	masterInfo worker.MasterInfoProvider
    40  }
    41  
    42  // NewWriter creates a new Writer.
    43  func NewWriter(
    44  	metaclient pkgOrm.Client,
    45  	messageSender p2p.MessageSender,
    46  	masterInfo worker.MasterInfoProvider,
    47  	workerID frameModel.WorkerID,
    48  ) *Writer {
    49  	return &Writer{
    50  		metaclient:    metaclient,
    51  		messageSender: messageSender,
    52  		masterInfo:    masterInfo,
    53  		workerID:      workerID,
    54  	}
    55  }
    56  
    57  // UpdateStatus checks if newStatus.HasSignificantChange() is true, if so, it persists the change and
    58  // tries to send a notification. Note that sending the notification is asynchronous.
    59  func (w *Writer) UpdateStatus(ctx context.Context, newStatus *frameModel.WorkerStatus) (retErr error) {
    60  	defer func() {
    61  		if retErr == nil {
    62  			return
    63  		}
    64  		log.Warn("UpdateStatus failed",
    65  			zap.String("worker-id", w.workerID),
    66  			zap.String("master-id", w.masterInfo.MasterID()),
    67  			zap.String("master-node", w.masterInfo.MasterNode()),
    68  			zap.Int64("master-epoch", w.masterInfo.Epoch()),
    69  			zap.Error(retErr))
    70  	}()
    71  
    72  	if w.lastStatus.HasSignificantChange(newStatus) {
    73  		// Status has changed, so we need to persist the status.
    74  		if err := w.persistStatus(ctx, newStatus); err != nil {
    75  			return err
    76  		}
    77  	}
    78  
    79  	w.lastStatus = *newStatus
    80  
    81  	// TODO replace the timeout with a variable.
    82  	return w.sendStatusMessageWithRetry(ctx, 15*time.Second, newStatus)
    83  }
    84  
    85  func (w *Writer) sendStatusMessageWithRetry(
    86  	ctx context.Context, timeout time.Duration, newStatus *frameModel.WorkerStatus,
    87  ) error {
    88  	// NOTE we need this function especially to handle the situation where
    89  	// the p2p connection to the target executor is not established yet.
    90  	// We might need one or two retries when our executor has just started up.
    91  
    92  	retryCtx, cancel := context.WithTimeout(ctx, timeout)
    93  	defer cancel()
    94  
    95  	rl := rate.NewLimiter(rate.Every(100*time.Millisecond), 1)
    96  	for {
    97  		select {
    98  		case <-retryCtx.Done():
    99  			return errors.Trace(retryCtx.Err())
   100  		default:
   101  		}
   102  
   103  		if err := rl.Wait(retryCtx); err != nil {
   104  			return errors.Trace(err)
   105  		}
   106  
   107  		topic := WorkerStatusTopic(w.masterInfo.MasterID())
   108  		// NOTE: We must read the MasterNode() in each retry in case the master is failed over.
   109  		err := w.messageSender.SendToNodeB(ctx, w.masterInfo.MasterNode(), topic, &WorkerStatusMessage{
   110  			Worker:      w.workerID,
   111  			MasterEpoch: w.masterInfo.Epoch(),
   112  			Status:      newStatus,
   113  		})
   114  		if err != nil {
   115  			if errors.Is(err, errors.ErrExecutorNotFoundForMessage) {
   116  				if err := w.masterInfo.SyncRefreshMasterInfo(ctx); err != nil {
   117  					log.Warn("failed to refresh master info",
   118  						zap.String("worker-id", w.workerID),
   119  						zap.String("master-id", w.masterInfo.MasterID()),
   120  						zap.Error(err))
   121  				}
   122  			}
   123  			log.Warn("failed to send status to master. Retrying...",
   124  				zap.String("worker-id", w.workerID),
   125  				zap.String("master-id", w.masterInfo.MasterID()),
   126  				zap.Any("status", newStatus),
   127  				zap.Error(err))
   128  			continue
   129  		}
   130  		return nil
   131  	}
   132  }
   133  
   134  func (w *Writer) persistStatus(ctx context.Context, newStatus *frameModel.WorkerStatus) error {
   135  	return retry.Do(ctx, func() error {
   136  		return w.metaclient.UpdateWorker(ctx, newStatus)
   137  	}, retry.WithBackoffMaxDelay(1000 /* 1 second */), retry.WithIsRetryableErr(func(err error) bool {
   138  		// TODO: refine the IsRetryable method
   139  		//if err, ok := err.(metaclient.Error); ok {
   140  		//return err.IsRetryable()
   141  		//}
   142  		return true
   143  	}))
   144  }