github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/framework/statusutil/writer.go (about) 1 // Copyright 2022 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package statusutil 15 16 import ( 17 "context" 18 "time" 19 20 "github.com/pingcap/log" 21 "github.com/pingcap/tiflow/engine/framework/internal/worker" 22 frameModel "github.com/pingcap/tiflow/engine/framework/model" 23 pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm" 24 "github.com/pingcap/tiflow/engine/pkg/p2p" 25 "github.com/pingcap/tiflow/pkg/errors" 26 "github.com/pingcap/tiflow/pkg/retry" 27 "go.uber.org/zap" 28 "golang.org/x/time/rate" 29 ) 30 31 // Writer is used to persist WorkerStatus changes and send notifications 32 // to the Master. 33 type Writer struct { 34 metaclient pkgOrm.Client 35 messageSender p2p.MessageSender 36 lastStatus frameModel.WorkerStatus 37 38 workerID frameModel.WorkerID 39 masterInfo worker.MasterInfoProvider 40 } 41 42 // NewWriter creates a new Writer. 43 func NewWriter( 44 metaclient pkgOrm.Client, 45 messageSender p2p.MessageSender, 46 masterInfo worker.MasterInfoProvider, 47 workerID frameModel.WorkerID, 48 ) *Writer { 49 return &Writer{ 50 metaclient: metaclient, 51 messageSender: messageSender, 52 masterInfo: masterInfo, 53 workerID: workerID, 54 } 55 } 56 57 // UpdateStatus checks if newStatus.HasSignificantChange() is true, if so, it persists the change and 58 // tries to send a notification. Note that sending the notification is asynchronous. 59 func (w *Writer) UpdateStatus(ctx context.Context, newStatus *frameModel.WorkerStatus) (retErr error) { 60 defer func() { 61 if retErr == nil { 62 return 63 } 64 log.Warn("UpdateStatus failed", 65 zap.String("worker-id", w.workerID), 66 zap.String("master-id", w.masterInfo.MasterID()), 67 zap.String("master-node", w.masterInfo.MasterNode()), 68 zap.Int64("master-epoch", w.masterInfo.Epoch()), 69 zap.Error(retErr)) 70 }() 71 72 if w.lastStatus.HasSignificantChange(newStatus) { 73 // Status has changed, so we need to persist the status. 74 if err := w.persistStatus(ctx, newStatus); err != nil { 75 return err 76 } 77 } 78 79 w.lastStatus = *newStatus 80 81 // TODO replace the timeout with a variable. 82 return w.sendStatusMessageWithRetry(ctx, 15*time.Second, newStatus) 83 } 84 85 func (w *Writer) sendStatusMessageWithRetry( 86 ctx context.Context, timeout time.Duration, newStatus *frameModel.WorkerStatus, 87 ) error { 88 // NOTE we need this function especially to handle the situation where 89 // the p2p connection to the target executor is not established yet. 90 // We might need one or two retries when our executor has just started up. 91 92 retryCtx, cancel := context.WithTimeout(ctx, timeout) 93 defer cancel() 94 95 rl := rate.NewLimiter(rate.Every(100*time.Millisecond), 1) 96 for { 97 select { 98 case <-retryCtx.Done(): 99 return errors.Trace(retryCtx.Err()) 100 default: 101 } 102 103 if err := rl.Wait(retryCtx); err != nil { 104 return errors.Trace(err) 105 } 106 107 topic := WorkerStatusTopic(w.masterInfo.MasterID()) 108 // NOTE: We must read the MasterNode() in each retry in case the master is failed over. 109 err := w.messageSender.SendToNodeB(ctx, w.masterInfo.MasterNode(), topic, &WorkerStatusMessage{ 110 Worker: w.workerID, 111 MasterEpoch: w.masterInfo.Epoch(), 112 Status: newStatus, 113 }) 114 if err != nil { 115 if errors.Is(err, errors.ErrExecutorNotFoundForMessage) { 116 if err := w.masterInfo.SyncRefreshMasterInfo(ctx); err != nil { 117 log.Warn("failed to refresh master info", 118 zap.String("worker-id", w.workerID), 119 zap.String("master-id", w.masterInfo.MasterID()), 120 zap.Error(err)) 121 } 122 } 123 log.Warn("failed to send status to master. Retrying...", 124 zap.String("worker-id", w.workerID), 125 zap.String("master-id", w.masterInfo.MasterID()), 126 zap.Any("status", newStatus), 127 zap.Error(err)) 128 continue 129 } 130 return nil 131 } 132 } 133 134 func (w *Writer) persistStatus(ctx context.Context, newStatus *frameModel.WorkerStatus) error { 135 return retry.Do(ctx, func() error { 136 return w.metaclient.UpdateWorker(ctx, newStatus) 137 }, retry.WithBackoffMaxDelay(1000 /* 1 second */), retry.WithIsRetryableErr(func(err error) bool { 138 // TODO: refine the IsRetryable method 139 //if err, ok := err.(metaclient.Error); ok { 140 //return err.IsRetryable() 141 //} 142 return true 143 })) 144 }