github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/worker/join.go (about)

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package worker
    15  
    16  import (
    17  	"context"
    18  	"strings"
    19  	"time"
    20  
    21  	"github.com/pingcap/failpoint"
    22  	toolutils "github.com/pingcap/tidb-tools/pkg/utils"
    23  	"github.com/pingcap/tiflow/dm/pb"
    24  	"github.com/pingcap/tiflow/dm/pkg/encrypt"
    25  	"github.com/pingcap/tiflow/dm/pkg/ha"
    26  	"github.com/pingcap/tiflow/dm/pkg/log"
    27  	"github.com/pingcap/tiflow/dm/pkg/terror"
    28  	"github.com/pingcap/tiflow/dm/pkg/utils"
    29  	"go.uber.org/zap"
    30  	"google.golang.org/grpc"
    31  )
    32  
    33  // GetJoinURLs gets the endpoints from the join address.
    34  func GetJoinURLs(addrs string) []string {
    35  	// TODO: handle pm1=xxxx:1234,pm2=xxxx:1234,pm3=xxxx:1234
    36  	return strings.Split(addrs, ",")
    37  }
    38  
    39  // JoinMaster let dm-worker join the cluster with the specified master endpoints.
    40  func (s *Server) JoinMaster(endpoints []string) error {
    41  	// TODO: grpc proxy
    42  	tls, err := toolutils.NewTLS(s.cfg.SSLCA, s.cfg.SSLCert, s.cfg.SSLKey, s.cfg.AdvertiseAddr, s.cfg.CertAllowedCN)
    43  	if err != nil {
    44  		return terror.ErrWorkerTLSConfigNotValid.Delegate(err)
    45  	}
    46  
    47  	// join doesn't support to be canceled now, because it can return at most in (3s+3s) * len(endpoints).
    48  	ctx, cancel := context.WithCancel(context.Background())
    49  	defer cancel()
    50  
    51  	req := &pb.RegisterWorkerRequest{
    52  		Name:    s.cfg.Name,
    53  		Address: s.cfg.AdvertiseAddr,
    54  	}
    55  
    56  	var errorStr string
    57  	for _, endpoint := range endpoints {
    58  		ctx1, cancel1 := context.WithTimeout(ctx, 3*time.Second)
    59  		//nolint:staticcheck
    60  		conn, err := grpc.DialContext(ctx1, utils.UnwrapScheme(endpoint), grpc.WithBlock(), tls.ToGRPCDialOption(), grpc.WithBackoffMaxDelay(3*time.Second))
    61  		cancel1()
    62  		if err != nil {
    63  			if conn != nil {
    64  				conn.Close()
    65  			}
    66  			log.L().Error("fail to dial dm-master", zap.String("endpoint", endpoint), zap.Error(err))
    67  			errorStr = err.Error()
    68  			continue
    69  		}
    70  		client := pb.NewMasterClient(conn)
    71  		ctx1, cancel1 = context.WithTimeout(ctx, 3*time.Second)
    72  		resp, err := client.RegisterWorker(ctx1, req)
    73  		cancel1()
    74  		conn.Close()
    75  		if err != nil {
    76  			log.L().Error("fail to register worker", zap.String("endpoint", endpoint), zap.Error(err))
    77  			errorStr = err.Error()
    78  			continue
    79  		}
    80  		if !resp.GetResult() {
    81  			log.L().Error("fail to register worker", zap.String("endpoint", endpoint), zap.String("error", resp.Msg))
    82  			errorStr = resp.Msg
    83  			continue
    84  		}
    85  
    86  		// worker do calls decrypt, but the password is decrypted already,
    87  		// but in case we need it later, init it.
    88  		encrypt.InitCipher(resp.GetSecretKey())
    89  
    90  		return nil
    91  	}
    92  	return terror.ErrWorkerFailConnectMaster.Generate(endpoints, errorStr)
    93  }
    94  
    95  // KeepAlive attempts to keep the lease of the server alive forever.
    96  func (s *Server) KeepAlive() {
    97  	for {
    98  		log.L().Info("start to keepalive with master")
    99  
   100  		failpoint.Inject("FailToKeepAlive", func(val failpoint.Value) {
   101  			workerStrings := val.(string)
   102  			if strings.Contains(workerStrings, s.cfg.Name) {
   103  				log.L().Info("worker keep alive failed", zap.String("failpoint", "FailToKeepAlive"))
   104  				failpoint.Goto("bypass")
   105  			}
   106  		})
   107  
   108  		{
   109  			err1 := ha.KeepAlive(s.kaCtx, s.etcdClient, s.cfg.Name, s.cfg.KeepAliveTTL)
   110  			log.L().Warn("keepalive with master goroutine paused", zap.Error(err1))
   111  		}
   112  
   113  		failpoint.Label("bypass")
   114  
   115  		// TODO: report the error.
   116  		// when lost keepalive, stop the worker without graceful. this is to fix https://github.com/pingcap/tiflow/issues/3737
   117  		err := s.stopSourceWorker("", true, false)
   118  		if err != nil {
   119  			log.L().Error("fail to stop worker", zap.Error(err))
   120  			return // return if failed to stop the worker.
   121  		}
   122  		select {
   123  		case <-s.kaCtx.Done():
   124  			log.L().Info("keepalive with master goroutine exited!")
   125  			return
   126  		case <-time.After(retryConnectSleepTime):
   127  			// Try to connect master again
   128  		}
   129  	}
   130  }
   131  
   132  // UpdateKeepAliveTTL updates keepalive key with new lease TTL in place, to avoid watcher observe a DELETE event.
   133  func (s *Server) UpdateKeepAliveTTL(newTTL int64) {
   134  	ha.KeepAliveUpdateCh <- newTTL
   135  	log.L().Debug("received update keepalive TTL request, should be updated soon", zap.Int64("new ttl", newTTL))
   136  }