github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/worker/join.go (about) 1 // Copyright 2019 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package worker 15 16 import ( 17 "context" 18 "strings" 19 "time" 20 21 "github.com/pingcap/failpoint" 22 toolutils "github.com/pingcap/tidb-tools/pkg/utils" 23 "github.com/pingcap/tiflow/dm/pb" 24 "github.com/pingcap/tiflow/dm/pkg/encrypt" 25 "github.com/pingcap/tiflow/dm/pkg/ha" 26 "github.com/pingcap/tiflow/dm/pkg/log" 27 "github.com/pingcap/tiflow/dm/pkg/terror" 28 "github.com/pingcap/tiflow/dm/pkg/utils" 29 "go.uber.org/zap" 30 "google.golang.org/grpc" 31 ) 32 33 // GetJoinURLs gets the endpoints from the join address. 34 func GetJoinURLs(addrs string) []string { 35 // TODO: handle pm1=xxxx:1234,pm2=xxxx:1234,pm3=xxxx:1234 36 return strings.Split(addrs, ",") 37 } 38 39 // JoinMaster let dm-worker join the cluster with the specified master endpoints. 40 func (s *Server) JoinMaster(endpoints []string) error { 41 // TODO: grpc proxy 42 tls, err := toolutils.NewTLS(s.cfg.SSLCA, s.cfg.SSLCert, s.cfg.SSLKey, s.cfg.AdvertiseAddr, s.cfg.CertAllowedCN) 43 if err != nil { 44 return terror.ErrWorkerTLSConfigNotValid.Delegate(err) 45 } 46 47 // join doesn't support to be canceled now, because it can return at most in (3s+3s) * len(endpoints). 48 ctx, cancel := context.WithCancel(context.Background()) 49 defer cancel() 50 51 req := &pb.RegisterWorkerRequest{ 52 Name: s.cfg.Name, 53 Address: s.cfg.AdvertiseAddr, 54 } 55 56 var errorStr string 57 for _, endpoint := range endpoints { 58 ctx1, cancel1 := context.WithTimeout(ctx, 3*time.Second) 59 //nolint:staticcheck 60 conn, err := grpc.DialContext(ctx1, utils.UnwrapScheme(endpoint), grpc.WithBlock(), tls.ToGRPCDialOption(), grpc.WithBackoffMaxDelay(3*time.Second)) 61 cancel1() 62 if err != nil { 63 if conn != nil { 64 conn.Close() 65 } 66 log.L().Error("fail to dial dm-master", zap.String("endpoint", endpoint), zap.Error(err)) 67 errorStr = err.Error() 68 continue 69 } 70 client := pb.NewMasterClient(conn) 71 ctx1, cancel1 = context.WithTimeout(ctx, 3*time.Second) 72 resp, err := client.RegisterWorker(ctx1, req) 73 cancel1() 74 conn.Close() 75 if err != nil { 76 log.L().Error("fail to register worker", zap.String("endpoint", endpoint), zap.Error(err)) 77 errorStr = err.Error() 78 continue 79 } 80 if !resp.GetResult() { 81 log.L().Error("fail to register worker", zap.String("endpoint", endpoint), zap.String("error", resp.Msg)) 82 errorStr = resp.Msg 83 continue 84 } 85 86 // worker do calls decrypt, but the password is decrypted already, 87 // but in case we need it later, init it. 88 encrypt.InitCipher(resp.GetSecretKey()) 89 90 return nil 91 } 92 return terror.ErrWorkerFailConnectMaster.Generate(endpoints, errorStr) 93 } 94 95 // KeepAlive attempts to keep the lease of the server alive forever. 96 func (s *Server) KeepAlive() { 97 for { 98 log.L().Info("start to keepalive with master") 99 100 failpoint.Inject("FailToKeepAlive", func(val failpoint.Value) { 101 workerStrings := val.(string) 102 if strings.Contains(workerStrings, s.cfg.Name) { 103 log.L().Info("worker keep alive failed", zap.String("failpoint", "FailToKeepAlive")) 104 failpoint.Goto("bypass") 105 } 106 }) 107 108 { 109 err1 := ha.KeepAlive(s.kaCtx, s.etcdClient, s.cfg.Name, s.cfg.KeepAliveTTL) 110 log.L().Warn("keepalive with master goroutine paused", zap.Error(err1)) 111 } 112 113 failpoint.Label("bypass") 114 115 // TODO: report the error. 116 // when lost keepalive, stop the worker without graceful. this is to fix https://github.com/pingcap/tiflow/issues/3737 117 err := s.stopSourceWorker("", true, false) 118 if err != nil { 119 log.L().Error("fail to stop worker", zap.Error(err)) 120 return // return if failed to stop the worker. 121 } 122 select { 123 case <-s.kaCtx.Done(): 124 log.L().Info("keepalive with master goroutine exited!") 125 return 126 case <-time.After(retryConnectSleepTime): 127 // Try to connect master again 128 } 129 } 130 } 131 132 // UpdateKeepAliveTTL updates keepalive key with new lease TTL in place, to avoid watcher observe a DELETE event. 133 func (s *Server) UpdateKeepAliveTTL(newTTL int64) { 134 ha.KeepAliveUpdateCh <- newTTL 135 log.L().Debug("received update keepalive TTL request, should be updated soon", zap.Int64("new ttl", newTTL)) 136 }