github.com/matrixorigin/matrixone@v1.2.0/pkg/cnservice/server_heartbeat.go (about) 1 // Copyright 2021 - 2022 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package cnservice 16 17 import ( 18 "context" 19 "time" 20 21 "github.com/matrixorigin/matrixone/pkg/common/system" 22 "github.com/matrixorigin/matrixone/pkg/logutil" 23 logservicepb "github.com/matrixorigin/matrixone/pkg/pb/logservice" 24 v2 "github.com/matrixorigin/matrixone/pkg/util/metric/v2" 25 "go.uber.org/zap" 26 ) 27 28 func (s *service) startCNStoreHeartbeat() error { 29 if s._hakeeperClient == nil { 30 if _, err := s.getHAKeeperClient(); err != nil { 31 return err 32 } 33 } 34 return s.stopper.RunNamedTask("cnservice-heartbeat", s.heartbeatTask) 35 } 36 37 func (s *service) heartbeatTask(ctx context.Context) { 38 if s.cfg.HAKeeper.HeatbeatInterval.Duration == 0 { 39 panic("invalid heartbeat interval") 40 } 41 defer logutil.LogAsyncTask(s.logger, "cnservice/heartbeat-task")() 42 defer func() { 43 s.logger.Info("cn heartbeat task stopped") 44 }() 45 46 ticker := time.NewTicker(s.cfg.HAKeeper.HeatbeatInterval.Duration) 47 defer ticker.Stop() 48 49 for { 50 select { 51 case <-ctx.Done(): 52 return 53 case <-ticker.C: 54 s.heartbeat(ctx) 55 // see pkg/logservice/service_commands.go#130 56 select { 57 case <-ctx.Done(): 58 return 59 default: 60 } 61 } 62 } 63 } 64 65 func (s *service) heartbeat(ctx context.Context) { 66 start := time.Now() 67 defer func() { 68 v2.CNHeartbeatHistogram.Observe(time.Since(start).Seconds()) 69 }() 70 71 ctx2, cancel := context.WithTimeout(ctx, s.cfg.HAKeeper.HeatbeatTimeout.Duration) 72 defer cancel() 73 74 hb := logservicepb.CNStoreHeartbeat{ 75 UUID: s.cfg.UUID, 76 ServiceAddress: s.pipelineServiceServiceAddr(), 77 SQLAddress: s.cfg.SQLAddress, 78 LockServiceAddress: s.lockServiceServiceAddr(), 79 Role: s.metadata.Role, 80 TaskServiceCreated: s.GetTaskRunner() != nil, 81 QueryAddress: s.queryServiceServiceAddr(), 82 InitWorkState: s.cfg.InitWorkState, 83 ConfigData: s.config.GetData(), 84 Resource: logservicepb.Resource{ 85 CPUTotal: uint64(system.NumCPU()), 86 CPUAvailable: system.CPUAvailable(), 87 MemTotal: system.MemoryTotal(), 88 MemAvailable: system.MemoryAvailable(), 89 }, 90 } 91 if s.gossipNode != nil { 92 hb.GossipAddress = s.gossipServiceAddr() 93 hb.GossipJoined = s.gossipNode.Joined() 94 } 95 96 cb, err := s._hakeeperClient.SendCNHeartbeat(ctx2, hb) 97 if err != nil { 98 v2.CNHeartbeatFailureCounter.Inc() 99 s.logger.Error("failed to send cn heartbeat", zap.Error(err)) 100 return 101 } 102 103 select { 104 case <-s.hakeeperConnected: 105 default: 106 s.initTaskServiceHolder() 107 close(s.hakeeperConnected) 108 } 109 s.config.DecrCount() 110 s.handleCommands(cb.Commands) 111 } 112 113 func (s *service) handleCommands(cmds []logservicepb.ScheduleCommand) { 114 for _, cmd := range cmds { 115 if cmd.ServiceType != logservicepb.CNService { 116 s.logger.Fatal("received invalid command", zap.String("command", cmd.LogString())) 117 } 118 s.logger.Info("applying schedule command", zap.String("command", cmd.LogString())) 119 if cmd.CreateTaskService != nil { 120 s.createTaskService(cmd.CreateTaskService) 121 s.createSQLLogger(cmd.CreateTaskService) 122 } else if s.gossipNode.Created() && cmd.JoinGossipCluster != nil { 123 s.gossipNode.SetJoined() 124 125 // Start an async task to join the gossip cluster to avoid the long time joining, and if 126 // it fails to join cluster, unset the joined state to give it another try. 127 if err := s.stopper.RunNamedTask("join gossip cluster", func(ctx context.Context) { 128 // The local state may be large, so do not set a timeout context. 129 if err := s.gossipNode.Join(cmd.JoinGossipCluster.Existing); err != nil { 130 s.logger.Error("failed to join gossip cluster", zap.Error(err)) 131 s.gossipNode.UnsetJoined() 132 } 133 }); err != nil { 134 s.logger.Error("failed to start task to join gossip cluster", zap.Error(err)) 135 s.gossipNode.UnsetJoined() 136 } 137 } 138 } 139 }