github.com/matrixorigin/matrixone@v1.2.0/pkg/cnservice/server_heartbeat.go (about)

     1  // Copyright 2021 - 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package cnservice
    16  
    17  import (
    18  	"context"
    19  	"time"
    20  
    21  	"github.com/matrixorigin/matrixone/pkg/common/system"
    22  	"github.com/matrixorigin/matrixone/pkg/logutil"
    23  	logservicepb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
    24  	v2 "github.com/matrixorigin/matrixone/pkg/util/metric/v2"
    25  	"go.uber.org/zap"
    26  )
    27  
    28  func (s *service) startCNStoreHeartbeat() error {
    29  	if s._hakeeperClient == nil {
    30  		if _, err := s.getHAKeeperClient(); err != nil {
    31  			return err
    32  		}
    33  	}
    34  	return s.stopper.RunNamedTask("cnservice-heartbeat", s.heartbeatTask)
    35  }
    36  
    37  func (s *service) heartbeatTask(ctx context.Context) {
    38  	if s.cfg.HAKeeper.HeatbeatInterval.Duration == 0 {
    39  		panic("invalid heartbeat interval")
    40  	}
    41  	defer logutil.LogAsyncTask(s.logger, "cnservice/heartbeat-task")()
    42  	defer func() {
    43  		s.logger.Info("cn heartbeat task stopped")
    44  	}()
    45  
    46  	ticker := time.NewTicker(s.cfg.HAKeeper.HeatbeatInterval.Duration)
    47  	defer ticker.Stop()
    48  
    49  	for {
    50  		select {
    51  		case <-ctx.Done():
    52  			return
    53  		case <-ticker.C:
    54  			s.heartbeat(ctx)
    55  			// see pkg/logservice/service_commands.go#130
    56  			select {
    57  			case <-ctx.Done():
    58  				return
    59  			default:
    60  			}
    61  		}
    62  	}
    63  }
    64  
    65  func (s *service) heartbeat(ctx context.Context) {
    66  	start := time.Now()
    67  	defer func() {
    68  		v2.CNHeartbeatHistogram.Observe(time.Since(start).Seconds())
    69  	}()
    70  
    71  	ctx2, cancel := context.WithTimeout(ctx, s.cfg.HAKeeper.HeatbeatTimeout.Duration)
    72  	defer cancel()
    73  
    74  	hb := logservicepb.CNStoreHeartbeat{
    75  		UUID:               s.cfg.UUID,
    76  		ServiceAddress:     s.pipelineServiceServiceAddr(),
    77  		SQLAddress:         s.cfg.SQLAddress,
    78  		LockServiceAddress: s.lockServiceServiceAddr(),
    79  		Role:               s.metadata.Role,
    80  		TaskServiceCreated: s.GetTaskRunner() != nil,
    81  		QueryAddress:       s.queryServiceServiceAddr(),
    82  		InitWorkState:      s.cfg.InitWorkState,
    83  		ConfigData:         s.config.GetData(),
    84  		Resource: logservicepb.Resource{
    85  			CPUTotal:     uint64(system.NumCPU()),
    86  			CPUAvailable: system.CPUAvailable(),
    87  			MemTotal:     system.MemoryTotal(),
    88  			MemAvailable: system.MemoryAvailable(),
    89  		},
    90  	}
    91  	if s.gossipNode != nil {
    92  		hb.GossipAddress = s.gossipServiceAddr()
    93  		hb.GossipJoined = s.gossipNode.Joined()
    94  	}
    95  
    96  	cb, err := s._hakeeperClient.SendCNHeartbeat(ctx2, hb)
    97  	if err != nil {
    98  		v2.CNHeartbeatFailureCounter.Inc()
    99  		s.logger.Error("failed to send cn heartbeat", zap.Error(err))
   100  		return
   101  	}
   102  
   103  	select {
   104  	case <-s.hakeeperConnected:
   105  	default:
   106  		s.initTaskServiceHolder()
   107  		close(s.hakeeperConnected)
   108  	}
   109  	s.config.DecrCount()
   110  	s.handleCommands(cb.Commands)
   111  }
   112  
   113  func (s *service) handleCommands(cmds []logservicepb.ScheduleCommand) {
   114  	for _, cmd := range cmds {
   115  		if cmd.ServiceType != logservicepb.CNService {
   116  			s.logger.Fatal("received invalid command", zap.String("command", cmd.LogString()))
   117  		}
   118  		s.logger.Info("applying schedule command", zap.String("command", cmd.LogString()))
   119  		if cmd.CreateTaskService != nil {
   120  			s.createTaskService(cmd.CreateTaskService)
   121  			s.createSQLLogger(cmd.CreateTaskService)
   122  		} else if s.gossipNode.Created() && cmd.JoinGossipCluster != nil {
   123  			s.gossipNode.SetJoined()
   124  
   125  			// Start an async task to join the gossip cluster to avoid the long time joining, and if
   126  			// it fails to join cluster, unset the joined state to give it another try.
   127  			if err := s.stopper.RunNamedTask("join gossip cluster", func(ctx context.Context) {
   128  				// The local state may be large, so do not set a timeout context.
   129  				if err := s.gossipNode.Join(cmd.JoinGossipCluster.Existing); err != nil {
   130  					s.logger.Error("failed to join gossip cluster", zap.Error(err))
   131  					s.gossipNode.UnsetJoined()
   132  				}
   133  			}); err != nil {
   134  				s.logger.Error("failed to start task to join gossip cluster", zap.Error(err))
   135  				s.gossipNode.UnsetJoined()
   136  			}
   137  		}
   138  	}
   139  }