github.com/projecteru2/core@v0.0.0-20240321043226-06bcc1c23f58/selfmon/selfmon.go

package selfmon

import (
	"context"
	"hash/maphash"
	"math/rand"
	"testing"
	"time"

	"github.com/cockroachdb/errors"

	"github.com/projecteru2/core/cluster"
	"github.com/projecteru2/core/log"
	"github.com/projecteru2/core/store"
	storefactory "github.com/projecteru2/core/store/factory"
	"github.com/projecteru2/core/types"
	"github.com/projecteru2/core/utils"
)

// ActiveKey is the store key claimed by the active node status watcher;
// the instance holding it as an ephemeral key does the actual monitoring.
const ActiveKey = "/selfmon/active"

// NodeStatusWatcher monitors the changes of node status
type NodeStatusWatcher struct {
	ID      int64
	config  types.Config
	cluster cluster.Cluster
	store   store.Store
}

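// Design note: several core instances may each run a NodeStatusWatcher, but
// only the one holding ActiveKey (see withActiveLock below) acts on node
// status; the others keep retrying so they can take over on failure.
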
// RunNodeStatusWatcher picks a random watcher ID, builds a store via the
// factory and runs the watcher until ctx is done.
func RunNodeStatusWatcher(ctx context.Context, config types.Config, cluster cluster.Cluster, t *testing.T) {
	r := rand.New(rand.NewSource(int64(new(maphash.Hash).Sum64()))) //nolint
	ID := r.Int63n(10000)                                           //nolint
	store, err := storefactory.NewStore(config, t)
	if err != nil {
		log.WithFunc("selfmon.RunNodeStatusWatcher").WithField("ID", ID).Error(ctx, err, "failed to create store")
		return
	}

	watcher := &NodeStatusWatcher{
		ID:      ID,
		config:  config,
		store:   store,
		cluster: cluster,
	}
	watcher.run(ctx)
}

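// Usage sketch (hypothetical wiring, not part of this package): a caller
// would typically start the watcher as a background goroutine during
// bootstrap, passing a nil *testing.T outside of tests.
//
//	go selfmon.RunNodeStatusWatcher(ctx, config, cluster, nil)
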
// run drives the watcher loop: block until this instance wins the active
// lock, monitor until that fails, then pause and compete for the lock again.
func (n *NodeStatusWatcher) run(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		default:
			n.withActiveLock(ctx, func(ctx context.Context) {
				if err := n.monitor(ctx); err != nil {
					log.WithFunc("selfmon.run").Errorf(ctx, err, "stopped watching node status, ID %+v", n.ID)
				}
			})
			time.Sleep(n.config.ConnectionTimeout)
		}
	}
}

// withActiveLock blocks until this watcher acquires the active lock, then
// runs f; f's ctx is canceled once selfmon closes or the lock is lost.
func (n *NodeStatusWatcher) withActiveLock(parentCtx context.Context, f func(ctx context.Context)) {
	ctx, cancel := context.WithCancel(parentCtx)
	defer cancel()
	logger := log.WithFunc("selfmon.withActiveLock").WithField("ID", n.ID)

	var expiry <-chan struct{}
	var unregister func()
	defer func() {
		if unregister != nil {
			logger.Info(ctx, "unregisters")
			unregister()
		}
	}()

	retryCounter := 0

	for {
		select {
		case <-ctx.Done():
			logger.Info(ctx, "context canceled")
			return
		default:
		}

		// try to take the lock
		if ne, un, err := n.register(ctx); err != nil {
			if errors.Is(err, context.Canceled) {
				logger.Info(ctx, "context canceled")
				return
			} else if !errors.Is(err, types.ErrKeyExists) {
				logger.Error(ctx, err, "failed to re-register")
				time.Sleep(time.Second)
				continue
			}
			// the key is held elsewhere: warn roughly once a minute instead of on every retry
			if retryCounter == 0 {
				logger.Warn(ctx, "failed to register, another node status watcher is already active")
			}
			retryCounter = (retryCounter + 1) % 60
			time.Sleep(time.Second)
		} else {
			logger.Info(ctx, "node status watcher is now active")
			expiry = ne
			unregister = un
			break
		}
	}

	// cancel the ctx when either selfmon is closed or the active lock is lost
	go func() {
		defer cancel()

		select {
		case <-ctx.Done():
			logger.Info(ctx, "context canceled")
			return
		case <-expiry:
			logger.Info(ctx, "lock expired")
			return
		}
	}()

	f(ctx)
}

// register claims ActiveKey as an ephemeral key that the store keeps alive
// on HAKeepaliveInterval heartbeats.
func (n *NodeStatusWatcher) register(ctx context.Context) (<-chan struct{}, func(), error) {
	return n.store.StartEphemeral(ctx, ActiveKey, n.config.HAKeepaliveInterval)
}

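// For illustration, derived from the usage in withActiveLock above rather
// than from store.StartEphemeral's own docs: the returned channel closes when
// the ephemeral key expires, the returned func revokes the key, and
// types.ErrKeyExists means another instance already holds it.
//
//	expiry, unregister, err := n.register(ctx)
//	switch {
//	case errors.Is(err, types.ErrKeyExists): // someone else is active, retry later
//	case err != nil: // transient store error
//	default:
//		defer unregister()
//		<-expiry // blocks until the lock is lost
//	}
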
func (n *NodeStatusWatcher) initNodeStatus(ctx context.Context) {
	logger := log.WithFunc("selfmon.initNodeStatus")
	logger.Debug(ctx, "init node status started")
	nodes := make(chan *types.Node)

	go func() {
		defer close(nodes)
		// Get all nodes in active status, regardless of pod.
		utils.WithTimeout(ctx, n.config.GlobalTimeout, func(ctx context.Context) {
			ch, err := n.cluster.ListPodNodes(ctx, &types.ListNodesOptions{
				Podname:  "",
				Labels:   nil,
				All:      true,
				CallInfo: false,
			})
			if err != nil {
				logger.Error(ctx, err, "get pod nodes failed")
				return
			}
			for node := range ch {
				logger.Debugf(ctx, "watched %s/%s", node.Name, node.Endpoint)
				nodes <- node
			}
		})
	}()

	for node := range nodes {
		status, err := n.cluster.GetNodeStatus(ctx, node.Name)
		if err != nil {
			// no status recorded: treat the node as down until the agent reports in
			status = &types.NodeStatus{
				Nodename: node.Name,
				Podname:  node.Podname,
				Alive:    false,
			}
		}
		// test nodes are always considered alive
		if node.Test {
			status.Alive = true
		}
		n.dealNodeStatusMessage(ctx, status)
	}
}

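// Aside: utils.WithTimeout is assumed here (inferred from the call above, not
// from its source) to run the closure under a context bounded by the given
// timeout, roughly equivalent to:
//
//	tctx, cancel := context.WithTimeout(ctx, n.config.GlobalTimeout)
//	defer cancel()
//	f(tctx)
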
// monitor streams node status messages and handles each one until the stream
// closes or ctx is done; run re-enters it after a pause.
func (n *NodeStatusWatcher) monitor(ctx context.Context) error {
	// init node status first
	go n.initNodeStatus(ctx)
	logger := log.WithFunc("selfmon.monitor").WithField("ID", n.ID)

	// monitor node status
	messageChan := n.cluster.NodeStatusStream(ctx)
	logger.Info(ctx, "watch node status started")
	defer logger.Info(ctx, "stop watching node status")

	for {
		select {
		case message, ok := <-messageChan:
			if !ok {
				return types.ErrMessageChanClosed
			}
			go n.dealNodeStatusMessage(ctx, message)
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}

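// For illustration (field values hypothetical; only the fields this file
// touches are shown, types.NodeStatus may carry more): a message like the
// following is what dealNodeStatusMessage treats as a node going down.
//
//	&types.NodeStatus{Nodename: "node-1", Podname: "pod-1", Alive: false}
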
func (n *NodeStatusWatcher) dealNodeStatusMessage(ctx context.Context, message *types.NodeStatus) {
	logger := log.WithFunc("selfmon.dealNodeStatusMessage")
	if message.Error != nil {
		logger.Errorf(ctx, message.Error, "deal with node status stream message failed %+v", message)
		return
	}
	// ignore a node coming back to alive status here, because the agent will update it
	if message.Alive {
		return
	}

	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// TODO maybe we need a distributed lock to control concurrency
	opts := &types.SetNodeOptions{
		Nodename:      message.Nodename,
		WorkloadsDown: true,
	}
	if _, err := n.cluster.SetNode(ctx, opts); err != nil {
		logger.Errorf(ctx, err, "set node %s failed", message.Nodename)
		return
	}
	logger.Infof(ctx, "set node %s down, alive: %+v", message.Nodename, message.Alive)
}
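
// Test sketch (hypothetical; config and mockCluster are assumptions, not part
// of this file): the *testing.T parameter exists only to be passed through to
// storefactory.NewStore, so a test can drive the watcher end to end against
// whatever store the factory returns for tests.
//
//	func TestRunNodeStatusWatcher(t *testing.T) {
//		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
//		defer cancel()
//		RunNodeStatusWatcher(ctx, config, mockCluster, t) // returns soon after ctx expires
//	}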