github.com/projecteru2/core@v0.0.0-20240321043226-06bcc1c23f58/selfmon/selfmon.go

package selfmon

import (
	"context"
	"hash/maphash"
	"math/rand"
	"testing"
	"time"

	"github.com/cockroachdb/errors"

	"github.com/projecteru2/core/cluster"
	"github.com/projecteru2/core/log"
	"github.com/projecteru2/core/store"
	storefactory "github.com/projecteru2/core/store/factory"
	"github.com/projecteru2/core/types"
	"github.com/projecteru2/core/utils"
)

// ActiveKey .
const ActiveKey = "/selfmon/active"

// NodeStatusWatcher monitors the changes of node status
type NodeStatusWatcher struct {
	ID      int64
	config  types.Config
	cluster cluster.Cluster
	store   store.Store
}

// RunNodeStatusWatcher .
func RunNodeStatusWatcher(ctx context.Context, config types.Config, cluster cluster.Cluster, t *testing.T) {
	r := rand.New(rand.NewSource(int64(new(maphash.Hash).Sum64()))) //nolint
	ID := r.Int63n(10000)                                           //nolint
	store, err := storefactory.NewStore(config, t)
	if err != nil {
		log.WithFunc("selfmon.RunNodeStatusWatcher").WithField("ID", ID).Error(ctx, err, "failed to create store")
		return
	}

	watcher := &NodeStatusWatcher{
		ID:      ID,
		config:  config,
		store:   store,
		cluster: cluster,
	}
	watcher.run(ctx)
}

func (n *NodeStatusWatcher) run(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		default:
			n.withActiveLock(ctx, func(ctx context.Context) {
				if err := n.monitor(ctx); err != nil {
					log.WithFunc("selfmon.run").Errorf(ctx, err, "stops watching node id %+v", n.ID)
				}
			})
			time.Sleep(n.config.ConnectionTimeout)
		}
	}
}

// withActiveLock acquires the active lock synchronously
func (n *NodeStatusWatcher) withActiveLock(parentCtx context.Context, f func(ctx context.Context)) {
	ctx, cancel := context.WithCancel(parentCtx)
	defer cancel()
	logger := log.WithFunc("selfmon.withActiveLock").WithField("ID", n.ID)

	var expiry <-chan struct{}
	var unregister func()
	defer func() {
		if unregister != nil {
			logger.Info(ctx, "unregisters")
			unregister()
		}
	}()

	retryCounter := 0

	for {
		select {
		case <-ctx.Done():
			logger.Info(ctx, "context canceled")
			return
		default:
		}

		// try to get the lock
		if ne, un, err := n.register(ctx); err != nil {
			if errors.Is(err, context.Canceled) {
				logger.Info(ctx, "context canceled")
				return
			} else if !errors.Is(err, types.ErrKeyExists) {
				logger.Error(ctx, err, "failed to re-register")
				time.Sleep(time.Second)
				continue
			}
			if retryCounter == 0 {
				logger.Warn(ctx, "failed to register, there has been another active node status watcher")
			}
			retryCounter = (retryCounter + 1) % 60
			time.Sleep(time.Second)
		} else {
			logger.Info(ctx, "node status watcher has been active")
			expiry = ne
			unregister = un
			break
		}
	}

	// cancel the ctx when: 1. selfmon closed 2. lost the active lock
	go func() {
		defer cancel()

		select {
		case <-ctx.Done():
			logger.Info(ctx, "context canceled")
			return
		case <-expiry:
			logger.Info(ctx, "lock expired")
			return
		}
	}()

	f(ctx)
}

func (n *NodeStatusWatcher) register(ctx context.Context) (<-chan struct{}, func(), error) {
	return n.store.StartEphemeral(ctx, ActiveKey, n.config.HAKeepaliveInterval)
}

func (n *NodeStatusWatcher) initNodeStatus(ctx context.Context) {
	logger := log.WithFunc("selfmon.initNodeStatus")
	logger.Debug(ctx, "init node status started")
	nodes := make(chan *types.Node)

	go func() {
		defer close(nodes)
		// Get all nodes that are in active status, regardless of pod.
		var err error
		var ch <-chan *types.Node
		utils.WithTimeout(ctx, n.config.GlobalTimeout, func(ctx context.Context) {
			ch, err = n.cluster.ListPodNodes(ctx, &types.ListNodesOptions{
				Podname:  "",
				Labels:   nil,
				All:      true,
				CallInfo: false,
			})
			if err != nil {
				logger.Error(ctx, err, "get pod nodes failed")
				return
			}
			for node := range ch {
				logger.Debugf(ctx, "watched %s/%s", node.Name, node.Endpoint)
				nodes <- node
			}
		})
		if err != nil {
			logger.Error(ctx, err, "get pod nodes failed")
			return
		}
	}()

	for node := range nodes {
		status, err := n.cluster.GetNodeStatus(ctx, node.Name)
		if err != nil {
			status = &types.NodeStatus{
				Nodename: node.Name,
				Podname:  node.Podname,
				Alive:    false,
			}
		}
		// deal with test node
		if node.Test {
			status.Alive = true
		}
		n.dealNodeStatusMessage(ctx, status)
	}
}

func (n *NodeStatusWatcher) monitor(ctx context.Context) error {
	// init node status first
	go n.initNodeStatus(ctx)
	logger := log.WithFunc("selfmon.monitor").WithField("ID", n.ID)

	// monitor node status
	messageChan := n.cluster.NodeStatusStream(ctx)
	logger.Info(ctx, "watch node status started")
	defer logger.Info(ctx, "stop watching node status")

	for {
		select {
		case message, ok := <-messageChan:
			if !ok {
				return types.ErrMessageChanClosed
			}
			go n.dealNodeStatusMessage(ctx, message)
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}

func (n *NodeStatusWatcher) dealNodeStatusMessage(ctx context.Context, message *types.NodeStatus) {
	logger := log.WithFunc("selfmon.dealNodeStatusMessage")
	if message.Error != nil {
		logger.Errorf(ctx, message.Error, "deal with node status stream message failed %+v", message)
		return
	}
	// here we ignore the node-back-to-alive status because it will be updated by agent
	if message.Alive {
		return
	}

	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// TODO maybe we need a distributed lock to control concurrency
	opts := &types.SetNodeOptions{
		Nodename:      message.Nodename,
		WorkloadsDown: true,
	}
	if _, err := n.cluster.SetNode(ctx, opts); err != nil {
		logger.Errorf(ctx, err, "set node %s failed", message.Nodename)
		return
	}
	logger.Infof(ctx, "set node %s as alive: %+v", message.Nodename, message.Alive)
}
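
// startWatcherExample is an illustrative sketch, not part of the original file:
// it shows how a caller might wire up RunNodeStatusWatcher. The function name,
// the concrete config values, and the nil *testing.T (assumed here to pick the
// non-mock store path in storefactory.NewStore) are assumptions for
// demonstration, not the project's documented usage.
func startWatcherExample(ctx context.Context, cal cluster.Cluster) {
	config := types.Config{
		ConnectionTimeout:   10 * time.Second, // pause between monitor rounds in run()
		GlobalTimeout:       30 * time.Second, // bound for ListPodNodes in initNodeStatus()
		HAKeepaliveInterval: 16 * time.Second, // lease TTL for the ephemeral ActiveKey lock
	}
	// RunNodeStatusWatcher blocks until ctx is canceled, so a caller would
	// typically run it in its own goroutine.
	go RunNodeStatusWatcher(ctx, config, cal, nil)
}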