github.com/projecteru2/core@v0.0.0-20240321043226-06bcc1c23f58/cluster/calcium/node.go

package calcium

import (
	"context"
	"sync"

	enginefactory "github.com/projecteru2/core/engine/factory"
	enginetypes "github.com/projecteru2/core/engine/types"
	"github.com/projecteru2/core/log"
	"github.com/projecteru2/core/metrics"
	"github.com/projecteru2/core/resource/plugins"
	resourcetypes "github.com/projecteru2/core/resource/types"
	"github.com/projecteru2/core/store"
	"github.com/projecteru2/core/types"
	"github.com/projecteru2/core/utils"
)

// AddNode adds a node
// the returned node carries its resource info
func (c *Calcium) AddNode(ctx context.Context, opts *types.AddNodeOptions) (*types.Node, error) {
	logger := log.WithFunc("calcium.AddNode").WithField("opts", opts)
	if err := opts.Validate(); err != nil {
		logger.Error(ctx, err)
		return nil, err
	}
	var res resourcetypes.Resources
	var node *types.Node
	var err error

	// check if the node is alive
	client, err := enginefactory.GetEngine(ctx, c.config, opts.Nodename, opts.Endpoint, opts.Ca, opts.Cert, opts.Key)
	if err != nil {
		return nil, err
	}
	// get node info
	nodeInfo, err := client.Info(ctx)
	if err != nil {
		return nil, err
	}

	return node, utils.Txn(
		ctx,
		// if: add node resource with resource plugins
		func(ctx context.Context) error {
			res, err = c.rmgr.AddNode(ctx, opts.Nodename, opts.Resources, nodeInfo)
			return err
		},
		// then: add node meta in store
		func(ctx context.Context) error {
			node, err = c.store.AddNode(ctx, opts)
			if err != nil {
				return err
			}
			node.ResourceInfo.Capacity = res
			_ = c.pool.Invoke(func() { c.doSendNodeMetrics(context.TODO(), node) })
			return nil
		},
		// rollback: remove node with resource plugins
		func(ctx context.Context, failureByCond bool) error {
			if failureByCond {
				return nil
			}
			return c.rmgr.RemoveNode(ctx, opts.Nodename)
		},
		c.config.GlobalTimeout)
}

// RemoveNode removes a node
func (c *Calcium) RemoveNode(ctx context.Context, nodename string) error {
	logger := log.WithFunc("calcium.RemoveNode").WithField("node", nodename)
	if nodename == "" {
		logger.Error(ctx, types.ErrEmptyNodeName)
		return types.ErrEmptyNodeName
	}
	return c.withNodePodLocked(ctx, nodename, func(ctx context.Context, node *types.Node) error {
		workloads, err := c.ListNodeWorkloads(ctx, node.Name, nil)
		if err != nil {
			logger.Error(ctx, err)
			return err
		}
		// the node needs to be drained first
		if len(workloads) > 0 {
			logger.Error(ctx, types.ErrNodeNotEmpty)
			return types.ErrNodeNotEmpty
		}

		return utils.Txn(ctx,
			// if: remove node metadata
			func(ctx context.Context) error {
				return c.store.RemoveNode(ctx, node)
			},
			// then: remove node resource metadata
			func(ctx context.Context) error {
				if err := c.rmgr.RemoveNode(ctx, nodename); err != nil {
					return err
				}
				enginefactory.RemoveEngineFromCache(ctx, node.Endpoint, node.Ca, node.Cert, node.Key)
				metrics.Client.RemoveInvalidNodes(nodename)
				return nil
			},
			// rollback: do nothing
			func(_ context.Context, _ bool) error {
				return nil
			},
			c.config.GlobalTimeout)
	})
}

// ListPodNodes lists the nodes that belong to a pod
// nodes are returned with resource info
func (c *Calcium) ListPodNodes(ctx context.Context, opts *types.ListNodesOptions) (<-chan *types.Node, error) {
	logger := log.WithFunc("calcium.ListPodNodes").WithField("podname", opts.Podname).WithField("labels", opts.Labels).WithField("all", opts.All).WithField("info", opts.CallInfo)
	nf := &types.NodeFilter{Podname: opts.Podname, Labels: opts.Labels, All: opts.All}
	var (
		nodes []*types.Node
		err   error
	)
	if opts.CallInfo {
		nodes, err = c.store.GetNodesByPod(ctx, nf)
	} else {
		nodes, err = c.store.GetNodesByPod(ctx, nf, store.WithoutEngineOption())
	}
	if err != nil {
		logger.Error(ctx, err)
		return nil, err
	}
	ch := make(chan *types.Node)

	_ = c.pool.Invoke(func() {
		defer close(ch)
		wg := &sync.WaitGroup{}
		wg.Add(len(nodes))
		defer wg.Wait()
		for _, node := range nodes {
			node := node
			_ = c.pool.Invoke(func() {
				defer wg.Done()
				var err error
				if node.ResourceInfo.Capacity, node.ResourceInfo.Usage, node.ResourceInfo.Diffs, err = c.rmgr.GetNodeResourceInfo(ctx, node.Name, nil, false); err != nil {
					logger.Errorf(ctx, err, "failed to get node %+v resource info", node.Name)
				}
				if opts.CallInfo {
					if err := node.Info(ctx); err != nil {
						logger.Errorf(ctx, err, "failed to get node %+v info", node.Name)
					}
				}
				ch <- node
			})
		}
	})

	return ch, nil
}

// GetNode gets a node
// the returned node carries its resource info
func (c *Calcium) GetNode(ctx context.Context, nodename string) (node *types.Node, err error) {
	logger := log.WithFunc("calcium.GetNode").WithField("node", nodename)
	if nodename == "" {
		logger.Error(ctx, types.ErrEmptyNodeName)
		return nil, types.ErrEmptyNodeName
	}
	if node, err = c.store.GetNode(ctx, nodename); err != nil {
		logger.Error(ctx, err)
		return nil, err
	}
	if node.ResourceInfo.Capacity, node.ResourceInfo.Usage, node.ResourceInfo.Diffs, err = c.rmgr.GetNodeResourceInfo(ctx, node.Name, nil, false); err != nil {
		logger.Error(ctx, err)
		return nil, err
	}
	return node, nil
}

// GetNodeEngineInfo gets the engine info of a node
func (c *Calcium) GetNodeEngineInfo(ctx context.Context, nodename string) (*enginetypes.Info, error) {
	logger := log.WithFunc("calcium.GetNodeEngineInfo").WithField("node", nodename)
	if nodename == "" {
		logger.Error(ctx, types.ErrEmptyNodeName)
		return nil, types.ErrEmptyNodeName
	}
	node, err := c.store.GetNode(ctx, nodename)
	if err != nil {
		logger.Error(ctx, err)
		return nil, err
	}
	engineInfo, err := node.Engine.Info(ctx)
	if err != nil {
		logger.Error(ctx, err)
	}
	return engineInfo, err
}

// SetNode sets a node available or not
// the returned node carries its resource info
func (c *Calcium) SetNode(ctx context.Context, opts *types.SetNodeOptions) (*types.Node, error) {
	logger := log.WithFunc("calcium.SetNode").WithField("opts", opts)
	if err := opts.Validate(); err != nil {
		logger.Error(ctx, err)
		return nil, err
	}
	var n *types.Node
	return n, c.withNodePodLocked(ctx, opts.Nodename, func(ctx context.Context, node *types.Node) error {
		logger.Info(ctx, "set node")
		// update resource map
		var err error
		node.ResourceInfo.Capacity, node.ResourceInfo.Usage, node.ResourceInfo.Diffs, err = c.rmgr.GetNodeResourceInfo(ctx, node.Name, nil, false)
		if err != nil {
			return err
		}
		n = node

		n.Bypass = (opts.Bypass == types.TriTrue) || (opts.Bypass == types.TriKeep && n.Bypass)
		if n.IsDown() {
			logger.Warnf(ctx, "node marked down: %s", opts.Nodename)
		}

		if opts.WorkloadsDown {
			c.setAllWorkloadsOnNodeDown(ctx, n.Name)
		}

		// update node endpoint
		if opts.Endpoint != "" {
			n.Endpoint = opts.Endpoint
		}
		// update ca / cert / key
		n.Ca = opts.Ca
		n.Cert = opts.Cert
		n.Key = opts.Key
		// update labels
		if len(opts.Labels) != 0 {
			n.Labels = opts.Labels
		}

		var origin resourcetypes.Resources
		return utils.Txn(ctx,
			// if: update node resource capacity
			func(ctx context.Context) error {
				if len(opts.Resources) == 0 {
					return nil
				}
				origin, _, err = c.rmgr.SetNodeResourceCapacity(ctx, n.Name, nil, opts.Resources, opts.Delta, plugins.Incr)
				return err
			},
			// then: update node metadata
			func(ctx context.Context) error {
				defer enginefactory.RemoveEngineFromCache(ctx, node.Endpoint, node.Ca, node.Cert, node.Key)
				if err := c.store.UpdateNodes(ctx, n); err != nil {
					return err
				}
				// refresh resource info
				// the error can be ignored here as long as the update succeeded
				n.ResourceInfo.Capacity, n.ResourceInfo.Usage, n.ResourceInfo.Diffs, _ = c.rmgr.GetNodeResourceInfo(ctx, node.Name, nil, false)
				// send metrics to update the usage
				_ = c.pool.Invoke(func() { c.doSendNodeMetrics(context.TODO(), n) })
				// remap all containers
				_ = c.pool.Invoke(func() { c.RemapResourceAndLog(ctx, logger, node) })
				return nil
			},
			// rollback: update node resource capacity in reverse
			func(ctx context.Context, failureByCond bool) error {
				if failureByCond {
					return nil
				}
				if len(opts.Resources) == 0 {
					return nil
				}
				_, _, err = c.rmgr.SetNodeResourceCapacity(ctx, n.Name, nil, origin, false, plugins.Decr)
				return err
			},
			c.config.GlobalTimeout)
	})
}

// filterNodes filters nodes using NodeFilter nf
// the filtering logic is introduced along with NodeFilter
// NOTE: when nf.Includes is set, the included nodes don't need to belong to podname
// update on 2021-06-21: sort and unique locks to avoid deadlock
// nodes fetched in batch come without resource info
func (c *Calcium) filterNodes(ctx context.Context, nodeFilter *types.NodeFilter) (ns []*types.Node, err error) {
	defer func() {
		if len(ns) == 0 {
			return
		}
		// sorted by nodenames
		nodenames := utils.Map(ns, func(node *types.Node) string { return node.Name })
		// unique
		p := utils.Unique(nodenames, func(i int) string { return nodenames[i] })
		ns = ns[:p]
	}()

	if len(nodeFilter.Includes) != 0 {
		for _, nodename := range nodeFilter.Includes {
			node, err := c.store.GetNode(ctx, nodename)
			if err != nil {
				return nil, err
			}
			ns = append(ns, node)
		}
		return ns, nil
	}

	listedNodes, err := c.store.GetNodesByPod(ctx, nodeFilter)
	if err != nil {
		return nil, err
	}
	if len(nodeFilter.Excludes) == 0 {
		return listedNodes, nil
	}

	excludes := map[string]struct{}{}
	for _, n := range nodeFilter.Excludes {
		excludes[n] = struct{}{}
	}

	for _, n := range listedNodes {
		if _, ok := excludes[n.Name]; ok {
			continue
		}
		ns = append(ns, n)
	}
	return ns, nil
}

func (c *Calcium) setAllWorkloadsOnNodeDown(ctx context.Context, nodename string) {
	workloads, err := c.store.ListNodeWorkloads(ctx, nodename, nil)
	logger := log.WithFunc("calcium.setAllWorkloadsOnNodeDown").WithField("node", nodename)
	if err != nil {
		logger.Errorf(ctx, err, "failed to list node workloads, node %+v", nodename)
		return
	}

	for _, workload := range workloads {
		appname, entrypoint, _, err := utils.ParseWorkloadName(workload.Name)
		if err != nil {
			logger.Errorf(ctx, err, "failed to set workload %s on node %s as inactive", workload.ID, nodename)
			continue
		}

		if workload.StatusMeta == nil {
			workload.StatusMeta = &types.StatusMeta{ID: workload.ID}
		}
		workload.StatusMeta.Running = false
		workload.StatusMeta.Healthy = false

		// these attributes are required to set the workload status
		workload.StatusMeta.Appname = appname
		workload.StatusMeta.Nodename = workload.Nodename
		workload.StatusMeta.Entrypoint = entrypoint

		// mark workloads that belong to this node as unhealthy
		if err = c.store.SetWorkloadStatus(ctx, workload.StatusMeta, 0); err != nil {
			logger.Errorf(ctx, err, "failed to set workload %s on node %s as inactive", workload.ID, nodename)
		} else {
			logger.Infof(ctx, "set workload %s on node %s as inactive", workload.ID, nodename)
		}
	}
}
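AddNode, RemoveNode and SetNode above all lean on the same utils.Txn shape: an "if" step that touches the resource plugins, a "then" step that persists metadata in the store, and a "rollback" step that undoes the first step unless the failure happened in the condition itself (failureByCond). The stand-in below is a minimal sketch of that idiom, not the real utils.Txn; its behavior is inferred from how the callers in this file check failureByCond.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// txn mirrors the if/then/rollback shape used by node.go; simplified stand-in only.
func txn(ctx context.Context, cond, then func(context.Context) error,
	rollback func(context.Context, bool) error, timeout time.Duration) error {
	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	if err := cond(ctx); err != nil {
		_ = rollback(ctx, true) // failureByCond == true: the condition itself failed
		return err
	}
	if err := then(ctx); err != nil {
		_ = rollback(ctx, false) // failureByCond == false: undo what the condition did
		return err
	}
	return nil
}

func main() {
	claimed := false
	err := txn(context.Background(),
		func(context.Context) error { claimed = true; return nil },      // if: claim resources
		func(context.Context) error { return errors.New("store down") }, // then: persist metadata (fails)
		func(_ context.Context, failureByCond bool) error { // rollback: release the claim
			if !failureByCond {
				claimed = false
			}
			return nil
		},
		time.Second)
	fmt.Println(err, claimed) // store down false
}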
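ListPodNodes enriches nodes concurrently and closes the returned channel only after every worker has sent its node, so a caller can simply range over the channel. A hedged caller-side sketch follows; the helper name, the pod name and the fmt/context imports are assumptions, c is a wired-up *Calcium.

// listAndPrint is a hypothetical helper, not part of this package's API.
func listAndPrint(ctx context.Context, c *Calcium) error {
	nodes, err := c.ListPodNodes(ctx, &types.ListNodesOptions{
		Podname:  "testpod", // placeholder pod name
		All:      true,
		CallInfo: false, // skip the extra engine Info call
	})
	if err != nil {
		return err
	}
	// the channel is closed by ListPodNodes once all nodes have been sent
	for node := range nodes {
		fmt.Printf("%s capacity=%+v usage=%+v\n",
			node.Name, node.ResourceInfo.Capacity, node.ResourceInfo.Usage)
	}
	return nil
}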
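The two branches of filterNodes can be summarised with a pair of filters; the node and pod names below are placeholders.

// Includes set: fetch exactly these nodes, the podname constraint is ignored
byName := &types.NodeFilter{Includes: []string{"node-1", "node-2"}}
// Includes empty: list the pod's nodes, then drop the excluded ones
byPod := &types.NodeFilter{Podname: "testpod", Excludes: []string{"node-3"}}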