github.com/projecteru2/core@v0.0.0-20240321043226-06bcc1c23f58/cluster/calcium/node.go

package calcium

import (
	"context"
	"sync"

	enginefactory "github.com/projecteru2/core/engine/factory"
	enginetypes "github.com/projecteru2/core/engine/types"
	"github.com/projecteru2/core/log"
	"github.com/projecteru2/core/metrics"
	"github.com/projecteru2/core/resource/plugins"
	resourcetypes "github.com/projecteru2/core/resource/types"
	"github.com/projecteru2/core/store"
	"github.com/projecteru2/core/types"
	"github.com/projecteru2/core/utils"
)

// AddNode adds a node
// node with resource info
func (c *Calcium) AddNode(ctx context.Context, opts *types.AddNodeOptions) (*types.Node, error) {
	logger := log.WithFunc("calcium.AddNode").WithField("opts", opts)
	if err := opts.Validate(); err != nil {
		logger.Error(ctx, err)
		return nil, err
	}
	var res resourcetypes.Resources
	var node *types.Node
	var err error

	// check if the node is alive
	client, err := enginefactory.GetEngine(ctx, c.config, opts.Nodename, opts.Endpoint, opts.Ca, opts.Cert, opts.Key)
	if err != nil {
		return nil, err
	}
	// get node info
	nodeInfo, err := client.Info(ctx)
	if err != nil {
		return nil, err
	}

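	// the transaction below keeps resource plugins and node metadata in sync:
	// register the node with the resource plugins first, then persist the node meta;
	// if persisting fails, the resource registration is rolled back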
	return node, utils.Txn(
		ctx,
		// if: add node resource with resource plugins
		func(ctx context.Context) error {
			res, err = c.rmgr.AddNode(ctx, opts.Nodename, opts.Resources, nodeInfo)
			return err
		},
		// then: add node meta in store
		func(ctx context.Context) error {
			node, err = c.store.AddNode(ctx, opts)
			if err != nil {
				return err
			}
			node.ResourceInfo.Capacity = res
			_ = c.pool.Invoke(func() { c.doSendNodeMetrics(context.TODO(), node) })
			return nil
		},
		// rollback: remove node with resource plugins
		func(ctx context.Context, failureByCond bool) error {
			if failureByCond {
				return nil
			}
			return c.rmgr.RemoveNode(ctx, opts.Nodename)
		},
		c.config.GlobalTimeout)
}

// RemoveNode removes a node
func (c *Calcium) RemoveNode(ctx context.Context, nodename string) error {
	logger := log.WithFunc("calcium.RemoveNode").WithField("node", nodename)
	if nodename == "" {
		logger.Error(ctx, types.ErrEmptyNodeName)
		return types.ErrEmptyNodeName
	}
	return c.withNodePodLocked(ctx, nodename, func(ctx context.Context, node *types.Node) error {
		workloads, err := c.ListNodeWorkloads(ctx, node.Name, nil)
		if err != nil {
			logger.Error(ctx, err)
			return err
		}
		// need to drain the node first
		if len(workloads) > 0 {
			logger.Error(ctx, types.ErrNodeNotEmpty)
			return types.ErrNodeNotEmpty
		}

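		// remove the node metadata first, then clean up resource plugin data,
		// the cached engine client and the exported metrics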
		return utils.Txn(ctx,
			// if: remove node metadata
			func(ctx context.Context) error {
				return c.store.RemoveNode(ctx, node)
			},
			// then: remove node resource metadata
			func(ctx context.Context) error {
				if err := c.rmgr.RemoveNode(ctx, nodename); err != nil {
					return err
				}
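				// evict the cached engine client and drop the node from exported metrics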
				enginefactory.RemoveEngineFromCache(ctx, node.Endpoint, node.Ca, node.Cert, node.Key)
				metrics.Client.RemoveInvalidNodes(nodename)
				return nil
			},
			// rollback: do nothing
			func(_ context.Context, _ bool) error {
				return nil
			},
			c.config.GlobalTimeout)
	})
}

// ListPodNodes lists nodes belonging to a pod
// node with resource info
func (c *Calcium) ListPodNodes(ctx context.Context, opts *types.ListNodesOptions) (<-chan *types.Node, error) {
	logger := log.WithFunc("calcium.ListPodNodes").WithField("podname", opts.Podname).WithField("labels", opts.Labels).WithField("all", opts.All).WithField("info", opts.CallInfo)
	nf := &types.NodeFilter{Podname: opts.Podname, Labels: opts.Labels, All: opts.All}
	var (
		nodes []*types.Node
		err   error
	)
	if opts.CallInfo {
		nodes, err = c.store.GetNodesByPod(ctx, nf)
	} else {
		nodes, err = c.store.GetNodesByPod(ctx, nf, store.WithoutEngineOption())
	}
	if err != nil {
		logger.Error(ctx, err)
		return nil, err
	}
	ch := make(chan *types.Node)

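	// fetch resource info (and optionally engine info) for every node concurrently
	// and stream the results; the channel is closed once all workers are done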
	_ = c.pool.Invoke(func() {
		defer close(ch)
		wg := &sync.WaitGroup{}
		wg.Add(len(nodes))
		defer wg.Wait()
		for _, node := range nodes {
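			// capture the loop variable for the closure below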
			node := node
			_ = c.pool.Invoke(func() {
				defer wg.Done()
				var err error
				if node.ResourceInfo.Capacity, node.ResourceInfo.Usage, node.ResourceInfo.Diffs, err = c.rmgr.GetNodeResourceInfo(ctx, node.Name, nil, false); err != nil {
					logger.Errorf(ctx, err, "failed to get node %+v resource info", node.Name)
				}
				if opts.CallInfo {
					if err := node.Info(ctx); err != nil {
						logger.Errorf(ctx, err, "failed to get node %+v info", node.Name)
					}
				}
				ch <- node
			})
		}
	})

	return ch, nil
}

// GetNode gets a node
// node with resource info
func (c *Calcium) GetNode(ctx context.Context, nodename string) (node *types.Node, err error) {
	logger := log.WithFunc("calcium.GetNode").WithField("node", nodename)
	if nodename == "" {
		logger.Error(ctx, types.ErrEmptyNodeName)
		return nil, types.ErrEmptyNodeName
	}
	if node, err = c.store.GetNode(ctx, nodename); err != nil {
		logger.Error(ctx, err)
		return nil, err
	}
	if node.ResourceInfo.Capacity, node.ResourceInfo.Usage, node.ResourceInfo.Diffs, err = c.rmgr.GetNodeResourceInfo(ctx, node.Name, nil, false); err != nil {
		logger.Error(ctx, err)
		return nil, err
	}
	return node, nil
}

// GetNodeEngineInfo gets node engine info
func (c *Calcium) GetNodeEngineInfo(ctx context.Context, nodename string) (*enginetypes.Info, error) {
	logger := log.WithFunc("calcium.GetNodeEngineInfo").WithField("node", nodename)
	if nodename == "" {
		logger.Error(ctx, types.ErrEmptyNodeName)
		return nil, types.ErrEmptyNodeName
	}
	node, err := c.store.GetNode(ctx, nodename)
	if err != nil {
		logger.Error(ctx, err)
		return nil, err
	}
	engineInfo, err := node.Engine.Info(ctx)
	logger.Error(ctx, err)
	return engineInfo, err
}

// SetNode sets a node available or not
// node with resource info
func (c *Calcium) SetNode(ctx context.Context, opts *types.SetNodeOptions) (*types.Node, error) {
	logger := log.WithFunc("calcium.SetNode").WithField("opts", opts)
	if err := opts.Validate(); err != nil {
		logger.Error(ctx, err)
		return nil, err
	}
	var n *types.Node
	return n, c.withNodePodLocked(ctx, opts.Nodename, func(ctx context.Context, node *types.Node) error {
		logger.Info(ctx, "set node")
		// update resource map
		var err error
		node.ResourceInfo.Capacity, node.ResourceInfo.Usage, node.ResourceInfo.Diffs, err = c.rmgr.GetNodeResourceInfo(ctx, node.Name, nil, false)
		if err != nil {
			return err
		}
		n = node

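		// Bypass is tri-state: TriTrue enables it, TriFalse disables it, TriKeep leaves it unchanged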
		n.Bypass = (opts.Bypass == types.TriTrue) || (opts.Bypass == types.TriKeep && n.Bypass)
		if n.IsDown() {
			logger.Warnf(ctx, "node marked down: %s", opts.Nodename)
		}

		if opts.WorkloadsDown {
			c.setAllWorkloadsOnNodeDown(ctx, n.Name)
		}

		// update node endpoint
		if opts.Endpoint != "" {
			n.Endpoint = opts.Endpoint
		}
		// update ca / cert / key
		n.Ca = opts.Ca
		n.Cert = opts.Cert
		n.Key = opts.Key
		// update labels
		if len(opts.Labels) != 0 {
			n.Labels = opts.Labels
		}

		var origin resourcetypes.Resources
		return utils.Txn(ctx,
			// if: update node resource capacity success
			func(ctx context.Context) error {
				if len(opts.Resources) == 0 {
					return nil
				}
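				// keep the returned origin so the rollback below can undo this capacity change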
				origin, _, err = c.rmgr.SetNodeResourceCapacity(ctx, n.Name, nil, opts.Resources, opts.Delta, plugins.Incr)
				return err
			},
			// then: update node metadata
			func(ctx context.Context) error {
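				// the endpoint or TLS credentials may have changed, so drop the cached engine client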
				defer enginefactory.RemoveEngineFromCache(ctx, node.Endpoint, node.Ca, node.Cert, node.Key)
				if err := c.store.UpdateNodes(ctx, n); err != nil {
					return err
				}
				// refresh resource info
				// the error can be ignored here since the metadata update already succeeded
				n.ResourceInfo.Capacity, n.ResourceInfo.Usage, n.ResourceInfo.Diffs, _ = c.rmgr.GetNodeResourceInfo(ctx, node.Name, nil, false)
				// send metrics to update the usage
				_ = c.pool.Invoke(func() { c.doSendNodeMetrics(context.TODO(), n) })
				// remap all containers
				_ = c.pool.Invoke(func() { c.RemapResourceAndLog(ctx, logger, node) })
				return nil
			},
			// rollback: update node resource capacity in reverse
			func(ctx context.Context, failureByCond bool) error {
				if failureByCond {
					return nil
				}
				if len(opts.Resources) == 0 {
					return nil
				}
				_, _, err = c.rmgr.SetNodeResourceCapacity(ctx, n.Name, nil, origin, false, plugins.Decr)
				return err
			},
			c.config.GlobalTimeout)
	})
}

// filterNodes filters nodes using the given NodeFilter
// the filtering logic is introduced along with NodeFilter
// NOTE: when nodeFilter.Includes is set, the included nodes don't need to belong to the pod
// update on 2021-06-21: sort and unique locks to avoid deadlock
// nodes are returned without resource info when fetched in batch
func (c *Calcium) filterNodes(ctx context.Context, nodeFilter *types.NodeFilter) (ns []*types.Node, err error) {
	defer func() {
		if len(ns) == 0 {
			return
		}
		// sorted by nodenames
		nodenames := utils.Map(ns, func(node *types.Node) string { return node.Name })
		// unique
		p := utils.Unique(nodenames, func(i int) string { return nodenames[i] })
		ns = ns[:p]
	}()

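	// explicit Includes take precedence: fetch those nodes directly and skip the pod listing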
	if len(nodeFilter.Includes) != 0 {
		for _, nodename := range nodeFilter.Includes {
			node, err := c.store.GetNode(ctx, nodename)
			if err != nil {
				return nil, err
			}
			ns = append(ns, node)
		}
		return ns, nil
	}

	listedNodes, err := c.store.GetNodesByPod(ctx, nodeFilter)
	if err != nil {
		return nil, err
	}
	if len(nodeFilter.Excludes) == 0 {
		return listedNodes, nil
	}

	excludes := map[string]struct{}{}
	for _, n := range nodeFilter.Excludes {
		excludes[n] = struct{}{}
	}

	for _, n := range listedNodes {
		if _, ok := excludes[n.Name]; ok {
			continue
		}
		ns = append(ns, n)
	}
	return ns, nil
}

func (c *Calcium) setAllWorkloadsOnNodeDown(ctx context.Context, nodename string) {
	workloads, err := c.store.ListNodeWorkloads(ctx, nodename, nil)
	logger := log.WithFunc("calcium.setAllWorkloadsOnNodeDown").WithField("node", nodename)
	if err != nil {
		logger.Errorf(ctx, err, "failed to list node workloads, node %+v", nodename)
		return
	}

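	// flip every workload on this node to not running / unhealthy and persist its status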
	for _, workload := range workloads {
		appname, entrypoint, _, err := utils.ParseWorkloadName(workload.Name)
		if err != nil {
			logger.Errorf(ctx, err, "set workload %s on node %s as inactive failed", workload.ID, nodename)
			continue
		}

		if workload.StatusMeta == nil {
			workload.StatusMeta = &types.StatusMeta{ID: workload.ID}
		}
		workload.StatusMeta.Running = false
		workload.StatusMeta.Healthy = false

		// set these attributes so the workload status can be stored
		workload.StatusMeta.Appname = appname
		workload.StatusMeta.Nodename = workload.Nodename
		workload.StatusMeta.Entrypoint = entrypoint

		// mark workload which belongs to this node as unhealthy
		if err = c.store.SetWorkloadStatus(ctx, workload.StatusMeta, 0); err != nil {
			logger.Errorf(ctx, err, "set workload %s on node %s as inactive failed", workload.ID, nodename)
		} else {
			logger.Infof(ctx, "set workload %s on node %s as inactive", workload.ID, nodename)
		}
	}
}