github.com/projecteru2/core@v0.0.0-20240321043226-06bcc1c23f58/resource/cobalt/node.go (about)

     1  package cobalt
     2  
     3  import (
     4  	"context"
     5  	"math"
     6  
     7  	"github.com/cockroachdb/errors"
     8  	enginetypes "github.com/projecteru2/core/engine/types"
     9  	plugintypes "github.com/projecteru2/core/resource/plugins/types"
    10  	resourcetypes "github.com/projecteru2/core/resource/types"
    11  	"github.com/sanity-io/litter"
    12  	"golang.org/x/exp/slices"
    13  
    14  	"github.com/projecteru2/core/log"
    15  	"github.com/projecteru2/core/resource/plugins"
    16  	"github.com/projecteru2/core/types"
    17  	"github.com/projecteru2/core/utils"
    18  )
    19  
    20  // AddNode .
    21  func (m Manager) AddNode(ctx context.Context, nodename string, opts resourcetypes.Resources, nodeInfo *enginetypes.Info) (resourcetypes.Resources, error) {
    22  	logger := log.WithFunc("resource.cobalt.AddNode").WithField("node", nodename)
    23  	res := resourcetypes.Resources{}
    24  	rollbackPlugins := []plugins.Plugin{}
    25  
    26  	return res, utils.PCR(ctx,
    27  		// prepare: do nothing
    28  		func(_ context.Context) error {
    29  			return nil
    30  		},
    31  		// commit: call plugins to add the node
    32  		func(ctx context.Context) error {
    33  			resps, err := call(ctx, m.plugins, func(plugin plugins.Plugin) (*plugintypes.AddNodeResponse, error) {
    34  				r := opts[plugin.Name()]
    35  				// Even when r==nil, we still need to run plugin,
    36  				// The reasons are as follows
    37  				// 1. plugin can fetch config from engine info
    38  				// 2. plugin need a chance to create empty config on ETCD.
    39  				logger.WithField("plugin", plugin.Name()).WithField("node", nodename).Infof(ctx, "%v", litter.Sdump(r))
    40  				resp, err := plugin.AddNode(ctx, nodename, r, nodeInfo)
    41  				if err != nil {
    42  					logger.Errorf(ctx, err, "node %+v plugin %+v failed to add node, req: %+v", nodename, plugin.Name(), litter.Sdump(r))
    43  				}
    44  				return resp, err
    45  			})
    46  
    47  			if err != nil {
    48  				for plugin := range resps {
    49  					rollbackPlugins = append(rollbackPlugins, plugin)
    50  				}
    51  				return err
    52  			}
    53  
    54  			for plugin, resp := range resps {
    55  				res[plugin.Name()] = resp.Capacity
    56  			}
    57  			return nil
    58  		},
    59  		// rollback: remove node
    60  		func(ctx context.Context) error {
    61  			_, err := call(ctx, rollbackPlugins, func(plugin plugins.Plugin) (*plugintypes.RemoveNodeResponse, error) {
    62  				resp, err := plugin.RemoveNode(ctx, nodename)
    63  				if err != nil {
    64  					logger.Errorf(ctx, err, "node %+v plugin %+v failed to rollback", nodename, plugin.Name())
    65  				}
    66  				return resp, err
    67  			})
    68  
    69  			if err != nil {
    70  				logger.Error(ctx, err, "failed to rollback")
    71  			}
    72  			return err
    73  		},
    74  		m.config.GlobalTimeout,
    75  	)
    76  }
    77  
    78  // RemoveNode .
    79  func (m Manager) RemoveNode(ctx context.Context, nodename string) error {
    80  	logger := log.WithFunc("resource.cobalt.RemoveNode").WithField("node", nodename)
    81  	var nodeCapacity resourcetypes.Resources
    82  	var nodeUsage resourcetypes.Resources
    83  	rollbackPlugins := []plugins.Plugin{}
    84  
    85  	return utils.PCR(ctx,
    86  		// prepare: get node resource
    87  		func(ctx context.Context) error {
    88  			var err error
    89  			nodeCapacity, nodeUsage, _, err = m.GetNodeResourceInfo(ctx, nodename, nil, false)
    90  			if err != nil {
    91  				logger.Error(ctx, err, "failed to get node resource")
    92  				return err
    93  			}
    94  			return nil
    95  		},
    96  		// commit: remove node
    97  		func(ctx context.Context) error {
    98  			resps, err := call(ctx, m.plugins, func(plugin plugins.Plugin) (*plugintypes.RemoveNodeResponse, error) {
    99  				resp, err := plugin.RemoveNode(ctx, nodename)
   100  				if err != nil {
   101  					logger.Errorf(ctx, err, "plugin %+v failed to remove node", plugin.Name())
   102  				}
   103  				return resp, err
   104  			})
   105  
   106  			if err != nil {
   107  				for plugin := range resps {
   108  					rollbackPlugins = append(rollbackPlugins, plugin)
   109  				}
   110  
   111  				logger.Error(ctx, err, "failed to remove node")
   112  				return err
   113  			}
   114  			return nil
   115  		},
   116  		// rollback: add node
   117  		func(ctx context.Context) error {
   118  			_, err := call(ctx, rollbackPlugins, func(plugin plugins.Plugin) (*plugintypes.SetNodeResourceInfoResponse, error) {
   119  				capacity := nodeCapacity[plugin.Name()]
   120  				usage := nodeUsage[plugin.Name()]
   121  
   122  				resp, err := plugin.SetNodeResourceInfo(ctx, nodename, capacity, usage)
   123  				if err != nil {
   124  					logger.Errorf(ctx, err, "plugin %+v node %+v failed to rollback", plugin.Name(), nodename)
   125  				}
   126  				return resp, err
   127  			})
   128  
   129  			if err != nil {
   130  				logger.Error(ctx, err, "failed to rollback")
   131  			}
   132  			return err
   133  		},
   134  		m.config.GlobalTimeout,
   135  	)
   136  }
   137  
   138  // GetMostIdleNode .
   139  func (m Manager) GetMostIdleNode(ctx context.Context, nodenames []string) (string, error) {
   140  	logger := log.WithFunc("resource.cobalt.GetMostIdleNode")
   141  	if len(nodenames) == 0 {
   142  		return "", errors.Wrap(types.ErrGetMostIdleNodeFailed, "empty node names")
   143  	}
   144  
   145  	resps, err := call(ctx, m.plugins, func(plugin plugins.Plugin) (*plugintypes.GetMostIdleNodeResponse, error) {
   146  		resp, err := plugin.GetMostIdleNode(ctx, nodenames)
   147  		if err != nil {
   148  			logger.Errorf(ctx, err, "plugin %+v failed to get the most idle node of %+v", plugin.Name(), nodenames)
   149  		}
   150  		return resp, err
   151  	})
   152  	if err != nil {
   153  		logger.Errorf(ctx, err, "failed to get the most idle node of %+v", nodenames)
   154  		return "", err
   155  	}
   156  
   157  	var mostIdleNode *plugintypes.GetMostIdleNodeResponse
   158  	for _, resp := range resps {
   159  		if (mostIdleNode == nil || resp.Priority > mostIdleNode.Priority) && len(resp.Nodename) > 0 {
   160  			mostIdleNode = resp
   161  		}
   162  	}
   163  
   164  	if mostIdleNode == nil {
   165  		return "", types.ErrGetMostIdleNodeFailed
   166  	}
   167  	return mostIdleNode.Nodename, nil
   168  }
   169  
   170  // GetNodeResourceInfo .
   171  func (m Manager) GetNodeResourceInfo(ctx context.Context, nodename string, workloads []*types.Workload, fix bool) (resourcetypes.Resources, resourcetypes.Resources, []string, error) {
   172  	nodeCapacity := resourcetypes.Resources{}
   173  	nodeUsage := resourcetypes.Resources{}
   174  	resourceDiffs := []string{}
   175  
   176  	ps := m.plugins
   177  	if m.config.ResourcePlugin.Whitelist != nil {
   178  		ps = utils.Filter(ps, func(plugin plugins.Plugin) bool {
   179  			return slices.Contains(m.config.ResourcePlugin.Whitelist, plugin.Name())
   180  		})
   181  	}
   182  
   183  	resps, err := call(ctx, ps, func(plugin plugins.Plugin) (*plugintypes.GetNodeResourceInfoResponse, error) {
   184  		var resp *plugintypes.GetNodeResourceInfoResponse
   185  		var err error
   186  
   187  		wrks := []plugintypes.WorkloadResource{}
   188  
   189  		for _, wrk := range workloads {
   190  			r := wrk.Resources[plugin.Name()]
   191  			wrks = append(wrks, r)
   192  		}
   193  
   194  		if fix {
   195  			resp, err = plugin.FixNodeResource(ctx, nodename, wrks)
   196  		} else {
   197  			resp, err = plugin.GetNodeResourceInfo(ctx, nodename, wrks)
   198  		}
   199  		if err != nil {
   200  			log.WithFunc("resource.cobalt.GetNodeResourceInfo").WithField("node", nodename).Errorf(ctx, err, "plugin %+v failed to get node resource", plugin.Name())
   201  		}
   202  		return resp, err
   203  	})
   204  
   205  	if err != nil {
   206  		return nil, nil, nil, err
   207  	}
   208  
   209  	for plugin, resp := range resps {
   210  		nodeCapacity[plugin.Name()] = resp.Capacity
   211  		nodeUsage[plugin.Name()] = resp.Usage
   212  		resourceDiffs = append(resourceDiffs, resp.Diffs...)
   213  	}
   214  
   215  	return nodeCapacity, nodeUsage, resourceDiffs, nil
   216  }
   217  
   218  // SetNodeResourceUsage .
   219  func (m Manager) SetNodeResourceUsage(ctx context.Context, nodename string, nodeResource resourcetypes.Resources, nodeResourceRequest resourcetypes.Resources, workloadsResource []resourcetypes.Resources, delta bool, incr bool) (resourcetypes.Resources, resourcetypes.Resources, error) {
   220  	logger := log.WithFunc("resource.cobalt.SetNodeResourceUsage").WithField("node", nodename)
   221  	wrksResource := map[string][]resourcetypes.RawParams{}
   222  	rollbackPlugins := []plugins.Plugin{}
   223  	before := resourcetypes.Resources{}
   224  	after := resourcetypes.Resources{}
   225  
   226  	return before, after, utils.PCR(ctx,
   227  		func(_ context.Context) error {
   228  			// prepare: covert []resourcetypes.Resources to map[plugin]resourcetypes.Resources
   229  			// [{"cpu-plugin": {"cpu": 1}}, {"cpu-plugin": {"cpu": 1}}] -> {"cpu-plugin": [{"cpu": 1}, {"cpu": 1}]}
   230  			for _, workloadResource := range workloadsResource {
   231  				for plugin, params := range workloadResource {
   232  					if _, ok := wrksResource[plugin]; !ok {
   233  						wrksResource[plugin] = []resourcetypes.RawParams{}
   234  					}
   235  					wrksResource[plugin] = append(wrksResource[plugin], params)
   236  				}
   237  			}
   238  			if nodeResourceRequest == nil {
   239  				nodeResourceRequest = resourcetypes.Resources{}
   240  			}
   241  			return nil
   242  		},
   243  		// commit: call plugins to set node resource
   244  		func(ctx context.Context) error {
   245  			resps, err := call(ctx, m.plugins, func(plugin plugins.Plugin) (*plugintypes.SetNodeResourceUsageResponse, error) {
   246  				return plugin.SetNodeResourceUsage(ctx, nodename, nodeResource[plugin.Name()], nodeResourceRequest[plugin.Name()], wrksResource[plugin.Name()], delta, incr)
   247  			})
   248  
   249  			if err != nil {
   250  				for plugin, resp := range resps {
   251  					rollbackPlugins = append(rollbackPlugins, plugin)
   252  					before[plugin.Name()] = resp.Before
   253  					after[plugin.Name()] = resp.After
   254  				}
   255  				logger.Error(ctx, err, "failed to set node resource")
   256  			}
   257  			return err
   258  		},
   259  		// rollback: set the rollback resource args in reverse
   260  		func(ctx context.Context) error {
   261  			_, err := call(ctx, rollbackPlugins, func(plugin plugins.Plugin) (*plugintypes.SetNodeResourceUsageResponse, error) {
   262  				resp, err := plugin.SetNodeResourceUsage(ctx, nodename, before[plugin.Name()], nil, nil, false, false)
   263  				if err != nil {
   264  					logger.Errorf(ctx, err, "node %+v plugin %+v failed to rollback node resource", nodename, plugin.Name())
   265  				}
   266  				return resp, err
   267  			})
   268  			return err
   269  		},
   270  		m.config.GlobalTimeout,
   271  	)
   272  }
   273  
   274  // GetNodesDeployCapacity returns available nodes which meet all the requirements
   275  // the caller should require locks
   276  // pure calculation
   277  func (m Manager) GetNodesDeployCapacity(ctx context.Context, nodenames []string, opts resourcetypes.Resources) (map[string]*plugintypes.NodeDeployCapacity, int, error) {
   278  	logger := log.WithFunc("resource.cobalt.GetNodesDeployCapacity")
   279  	var resp map[string]*plugintypes.NodeDeployCapacity
   280  
   281  	resps, err := call(ctx, m.plugins, func(plugin plugins.Plugin) (*plugintypes.GetNodesDeployCapacityResponse, error) {
   282  		resp, err := plugin.GetNodesDeployCapacity(ctx, nodenames, opts[plugin.Name()])
   283  		if err != nil {
   284  			logger.Errorf(ctx, err, "plugin %+v failed to get available nodenames, request %+v", plugin.Name(), opts[plugin.Name()])
   285  		}
   286  		return resp, err
   287  	})
   288  	if err != nil {
   289  		return nil, 0, err
   290  	}
   291  
   292  	// get nodenames with all resource capacities > 0
   293  	for _, info := range resps {
   294  		resp = m.mergeCapacity(resp, info.NodeDeployCapacityMap)
   295  	}
   296  	total := 0
   297  
   298  	// weighted average
   299  	for _, info := range resp {
   300  		info.Rate /= info.Weight
   301  		info.Usage /= info.Weight
   302  		if info.Capacity == math.MaxInt64 {
   303  			total = math.MaxInt64
   304  		} else {
   305  			total += info.Capacity
   306  		}
   307  	}
   308  
   309  	return resp, total, nil
   310  }
   311  
   312  // SetNodeResourceCapacity updates node resource capacity
   313  // receives resource options instead of resource args
   314  func (m Manager) SetNodeResourceCapacity(ctx context.Context, nodename string, nodeResource resourcetypes.Resources, nodeResourceRequest resourcetypes.Resources, delta bool, incr bool) (resourcetypes.Resources, resourcetypes.Resources, error) {
   315  	logger := log.WithFunc("resource.cobalt.SetNodeResourceCapacity").WithField("node", nodename)
   316  
   317  	rollbackPlugins := []plugins.Plugin{}
   318  	before := resourcetypes.Resources{}
   319  	after := resourcetypes.Resources{}
   320  
   321  	return before, after, utils.PCR(ctx,
   322  		func(_ context.Context) error {
   323  			if nodeResourceRequest == nil {
   324  				nodeResourceRequest = resourcetypes.Resources{}
   325  			}
   326  			return nil
   327  		},
   328  		// commit: call plugins to set node resource
   329  		func(ctx context.Context) error {
   330  			resps, err := call(ctx, m.plugins, func(plugin plugins.Plugin) (*plugintypes.SetNodeResourceCapacityResponse, error) {
   331  				if nodeResource[plugin.Name()] == nil && nodeResourceRequest[plugin.Name()] == nil {
   332  					return nil, nil
   333  				}
   334  				resp, err := plugin.SetNodeResourceCapacity(ctx, nodename, nodeResource[plugin.Name()], nodeResourceRequest[plugin.Name()], delta, incr)
   335  				if err != nil {
   336  					logger.Errorf(ctx, err, "plugin %+v failed to set node resource capacity", plugin.Name())
   337  				}
   338  				return resp, err
   339  			})
   340  
   341  			if err != nil {
   342  				for plugin, resp := range resps {
   343  					if resp == nil {
   344  						continue
   345  					}
   346  					rollbackPlugins = append(rollbackPlugins, plugin)
   347  					before[plugin.Name()] = resp.Before
   348  					after[plugin.Name()] = resp.After
   349  				}
   350  				logger.Errorf(ctx, err, "failed to set node resource for node %+v", nodename)
   351  				return err
   352  			}
   353  			return nil
   354  		},
   355  		// rollback: set the rollback resource args in reverse
   356  		func(ctx context.Context) error {
   357  			_, err := call(ctx, rollbackPlugins, func(plugin plugins.Plugin) (*plugintypes.SetNodeResourceCapacityResponse, error) {
   358  				resp, err := plugin.SetNodeResourceCapacity(ctx, nodename, nil, before[plugin.Name()], false, false)
   359  				if err != nil {
   360  					logger.Errorf(ctx, err, "node %+v plugin %+v failed to rollback node resource capacity", nodename, plugin.Name())
   361  				}
   362  				return resp, err
   363  			})
   364  			return err
   365  		},
   366  		m.config.GlobalTimeout,
   367  	)
   368  }
   369  
   370  func (m Manager) mergeCapacity(m1 map[string]*plugintypes.NodeDeployCapacity, m2 map[string]*plugintypes.NodeDeployCapacity) map[string]*plugintypes.NodeDeployCapacity {
   371  	if m1 == nil {
   372  		return m2
   373  	}
   374  
   375  	resp := map[string]*plugintypes.NodeDeployCapacity{}
   376  	for nodename, info1 := range m1 {
   377  		// all the capacities should > 0
   378  		if info2, ok := m2[nodename]; ok {
   379  			resp[nodename] = &plugintypes.NodeDeployCapacity{
   380  				Capacity: utils.Min(info1.Capacity, info2.Capacity),
   381  				Rate:     info1.Rate + info2.Rate*info2.Weight,
   382  				Usage:    info1.Usage + info2.Usage*info2.Weight,
   383  				Weight:   info1.Weight + info2.Weight,
   384  			}
   385  		}
   386  	}
   387  	return resp
   388  }