volcano.sh/volcano@v1.9.0/pkg/scheduler/framework/statement.go (about)

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package framework
    18  
    19  import (
    20  	"fmt"
    21  
    22  	"k8s.io/klog/v2"
    23  
    24  	"volcano.sh/volcano/pkg/scheduler/api"
    25  	"volcano.sh/volcano/pkg/scheduler/metrics"
    26  )
    27  
    28  // Operation type
    29  type Operation int8
    30  
    31  const (
    32  	// Evict op
    33  	Evict = iota
    34  	// Pipeline op
    35  	Pipeline
    36  	// Allocate op
    37  	Allocate
    38  )
    39  
    40  type operation struct {
    41  	name   Operation
    42  	task   *api.TaskInfo
    43  	reason string
    44  }
    45  
    46  // Statement structure
    47  type Statement struct {
    48  	operations []operation
    49  	ssn        *Session
    50  }
    51  
    52  // NewStatement returns new statement object
    53  func NewStatement(ssn *Session) *Statement {
    54  	return &Statement{
    55  		ssn: ssn,
    56  	}
    57  }
    58  
    59  // Evict the pod
    60  func (s *Statement) Evict(reclaimee *api.TaskInfo, reason string) error {
    61  	// Update status in session
    62  	if job, found := s.ssn.Jobs[reclaimee.Job]; found {
    63  		if err := job.UpdateTaskStatus(reclaimee, api.Releasing); err != nil {
    64  			klog.Errorf("Failed to update task <%v/%v> status to %v when evicting in Session <%v>: %v",
    65  				reclaimee.Namespace, reclaimee.Name, api.Releasing, s.ssn.UID, err)
    66  		}
    67  	} else {
    68  		klog.Errorf("Failed to find Job <%s> in Session <%s> index when evicting.",
    69  			reclaimee.Job, s.ssn.UID)
    70  	}
    71  
    72  	// Update task in node.
    73  	if node, found := s.ssn.Nodes[reclaimee.NodeName]; found {
    74  		err := node.UpdateTask(reclaimee)
    75  		if err != nil {
    76  			klog.Errorf("Failed to update task <%v/%v> in node %v for: %s",
    77  				reclaimee.Namespace, reclaimee.Name, reclaimee.NodeName, err.Error())
    78  			return err
    79  		}
    80  	}
    81  
    82  	for _, eh := range s.ssn.eventHandlers {
    83  		if eh.DeallocateFunc != nil {
    84  			eh.DeallocateFunc(&Event{
    85  				Task: reclaimee,
    86  			})
    87  		}
    88  	}
    89  
    90  	s.operations = append(s.operations, operation{
    91  		name:   Evict,
    92  		task:   reclaimee,
    93  		reason: reason,
    94  	})
    95  
    96  	return nil
    97  }
    98  
    99  func (s *Statement) evict(reclaimee *api.TaskInfo, reason string) error {
   100  	if err := s.ssn.cache.Evict(reclaimee, reason); err != nil {
   101  		if e := s.unevict(reclaimee); e != nil {
   102  			klog.Errorf("Faled to unevict task <%v/%v>: %v.", reclaimee.Namespace, reclaimee.Name, e)
   103  		}
   104  		return err
   105  	}
   106  
   107  	return nil
   108  }
   109  
   110  func (s *Statement) unevict(reclaimee *api.TaskInfo) error {
   111  	// Update status in session
   112  	job, found := s.ssn.Jobs[reclaimee.Job]
   113  	if found {
   114  		if err := job.UpdateTaskStatus(reclaimee, api.Running); err != nil {
   115  			klog.Errorf("Failed to update task <%v/%v> status to %v when unevicting in Session <%v>: %v",
   116  				reclaimee.Namespace, reclaimee.Name, api.Running, s.ssn.UID, err)
   117  		}
   118  	} else {
   119  		klog.Errorf("Failed to find Job <%s> in Session <%s> index when unevicting.",
   120  			reclaimee.Job, s.ssn.UID)
   121  	}
   122  
   123  	// Update task in node.
   124  	if node, found := s.ssn.Nodes[reclaimee.NodeName]; found {
   125  		err := node.UpdateTask(reclaimee)
   126  		if err != nil {
   127  			klog.Errorf("Failed to update task <%v/%v> in node %v for: %s",
   128  				reclaimee.Namespace, reclaimee.Name, reclaimee.NodeName, err.Error())
   129  			return err
   130  		}
   131  	}
   132  
   133  	for _, eh := range s.ssn.eventHandlers {
   134  		if eh.AllocateFunc != nil {
   135  			eh.AllocateFunc(&Event{
   136  				Task: reclaimee,
   137  			})
   138  		}
   139  	}
   140  
   141  	return nil
   142  }
   143  
   144  // Pipeline the task for the node
   145  func (s *Statement) Pipeline(task *api.TaskInfo, hostname string) error {
   146  	job, found := s.ssn.Jobs[task.Job]
   147  	if found {
   148  		if err := job.UpdateTaskStatus(task, api.Pipelined); err != nil {
   149  			klog.Errorf("Failed to update task <%v/%v> status to %v when pipeline in Session <%v>: %v",
   150  				task.Namespace, task.Name, api.Pipelined, s.ssn.UID, err)
   151  		}
   152  	} else {
   153  		klog.Errorf("Failed to find Job <%s> in Session <%s> index when pipeline.",
   154  			task.Job, s.ssn.UID)
   155  	}
   156  
   157  	task.NodeName = hostname
   158  
   159  	if node, found := s.ssn.Nodes[hostname]; found {
   160  		if err := node.AddTask(task); err != nil {
   161  			klog.Errorf("Failed to add task <%v/%v> to node <%v> when pipeline in Session <%v>: %v",
   162  				task.Namespace, task.Name, hostname, s.ssn.UID, err)
   163  		}
   164  		klog.V(3).Infof("After pipelined Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
   165  			task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
   166  	} else {
   167  		klog.Errorf("Failed to find Node <%s> in Session <%s> index when pipeline.",
   168  			hostname, s.ssn.UID)
   169  	}
   170  
   171  	for _, eh := range s.ssn.eventHandlers {
   172  		if eh.AllocateFunc != nil {
   173  			eh.AllocateFunc(&Event{
   174  				Task: task,
   175  			})
   176  		}
   177  	}
   178  
   179  	s.operations = append(s.operations, operation{
   180  		name: Pipeline,
   181  		task: task,
   182  	})
   183  
   184  	return nil
   185  }
   186  
   187  func (s *Statement) pipeline(task *api.TaskInfo) {
   188  }
   189  
   190  func (s *Statement) UnPipeline(task *api.TaskInfo) error {
   191  	job, found := s.ssn.Jobs[task.Job]
   192  	if found {
   193  		if err := job.UpdateTaskStatus(task, api.Pending); err != nil {
   194  			klog.Errorf("Failed to update task <%v/%v> status to %v when unpipeline in Session <%v>: %v",
   195  				task.Namespace, task.Name, api.Pending, s.ssn.UID, err)
   196  		}
   197  	} else {
   198  		klog.Errorf("Failed to find Job <%s> in Session <%s> index when unpipeline.", task.Job, s.ssn.UID)
   199  	}
   200  
   201  	if node, found := s.ssn.Nodes[task.NodeName]; found {
   202  		if err := node.RemoveTask(task); err != nil {
   203  			klog.Errorf("Failed to remove task <%v/%v> to node <%v> when unpipeline in Session <%v>: %v",
   204  				task.Namespace, task.Name, task.NodeName, s.ssn.UID, err)
   205  		}
   206  		klog.V(3).Infof("After unpipelined Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
   207  			task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
   208  	} else {
   209  		klog.Errorf("Failed to find Node <%s> in Session <%s> index when unpipeline.",
   210  			task.NodeName, s.ssn.UID)
   211  	}
   212  
   213  	for _, eh := range s.ssn.eventHandlers {
   214  		if eh.DeallocateFunc != nil {
   215  			eh.DeallocateFunc(&Event{
   216  				Task: task,
   217  			})
   218  		}
   219  	}
   220  	task.NodeName = ""
   221  
   222  	return nil
   223  }
   224  
   225  // Allocate the task to node
   226  func (s *Statement) Allocate(task *api.TaskInfo, nodeInfo *api.NodeInfo) (err error) {
   227  	podVolumes, err := s.ssn.cache.GetPodVolumes(task, nodeInfo.Node)
   228  	if err != nil {
   229  		return err
   230  	}
   231  
   232  	hostname := nodeInfo.Name
   233  	if err := s.ssn.cache.AllocateVolumes(task, hostname, podVolumes); err != nil {
   234  		return err
   235  	}
   236  	defer func() {
   237  		if err != nil {
   238  			s.ssn.cache.RevertVolumes(task, podVolumes)
   239  		}
   240  	}()
   241  
   242  	task.Pod.Spec.NodeName = hostname
   243  	task.PodVolumes = podVolumes
   244  
   245  	// Only update status in session
   246  	job, found := s.ssn.Jobs[task.Job]
   247  	if found {
   248  		if err := job.UpdateTaskStatus(task, api.Allocated); err != nil {
   249  			klog.Errorf("Failed to update task <%v/%v> status to %v when allocating in Session <%v>: %v",
   250  				task.Namespace, task.Name, api.Allocated, s.ssn.UID, err)
   251  			return err
   252  		}
   253  	} else {
   254  		klog.Errorf("Failed to find Job <%s> in Session <%s> index when allocating.",
   255  			task.Job, s.ssn.UID)
   256  		return fmt.Errorf("failed to find job %s", task.Job)
   257  	}
   258  
   259  	task.NodeName = hostname
   260  	if node, found := s.ssn.Nodes[hostname]; found {
   261  		if err := node.AddTask(task); err != nil {
   262  			klog.Errorf("Failed to add task <%v/%v> to node <%v> when allocating in Session <%v>: %v",
   263  				task.Namespace, task.Name, hostname, s.ssn.UID, err)
   264  			return err
   265  		}
   266  		klog.V(3).Infof("After allocated Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
   267  			task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
   268  	} else {
   269  		klog.Errorf("Failed to find Node <%s> in Session <%s> index when allocating.",
   270  			hostname, s.ssn.UID)
   271  		return fmt.Errorf("failed to find node %s", hostname)
   272  	}
   273  
   274  	// Callbacks
   275  	for _, eh := range s.ssn.eventHandlers {
   276  		if eh.AllocateFunc != nil {
   277  			eh.AllocateFunc(&Event{
   278  				Task: task,
   279  			})
   280  		}
   281  	}
   282  
   283  	// Update status in session
   284  	klog.V(3).Info("Allocating operations ...")
   285  	s.operations = append(s.operations, operation{
   286  		name: Allocate,
   287  		task: task,
   288  	})
   289  
   290  	return nil
   291  }
   292  
   293  func (s *Statement) allocate(task *api.TaskInfo) error {
   294  	if err := s.ssn.cache.AddBindTask(task); err != nil {
   295  		return err
   296  	}
   297  
   298  	if job, found := s.ssn.Jobs[task.Job]; found {
   299  		if err := job.UpdateTaskStatus(task, api.Binding); err != nil {
   300  			klog.Errorf("Failed to update task <%v/%v> status to %v when binding in Session <%v>: %v",
   301  				task.Namespace, task.Name, api.Binding, s.ssn.UID, err)
   302  			return err
   303  		}
   304  	} else {
   305  		klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
   306  			task.Job, s.ssn.UID)
   307  		return fmt.Errorf("failed to find job %s", task.Job)
   308  	}
   309  
   310  	metrics.UpdateTaskScheduleDuration(metrics.Duration(task.Pod.CreationTimestamp.Time))
   311  	return nil
   312  }
   313  
   314  // unallocate the pod for task
   315  func (s *Statement) unallocate(task *api.TaskInfo) error {
   316  	s.ssn.cache.RevertVolumes(task, task.PodVolumes)
   317  
   318  	// Update status in session
   319  	job, found := s.ssn.Jobs[task.Job]
   320  	if found {
   321  		if err := job.UpdateTaskStatus(task, api.Pending); err != nil {
   322  			klog.Errorf("Failed to update task <%v/%v> status to %v when unallocating in Session <%v>: %v",
   323  				task.Namespace, task.Name, api.Pending, s.ssn.UID, err)
   324  		}
   325  	} else {
   326  		klog.Errorf("Failed to find Job <%s> in Session <%s> index when unallocating.",
   327  			task.Job, s.ssn.UID)
   328  	}
   329  
   330  	if node, found := s.ssn.Nodes[task.NodeName]; found {
   331  		klog.V(3).Infof("Remove Task <%v> on node <%v>", task.Name, task.NodeName)
   332  		err := node.RemoveTask(task)
   333  		if err != nil {
   334  			klog.Errorf("Failed to remove Task <%v> on node <%v> when unallocating: %s", task.Name, task.NodeName, err.Error())
   335  		}
   336  	}
   337  
   338  	for _, eh := range s.ssn.eventHandlers {
   339  		if eh.DeallocateFunc != nil {
   340  			eh.DeallocateFunc(&Event{
   341  				Task: task,
   342  			})
   343  		}
   344  	}
   345  	task.NodeName = ""
   346  
   347  	return nil
   348  }
   349  
   350  // Discard operation for evict, pipeline and allocate
   351  func (s *Statement) Discard() {
   352  	klog.V(3).Info("Discarding operations ...")
   353  	for i := len(s.operations) - 1; i >= 0; i-- {
   354  		op := s.operations[i]
   355  		op.task.GenerateLastTxContext()
   356  		switch op.name {
   357  		case Evict:
   358  			err := s.unevict(op.task)
   359  			if err != nil {
   360  				klog.Errorf("Failed to unevict task: %s", err.Error())
   361  			}
   362  		case Pipeline:
   363  			err := s.UnPipeline(op.task)
   364  			if err != nil {
   365  				klog.Errorf("Failed to unpipeline task: %s", err.Error())
   366  			}
   367  		case Allocate:
   368  			err := s.unallocate(op.task)
   369  			if err != nil {
   370  				klog.Errorf("Failed to unallocate task: %s", err.Error())
   371  			}
   372  		}
   373  	}
   374  }
   375  
   376  // Commit operation for evict and pipeline
   377  func (s *Statement) Commit() {
   378  	klog.V(3).Info("Committing operations ...")
   379  	for _, op := range s.operations {
   380  		op.task.ClearLastTxContext()
   381  		switch op.name {
   382  		case Evict:
   383  			err := s.evict(op.task, op.reason)
   384  			if err != nil {
   385  				klog.Errorf("Failed to evict task: %s", err.Error())
   386  			}
   387  		case Pipeline:
   388  			s.pipeline(op.task)
   389  		case Allocate:
   390  			err := s.allocate(op.task)
   391  			if err != nil {
   392  				if e := s.unallocate(op.task); e != nil {
   393  					klog.Errorf("Failed to unallocate task <%v/%v>: %v.", op.task.Namespace, op.task.Name, e)
   394  				}
   395  				klog.Errorf("Failed to allocate task <%v/%v>: %v.", op.task.Namespace, op.task.Name, err)
   396  			}
   397  		}
   398  	}
   399  }