github.com/zhuohuang-hust/src-cbuild@v0.0.0-20230105071821-c7aab3e7c840/mergeCode/containerd/supervisor/supervisor.go (about)

     1  package supervisor
     2  
     3  import (
     4  	"encoding/json"
     5  	"io"
     6  	"io/ioutil"
     7  	"os"
     8  	"path/filepath"
     9  	"sync"
    10  	"time"
    11  
    12      "log"
    13  
    14  	"github.com/Sirupsen/logrus"
    15  	"github.com/docker/containerd/runtime"
    16  )
    17  
const (
	// defaultBufferSize is the channel capacity used in this file for the
	// supervisor's task queue (Supervisor.tasks) and for every event
	// channel handed out by Events.
	defaultBufferSize = 2048 // size of queue in eventloop
)
    21  
    22  // New returns an initialized Process supervisor.
    23  func New(stateDir string, runtimeName, shimName string, runtimeArgs []string, timeout time.Duration, retainCount int) (*Supervisor, error) {
    24  	startTasks := make(chan *startTask, 10)
    25  	if err := os.MkdirAll(stateDir, 0755); err != nil {
    26  		return nil, err
    27  	}
    28  	machine, err := CollectMachineInformation()
    29  	if err != nil {
    30  		return nil, err
    31  	}
    32  	monitor, err := NewMonitor()
    33  	if err != nil {
    34  		return nil, err
    35  	}
    36  	s := &Supervisor{
    37  		stateDir:          stateDir,
    38  		containers:        make(map[string]*containerInfo),
    39  		startTasks:        startTasks,
    40  		machine:           machine,
    41  		subscribers:       make(map[chan Event]struct{}),
    42  		tasks:             make(chan Task, defaultBufferSize),
    43  		monitor:           monitor,
    44  		runtime:           runtimeName,
    45  		runtimeArgs:       runtimeArgs,
    46  		shim:              shimName,
    47  		timeout:           timeout,
    48  		containerExecSync: make(map[string]map[string]chan struct{}),
    49  	}
    50  	if err := setupEventLog(s, retainCount); err != nil {
    51  		return nil, err
    52  	}
    53  	go s.exitHandler()
    54  	go s.oomHandler()
    55  	if err := s.restore(); err != nil {
    56  		return nil, err
    57  	}
    58  	return s, nil
    59  }
    60  
// containerInfo is the value stored in Supervisor.containers for each
// container ID; it currently only wraps the runtime container handle.
type containerInfo struct {
	container runtime.Container
}
    64  
    65  func setupEventLog(s *Supervisor, retainCount int) error {
    66  	if err := readEventLog(s); err != nil {
    67  		return err
    68  	}
    69  	logrus.WithField("count", len(s.eventLog)).Debug("containerd: read past events")
    70  	events := s.Events(time.Time{}, false, "")
    71  	return eventLogger(s, filepath.Join(s.stateDir, "events.log"), events, retainCount)
    72  }
    73  
// eventLogger opens the journal file at path and starts a goroutine that
// consumes events until the channel is closed. Every received event is
// appended to s.eventLog (under s.eventLock) and JSON-encoded to the file.
//
// When retainCount > 0 and more than retainCount events have been counted,
// the in-memory log is cut down to its most recent retainCount-1 entries and
// the file is reopened (truncated) and rewritten from that in-memory log
// before the current event is handled.
//
// The only error returned is a failure of the initial open; errors inside the
// goroutine are logged.
//
// NOTE(review): the initial open uses O_TRUNC, so the file is emptied here
// even though setupEventLog has just loaded its contents via readEventLog;
// those earlier events are only written back to disk when a retain-count
// truncation happens. Confirm this is intended.
// NOTE(review): the file is not closed when the events channel is closed.
func eventLogger(s *Supervisor, path string, events chan Event, retainCount int) error {
	f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_APPEND|os.O_TRUNC, 0755)
	if err != nil {
		return err
	}
	go func() {
		var (
			// count starts at the number of events already held in memory
			// (read here without holding eventLock).
			count = len(s.eventLog)
			enc   = json.NewEncoder(f)
		)
		for e := range events {
			// if we have a specified retain count make sure the truncate the event
			// log if it grows past the specified number of events to keep.
			if retainCount > 0 {
				if count > retainCount {
					logrus.Debug("truncating event log")
					// close the log file
					// (f is nil here if a previous reopen attempt failed)
					if f != nil {
						f.Close()
					}
					// keep retainCount-1 entries, or everything if the log
					// is shorter than that
					slice := retainCount - 1
					l := len(s.eventLog)
					if slice >= l {
						slice = l
					}
					s.eventLock.Lock()
					s.eventLog = s.eventLog[len(s.eventLog)-slice:]
					s.eventLock.Unlock()
					if f, err = os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_APPEND|os.O_TRUNC, 0755); err != nil {
						logrus.WithField("error", err).Error("containerd: open event to journal")
						// NOTE(review): this skips the current event entirely;
						// it is neither appended to s.eventLog nor written.
						// count is unchanged, so the reopen is retried on the
						// next event.
						continue
					}
					enc = json.NewEncoder(f)
					count = 0
					// rewrite the retained events into the fresh file
					for _, le := range s.eventLog {
						if err := enc.Encode(le); err != nil {
							logrus.WithField("error", err).Error("containerd: write event to journal")
						}
					}
				}
			}
			s.eventLock.Lock()
			s.eventLog = append(s.eventLog, e)
			s.eventLock.Unlock()
			count++
			if err := enc.Encode(e); err != nil {
				logrus.WithField("error", err).Error("containerd: write event to journal")
			}
		}
	}()
	return nil
}
   126  
   127  func readEventLog(s *Supervisor) error {
   128  	f, err := os.Open(filepath.Join(s.stateDir, "events.log"))
   129  	if err != nil {
   130  		if os.IsNotExist(err) {
   131  			return nil
   132  		}
   133  		return err
   134  	}
   135  	defer f.Close()
   136  	dec := json.NewDecoder(f)
   137  	for {
   138  		var e eventV1
   139  		if err := dec.Decode(&e); err != nil {
   140  			if err == io.EOF {
   141  				break
   142  			}
   143  			return err
   144  		}
   145  
   146  		// We need to take care of -1 Status for backward compatibility
   147  		ev := e.Event
   148  		ev.Status = uint32(e.Status)
   149  		if ev.Status > runtime.UnknownStatus {
   150  			ev.Status = runtime.UnknownStatus
   151  		}
   152  		s.eventLog = append(s.eventLog, ev)
   153  	}
   154  	return nil
   155  }
   156  
// Supervisor represents a container supervisor
type Supervisor struct {
	// stateDir is the directory on the system to store container runtime state information.
	stateDir string
	// name of the OCI compatible runtime used to execute containers
	runtime     string
	runtimeArgs []string
	// shim is handed to runtime.Load when containers are restored
	shim string
	// containers holds the known containers keyed by container ID
	containers map[string]*containerInfo
	// startTasks is closed by Stop
	startTasks chan *startTask
	// we need a lock around the subscribers map only because additions and deletions from
	// the map are via the API so we cannot really control the concurrency
	subscriberLock sync.RWMutex
	subscribers    map[chan Event]struct{}
	// machine is the information collected in New and returned by Machine
	machine Machine
	// tasks is the queue filled by SendTask and drained by the goroutine
	// launched in Start
	tasks chan Task
	// monitor supplies the exit and OOM notifications that exitHandler and
	// oomHandler turn into tasks
	monitor *Monitor
	// eventLog is the in-memory list of events; it is filled by readEventLog
	// and appended to by the eventLogger goroutine
	eventLog []Event
	// eventLock is held when eventLog is replaced, appended to, or
	// snapshotted for replay
	eventLock sync.Mutex
	// timeout is handed to runtime.Load when containers are restored
	timeout time.Duration
	// This is used to ensure that exec process death events are sent
	// before the init process death
	containerExecSyncLock sync.Mutex
	// containerExecSync maps container ID -> pid -> channel
	containerExecSync map[string]map[string]chan struct{}
}
   182  
// Stop closes the startTasks channel so that no new containers get started,
// and returns immediately.
//
// NOTE(review): earlier documentation stated that Stop also sends SIGTERM to
// each container's pid1 and waits for them to terminate; nothing in this
// method does that — confirm whether it is handled elsewhere.
// Calling Stop a second time panics, because closing an already closed
// channel panics.
func (s *Supervisor) Stop() {
	// Close the startTasks channel so that no new containers get started
	close(s.startTasks)
}
   190  
// Close is meant to close any open files in the supervisor and expects that
// Stop has been called so that no more containers are started.
// It currently does nothing and always returns nil.
func (s *Supervisor) Close() error {
	return nil
}
   196  
// Event represents a container event. It is the record that is delivered to
// subscribers, kept in the supervisor's in-memory event log and JSON-encoded
// into the events.log journal.
type Event struct {
	// ID is matched against the id filter passed to Events.
	ID   string `json:"id"`
	Type string `json:"type"`
	// Timestamp is compared against the "from" time when events are replayed.
	Timestamp time.Time `json:"timestamp"`
	PID       string    `json:"pid,omitempty"`
	Status    uint32    `json:"status,omitempty"`
}
   205  
// eventV1 is the shape used by readEventLog to decode journal entries.
// Its own signed Status field shares the "status" JSON key with the embedded
// Event.Status and, being less deeply nested, is the one encoding/json
// fills in; this lets entries carrying a -1 status be decoded before
// readEventLog converts the value to the unsigned Event.Status.
type eventV1 struct {
	Event
	Status int `json:"status,omitempty"`
}
   210  
   211  // Events returns an event channel that external consumers can use to receive updates
   212  // on container events
   213  func (s *Supervisor) Events(from time.Time, storedOnly bool, id string) chan Event {
   214  	c := make(chan Event, defaultBufferSize)
   215  	if storedOnly {
   216  		defer s.Unsubscribe(c)
   217  	}
   218  	s.subscriberLock.Lock()
   219  	defer s.subscriberLock.Unlock()
   220  	if !from.IsZero() {
   221  		// replay old event
   222  		s.eventLock.Lock()
   223  		past := s.eventLog[:]
   224  		s.eventLock.Unlock()
   225  		for _, e := range past {
   226  			if e.Timestamp.After(from) {
   227  				if id == "" || e.ID == id {
   228  					c <- e
   229  				}
   230  			}
   231  		}
   232  	}
   233  	if storedOnly {
   234  		close(c)
   235  	} else {
   236  		EventSubscriberCounter.Inc(1)
   237  		s.subscribers[c] = struct{}{}
   238  	}
   239  	return c
   240  }
   241  
   242  // Unsubscribe removes the provided channel from receiving any more events
   243  func (s *Supervisor) Unsubscribe(sub chan Event) {
   244  	s.subscriberLock.Lock()
   245  	defer s.subscriberLock.Unlock()
   246  	if _, ok := s.subscribers[sub]; ok {
   247  		delete(s.subscribers, sub)
   248  		close(sub)
   249  		EventSubscriberCounter.Dec(1)
   250  	}
   251  }
   252  
   253  // notifySubscribers will send the provided event to the external subscribers
   254  // of the events channel
   255  func (s *Supervisor) notifySubscribers(e Event) {
   256  	s.subscriberLock.RLock()
   257  	defer s.subscriberLock.RUnlock()
   258  	for sub := range s.subscribers {
   259  		// do a non-blocking send for the channel
   260  		select {
   261  		case sub <- e:
   262  		default:
   263  			logrus.WithField("event", e.Type).Warn("containerd: event not sent to subscriber")
   264  		}
   265  	}
   266  }
   267  
   268  // Start is a non-blocking call that runs the supervisor for monitoring contianer processes and
   269  // executing new containers.
   270  //
   271  // This event loop is the only thing that is allowed to modify state of containers and processes
   272  // therefore it is save to do operations in the handlers that modify state of the system or
   273  // state of the Supervisor
   274  func (s *Supervisor) Start() error {
   275  	logrus.WithFields(logrus.Fields{
   276  		"stateDir":    s.stateDir,
   277  		"runtime":     s.runtime,
   278  		"runtimeArgs": s.runtimeArgs,
   279  		"memory":      s.machine.Memory,
   280  		"cpus":        s.machine.Cpus,
   281  	}).Debug("containerd: supervisor running")
   282  	go func() {
   283  		for i := range s.tasks {
   284  			s.handleTask(i)
   285  		}
   286  	}()
   287  	return nil
   288  }
   289  
// Machine returns the machine information for the host on which the
// supervisor is executing, as stored on the Supervisor at construction.
func (s *Supervisor) Machine() Machine {
	return s.machine
}
   295  
// SendTask sends the provided task to the supervisor's main event loop.
// It increments TasksCounter and then queues evt on s.tasks; the send blocks
// while that channel's buffer is full.
func (s *Supervisor) SendTask(evt Task) {
	TasksCounter.Inc(1)
	s.tasks <- evt
}
   301  
   302  func (s *Supervisor) exitHandler() {
   303  	for p := range s.monitor.Exits() {
   304  		e := &ExitTask{
   305  			Process: p,
   306  		}
   307  		s.SendTask(e)
   308  	}
   309  }
   310  
   311  func (s *Supervisor) oomHandler() {
   312  	for id := range s.monitor.OOMs() {
   313  		e := &OOMTask{
   314  			ID: id,
   315  		}
   316  		s.SendTask(e)
   317  	}
   318  }
   319  
// monitorProcess registers p with the supervisor's monitor and returns the
// monitor's error, if any.
func (s *Supervisor) monitorProcess(p runtime.Process) error {
	return s.monitor.Monitor(p)
}
   323  
   324  func (s *Supervisor) restore() error {
   325  	dirs, err := ioutil.ReadDir(s.stateDir)
   326  	if err != nil {
   327  		return err
   328  	}
   329  	for _, d := range dirs {
   330  		if !d.IsDir() {
   331  			continue
   332  		}
   333  		id := d.Name()
   334  		container, err := runtime.Load(s.stateDir, id, s.shim, s.timeout)
   335  		if err != nil {
   336  			return err
   337  		}
   338  		processes, err := container.Processes()
   339  		if err != nil {
   340  			return err
   341  		}
   342  
   343  		ContainersCounter.Inc(1)
   344  		s.containers[id] = &containerInfo{
   345  			container: container,
   346  		}
   347          logPrintSupervisor("supervisor")
   348  		if err := s.monitor.MonitorOOM(container); err != nil && err != runtime.ErrContainerExited {
   349  			logrus.WithField("error", err).Error("containerd: notify OOM events")
   350  		}
   351  
   352  		s.newExecSyncMap(container.ID())
   353  
   354  		logrus.WithField("id", id).Debug("containerd: container restored")
   355  		var exitedProcesses []runtime.Process
   356  		for _, p := range processes {
   357  			if p.State() == runtime.Running {
   358  				if err := s.monitorProcess(p); err != nil {
   359  					return err
   360  				}
   361  			} else {
   362  				exitedProcesses = append(exitedProcesses, p)
   363  			}
   364  		}
   365  		if len(exitedProcesses) > 0 {
   366  			// sort processes so that init is fired last because that is how the kernel sends the
   367  			// exit events
   368  			sortProcesses(exitedProcesses)
   369  			for _, p := range exitedProcesses {
   370  				e := &ExitTask{
   371  					Process: p,
   372  				}
   373  				s.SendTask(e)
   374  			}
   375  		}
   376  	}
   377  	return nil
   378  }
   379  
   380  func (s *Supervisor) handleTask(i Task) {
   381  	var err error
   382  	switch t := i.(type) {
   383  	case *AddProcessTask:
   384  		err = s.addProcess(t)
   385  	case *CreateCheckpointTask:
   386  		err = s.createCheckpoint(t)
   387  	case *DeleteCheckpointTask:
   388  		err = s.deleteCheckpoint(t)
   389  	case *StartTask:
   390  		err = s.start(t)
   391  	case *DeleteTask:
   392  		err = s.delete(t)
   393  	case *ExitTask:
   394  		err = s.exit(t)
   395  	case *GetContainersTask:
   396  		err = s.getContainers(t)
   397  	case *SignalTask:
   398  		err = s.signal(t)
   399  	case *StatsTask:
   400  		err = s.stats(t)
   401  	case *UpdateTask:
   402  		err = s.updateContainer(t)
   403  	case *UpdateProcessTask:
   404  		err = s.updateProcess(t)
   405  	case *OOMTask:
   406  		err = s.oom(t)
   407  	default:
   408  		err = ErrUnknownTask
   409  	}
   410  	if err != errDeferredResponse {
   411  		i.ErrorCh() <- err
   412  		close(i.ErrorCh())
   413  	}
   414  }
   415  
   416  func (s *Supervisor) newExecSyncMap(containerID string) {
   417  	s.containerExecSyncLock.Lock()
   418  	s.containerExecSync[containerID] = make(map[string]chan struct{})
   419  	s.containerExecSyncLock.Unlock()
   420  }
   421  
   422  func (s *Supervisor) newExecSyncChannel(containerID, pid string) {
   423  	s.containerExecSyncLock.Lock()
   424  	s.containerExecSync[containerID][pid] = make(chan struct{})
   425  	s.containerExecSyncLock.Unlock()
   426  }
   427  
   428  func (s *Supervisor) getExecSyncChannel(containerID, pid string) chan struct{} {
   429  	s.containerExecSyncLock.Lock()
   430  	ch := s.containerExecSync[containerID][pid]
   431  	s.containerExecSyncLock.Unlock()
   432  	return ch
   433  }
   434  
   435  func (s *Supervisor) getDeleteExecSyncMap(containerID string) map[string]chan struct{} {
   436  	s.containerExecSyncLock.Lock()
   437  	chs := s.containerExecSync[containerID]
   438  	delete(s.containerExecSync, containerID)
   439  	s.containerExecSyncLock.Unlock()
   440  	return chs
   441  }
   442  
   443  
   444  func logPrintSupervisor(errStr string) {
   445      logFile, logError := os.Open("/home/vagrant/supervisorlogServer.md")
   446      if logError != nil {
   447          logFile, _ = os.Create("/home/vagrant/supervisorlogServer.md")
   448      }
   449      defer logFile.Close()
   450  
   451      debugLog := log.New(logFile, "[Debug]", log.Llongfile)
   452      debugLog.Println(errStr)
   453  }