github.com/mattermosttest/mattermost-server/v5@v5.0.0-20200917143240-9dfa12e121f9/plugin/health_check.go (about)

     1  // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
     2  // See LICENSE.txt for license information.
     3  
     4  package plugin
     5  
     6  import (
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/mattermost/mattermost-server/v5/mlog"
    11  	"github.com/mattermost/mattermost-server/v5/model"
    12  )
    13  
    14  const (
    15  	HEALTH_CHECK_INTERVAL            = 30 * time.Second // How often the health check should run
    16  	HEALTH_CHECK_DEACTIVATION_WINDOW = 60 * time.Minute // How long we wait for num fails to occur before deactivating the plugin
    17  	HEALTH_CHECK_PING_FAIL_LIMIT     = 3                // How many times we call RPC ping in a row before it is considered a failure
    18  	HEALTH_CHECK_NUM_RESTARTS_LIMIT  = 3                // How many times we restart a plugin before we deactivate it
    19  )
    20  
    21  type PluginHealthCheckJob struct {
    22  	cancel            chan struct{}
    23  	cancelled         chan struct{}
    24  	cancelOnce        sync.Once
    25  	env               *Environment
    26  	failureTimestamps sync.Map
    27  }
    28  
    29  // run continuously performs health checks on all active plugins, on a timer.
    30  func (job *PluginHealthCheckJob) run() {
    31  	mlog.Debug("Plugin health check job starting.")
    32  	defer close(job.cancelled)
    33  
    34  	ticker := time.NewTicker(HEALTH_CHECK_INTERVAL)
    35  	defer ticker.Stop()
    36  
    37  	for {
    38  		select {
    39  		case <-ticker.C:
    40  			activePlugins := job.env.Active()
    41  			for _, plugin := range activePlugins {
    42  				job.CheckPlugin(plugin.Manifest.Id)
    43  			}
    44  		case <-job.cancel:
    45  			return
    46  		}
    47  	}
    48  }
    49  
    50  // CheckPlugin determines the plugin's health status, then handles the error or success case.
    51  // If the plugin passes the health check, do nothing.
    52  // If the plugin fails the health check, the function either restarts or deactivates the plugin, based on the quantity and frequency of its failures.
    53  func (job *PluginHealthCheckJob) CheckPlugin(id string) {
    54  	err := job.env.performHealthCheck(id)
    55  	if err == nil {
    56  		return
    57  	}
    58  
    59  	mlog.Error("Health check failed for plugin", mlog.String("id", id), mlog.Err(err))
    60  	timestamps := job.getStoredTimestamps(id)
    61  	timestamps = append(timestamps, time.Now())
    62  
    63  	if shouldDeactivatePlugin(timestamps) {
    64  		// Order matters here, must deactivate first and then set plugin state
    65  		mlog.Debug("Deactivating plugin due to multiple crashes", mlog.String("id", id))
    66  		job.env.Deactivate(id)
    67  
    68  		// Reset timestamp state for this plugin
    69  		job.failureTimestamps.Delete(id)
    70  		job.env.setPluginState(id, model.PluginStateFailedToStayRunning)
    71  	} else {
    72  		mlog.Debug("Restarting plugin due to failed health check", mlog.String("id", id))
    73  		if err := job.env.RestartPlugin(id); err != nil {
    74  			mlog.Error("Failed to restart plugin", mlog.String("id", id), mlog.Err(err))
    75  		}
    76  
    77  		// Store this failure so we can continue to monitor the plugin
    78  		job.failureTimestamps.Store(id, removeStaleTimestamps(timestamps))
    79  	}
    80  }
    81  
    82  // getStoredTimestamps returns the stored failure timestamps for a plugin.
    83  func (job *PluginHealthCheckJob) getStoredTimestamps(id string) []time.Time {
    84  	timestamps, ok := job.failureTimestamps.Load(id)
    85  	if !ok {
    86  		timestamps = []time.Time{}
    87  	}
    88  	return timestamps.([]time.Time)
    89  }
    90  
    91  func newPluginHealthCheckJob(env *Environment) *PluginHealthCheckJob {
    92  	return &PluginHealthCheckJob{
    93  		cancel:    make(chan struct{}),
    94  		cancelled: make(chan struct{}),
    95  		env:       env,
    96  	}
    97  }
    98  
    99  func (job *PluginHealthCheckJob) Cancel() {
   100  	job.cancelOnce.Do(func() {
   101  		close(job.cancel)
   102  	})
   103  	<-job.cancelled
   104  }
   105  
   106  // shouldDeactivatePlugin determines if a plugin needs to be deactivated after the plugin has failed (HEALTH_CHECK_NUM_RESTARTS_LIMIT) times,
   107  // within the configured time window (HEALTH_CHECK_DEACTIVATION_WINDOW).
   108  func shouldDeactivatePlugin(failedTimestamps []time.Time) bool {
   109  	if len(failedTimestamps) < HEALTH_CHECK_NUM_RESTARTS_LIMIT {
   110  		return false
   111  	}
   112  
   113  	index := len(failedTimestamps) - HEALTH_CHECK_NUM_RESTARTS_LIMIT
   114  	return time.Since(failedTimestamps[index]) <= HEALTH_CHECK_DEACTIVATION_WINDOW
   115  }
   116  
   117  // removeStaleTimestamps only keeps the last HEALTH_CHECK_NUM_RESTARTS_LIMIT items in timestamps.
   118  func removeStaleTimestamps(timestamps []time.Time) []time.Time {
   119  	if len(timestamps) > HEALTH_CHECK_NUM_RESTARTS_LIMIT {
   120  		timestamps = timestamps[len(timestamps)-HEALTH_CHECK_NUM_RESTARTS_LIMIT:]
   121  	}
   122  
   123  	return timestamps
   124  }