github.com/mattermosttest/mattermost-server/v5@v5.0.0-20200917143240-9dfa12e121f9/plugin/health_check.go (about) 1 // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. 2 // See LICENSE.txt for license information. 3 4 package plugin 5 6 import ( 7 "sync" 8 "time" 9 10 "github.com/mattermost/mattermost-server/v5/mlog" 11 "github.com/mattermost/mattermost-server/v5/model" 12 ) 13 14 const ( 15 HEALTH_CHECK_INTERVAL = 30 * time.Second // How often the health check should run 16 HEALTH_CHECK_DEACTIVATION_WINDOW = 60 * time.Minute // How long we wait for num fails to occur before deactivating the plugin 17 HEALTH_CHECK_PING_FAIL_LIMIT = 3 // How many times we call RPC ping in a row before it is considered a failure 18 HEALTH_CHECK_NUM_RESTARTS_LIMIT = 3 // How many times we restart a plugin before we deactivate it 19 ) 20 21 type PluginHealthCheckJob struct { 22 cancel chan struct{} 23 cancelled chan struct{} 24 cancelOnce sync.Once 25 env *Environment 26 failureTimestamps sync.Map 27 } 28 29 // run continuously performs health checks on all active plugins, on a timer. 30 func (job *PluginHealthCheckJob) run() { 31 mlog.Debug("Plugin health check job starting.") 32 defer close(job.cancelled) 33 34 ticker := time.NewTicker(HEALTH_CHECK_INTERVAL) 35 defer ticker.Stop() 36 37 for { 38 select { 39 case <-ticker.C: 40 activePlugins := job.env.Active() 41 for _, plugin := range activePlugins { 42 job.CheckPlugin(plugin.Manifest.Id) 43 } 44 case <-job.cancel: 45 return 46 } 47 } 48 } 49 50 // CheckPlugin determines the plugin's health status, then handles the error or success case. 51 // If the plugin passes the health check, do nothing. 52 // If the plugin fails the health check, the function either restarts or deactivates the plugin, based on the quantity and frequency of its failures. 53 func (job *PluginHealthCheckJob) CheckPlugin(id string) { 54 err := job.env.performHealthCheck(id) 55 if err == nil { 56 return 57 } 58 59 mlog.Error("Health check failed for plugin", mlog.String("id", id), mlog.Err(err)) 60 timestamps := job.getStoredTimestamps(id) 61 timestamps = append(timestamps, time.Now()) 62 63 if shouldDeactivatePlugin(timestamps) { 64 // Order matters here, must deactivate first and then set plugin state 65 mlog.Debug("Deactivating plugin due to multiple crashes", mlog.String("id", id)) 66 job.env.Deactivate(id) 67 68 // Reset timestamp state for this plugin 69 job.failureTimestamps.Delete(id) 70 job.env.setPluginState(id, model.PluginStateFailedToStayRunning) 71 } else { 72 mlog.Debug("Restarting plugin due to failed health check", mlog.String("id", id)) 73 if err := job.env.RestartPlugin(id); err != nil { 74 mlog.Error("Failed to restart plugin", mlog.String("id", id), mlog.Err(err)) 75 } 76 77 // Store this failure so we can continue to monitor the plugin 78 job.failureTimestamps.Store(id, removeStaleTimestamps(timestamps)) 79 } 80 } 81 82 // getStoredTimestamps returns the stored failure timestamps for a plugin. 83 func (job *PluginHealthCheckJob) getStoredTimestamps(id string) []time.Time { 84 timestamps, ok := job.failureTimestamps.Load(id) 85 if !ok { 86 timestamps = []time.Time{} 87 } 88 return timestamps.([]time.Time) 89 } 90 91 func newPluginHealthCheckJob(env *Environment) *PluginHealthCheckJob { 92 return &PluginHealthCheckJob{ 93 cancel: make(chan struct{}), 94 cancelled: make(chan struct{}), 95 env: env, 96 } 97 } 98 99 func (job *PluginHealthCheckJob) Cancel() { 100 job.cancelOnce.Do(func() { 101 close(job.cancel) 102 }) 103 <-job.cancelled 104 } 105 106 // shouldDeactivatePlugin determines if a plugin needs to be deactivated after the plugin has failed (HEALTH_CHECK_NUM_RESTARTS_LIMIT) times, 107 // within the configured time window (HEALTH_CHECK_DEACTIVATION_WINDOW). 108 func shouldDeactivatePlugin(failedTimestamps []time.Time) bool { 109 if len(failedTimestamps) < HEALTH_CHECK_NUM_RESTARTS_LIMIT { 110 return false 111 } 112 113 index := len(failedTimestamps) - HEALTH_CHECK_NUM_RESTARTS_LIMIT 114 return time.Since(failedTimestamps[index]) <= HEALTH_CHECK_DEACTIVATION_WINDOW 115 } 116 117 // removeStaleTimestamps only keeps the last HEALTH_CHECK_NUM_RESTARTS_LIMIT items in timestamps. 118 func removeStaleTimestamps(timestamps []time.Time) []time.Time { 119 if len(timestamps) > HEALTH_CHECK_NUM_RESTARTS_LIMIT { 120 timestamps = timestamps[len(timestamps)-HEALTH_CHECK_NUM_RESTARTS_LIMIT:] 121 } 122 123 return timestamps 124 }