github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/client/alloc_runner_health_watcher.go

package client

import (
	"context"
	"time"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// consulCheckLookupInterval is the interval at which we poll Consul to
	// see whether the allocation's checks are healthy or unhealthy.
	consulCheckLookupInterval = 500 * time.Millisecond
)

// watchHealth is responsible for watching an allocation's task status and
// potentially consul health check status to determine if the allocation is
// healthy or unhealthy.
func (r *AllocRunner) watchHealth(ctx context.Context) {
	// See if we should watch the alloc's health
	alloc := r.Alloc()
	if alloc.DeploymentID == "" {
		r.logger.Printf("[TRACE] client.alloc_watcher: exiting because alloc isn't part of a deployment")
		return
	} else if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() {
		r.logger.Printf("[TRACE] client.alloc_watcher: exiting because alloc deployment health already determined")
		return
	}

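	// Look up the task group to get the update strategy and the services
	// and checks the allocation is expected to run.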
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher")
		return
	}

	// desiredChecks is the number of Consul checks the allocation's task
	// group is expected to register; it stays zero when Consul checks
	// aren't being watched.
	desiredChecks := 0
	var checkTicker *time.Ticker
	var checkCh <-chan time.Time
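	// A receive from a nil channel blocks forever, so leaving checkCh nil
	// means the check polling case in the select below never fires unless
	// Consul checks are being watched.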

	u := tg.Update
	switch {
	case u == nil:
		r.logger.Printf("[TRACE] client.alloc_watcher: no update block for alloc %q. exiting", alloc.ID)
		return
	case u.HealthCheck == structs.UpdateStrategyHealthCheck_Manual:
		r.logger.Printf("[TRACE] client.alloc_watcher: update block has manual checks for alloc %q. exiting", alloc.ID)
		return
	case u.HealthCheck == structs.UpdateStrategyHealthCheck_Checks:
		for _, task := range tg.Tasks {
			for _, s := range task.Services {
				desiredChecks += len(s.Checks)
			}
		}

		checkTicker = time.NewTicker(consulCheckLookupInterval)
		checkCh = checkTicker.C
	}

	// Get a listener so we know when an allocation is updated.
	l := r.allocBroadcast.Listen()

	// Create a deadline timer for the alloc to be considered healthy.
	r.logger.Printf("[DEBUG] client.alloc_watcher: deadline (%v) for alloc %q is at %v", u.HealthyDeadline, alloc.ID, time.Now().Add(u.HealthyDeadline))
	deadline := time.NewTimer(u.HealthyDeadline)

	// Create a healthy timer
	latestTaskHealthy := time.Unix(0, 0)
	latestChecksHealthy := time.Unix(0, 0)
	healthyTimer := time.NewTimer(0)
	healthyTime := time.Time{}
	cancelHealthyTimer := func() {
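		// Stop reports false if the timer has already fired; drain its
		// channel non-blockingly so a later Reset starts from a clean state.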
		if !healthyTimer.Stop() {
			select {
			case <-healthyTimer.C:
			default:
			}
		}
	}
	cancelHealthyTimer()

	// Cleanup function. The timer channels must be drained non-blockingly:
	// if a timer fired and its value was already received in the select
	// loop below, a bare receive here would block forever.
	defer func() {
		if !deadline.Stop() {
			select {
			case <-deadline.C:
			default:
			}
		}
		cancelHealthyTimer()
		if checkTicker != nil {
			checkTicker.Stop()
		}
		l.Close()
	}()

	// setHealth records the alloc's deployment health on the runner and
	// syncs the updated status.
	setHealth := func(h bool) {
		r.allocLock.Lock()
		r.allocHealth = helper.BoolToPtr(h)
		r.allocLock.Unlock()
		r.syncStatus()
	}

	// consulChecksErr tracks whether the last Consul checks lookup failed,
	// so the error is only logged once per failure streak.
	consulChecksErr := false

	var checks []*api.AgentCheck
	first := true
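	// The first pass skips the select below so health is evaluated
	// immediately against the initial alloc instead of waiting for an event.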
OUTER:
	for {
		if !first {
			select {
			case <-ctx.Done():
				return
			case newAlloc, ok := <-l.Ch:
				if !ok {
					return
				}

				alloc = newAlloc
				r.logger.Printf("[TRACE] client.alloc_watcher: new alloc version for %q", alloc.ID)
			case <-checkCh:
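				// Poll Consul for the current status of the allocation's checks.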
				newChecks, err := r.consulClient.Checks(alloc)
				if err != nil {
					if !consulChecksErr {
						consulChecksErr = true
						r.logger.Printf("[WARN] client.alloc_watcher: failed to lookup consul checks for allocation %q: %v", alloc.ID, err)
					}
				} else {
					consulChecksErr = false
					checks = newChecks
				}
			case <-deadline.C:
				// We have exceeded our deadline without being healthy.
				r.logger.Printf("[TRACE] client.alloc_watcher: alloc %q hit healthy deadline", alloc.ID)
				setHealth(false)
				return
			case <-healthyTimer.C:
				r.logger.Printf("[TRACE] client.alloc_watcher: alloc %q is healthy", alloc.ID)
				setHealth(true)
				return
			}
		}
		first = false

		// If the alloc is being stopped by the server, just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			r.logger.Printf("[TRACE] client.alloc_watcher: desired status terminal for alloc %q", alloc.ID)
			return
		}

		// If the alloc is marked as failed by the client, set the status to
		// unhealthy
		if alloc.ClientStatus == structs.AllocClientStatusFailed {
			r.logger.Printf("[TRACE] client.alloc_watcher: client status failed for alloc %q", alloc.ID)
			setHealth(false)
			return
		}

		if len(alloc.TaskStates) != len(tg.Tasks) {
			r.logger.Printf("[TRACE] client.alloc_watcher: not all task runners have started")
			continue OUTER
		}

		// If any task has failed, already finished, or restarted, the alloc
		// is unhealthy
		for _, tstate := range alloc.TaskStates {
			if tstate.Failed || !tstate.FinishedAt.IsZero() || tstate.Restarts != 0 {
				r.logger.Printf("[TRACE] client.alloc_watcher: setting health to false for alloc %q", alloc.ID)
				setHealth(false)
				return
			}
		}

		// If not all of the desired checks have been registered yet, cancel
		// any pending healthy timer and wait. healthyTime is reset so the
		// timer is re-armed once the checks reappear, even if the healthy
		// timestamps are unchanged.
		if len(checks) != desiredChecks {
			r.logger.Printf("[TRACE] client.alloc_watcher: continuing since all checks (want %d; got %d) haven't been registered for alloc %q", desiredChecks, len(checks), alloc.ID)
			cancelHealthyTimer()
			healthyTime = time.Time{}
			continue OUTER
		}

		// Check if all the checks are passing
		for _, check := range checks {
			if check.Status != api.HealthPassing {
				r.logger.Printf("[TRACE] client.alloc_watcher: continuing since check %q isn't passing for alloc %q", check.CheckID, alloc.ID)
				latestChecksHealthy = time.Time{}
				cancelHealthyTimer()
				continue OUTER
			}
		}
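		// All checks are passing; record the first time this was observed.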
		if latestChecksHealthy.IsZero() {
			latestChecksHealthy = time.Now()
		}

		// Ensure every task is running, and track the latest task start time
		for task, tstate := range alloc.TaskStates {
			if tstate.State != structs.TaskStateRunning {
				r.logger.Printf("[TRACE] client.alloc_watcher: continuing since task %q hasn't started for alloc %q", task, alloc.ID)
				continue OUTER
			}

			if tstate.StartedAt.After(latestTaskHealthy) {
				latestTaskHealthy = tstate.StartedAt
			}
		}

		// Determine when we can mark ourselves as healthy.
		totalHealthy := latestTaskHealthy
		if totalHealthy.Before(latestChecksHealthy) {
			totalHealthy = latestChecksHealthy
		}

		// Nothing to do since we are already waiting for the healthy timer to
		// fire at the same time.
		if totalHealthy.Equal(healthyTime) {
			continue OUTER
		}

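		// (Re)arm the healthy timer to fire once the latest healthy
		// timestamp has been stable for MinHealthyTime.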
		healthyTime = totalHealthy
		cancelHealthyTimer()
		d := time.Until(totalHealthy.Add(u.MinHealthyTime))
		healthyTimer.Reset(d)
		r.logger.Printf("[TRACE] client.alloc_watcher: setting healthy timer to %v for alloc %q", d, alloc.ID)
	}
}