github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/stats_fetcher.go (about)

     1  package nomad
     2  
     3  import (
     4  	"context"
     5  	"log"
     6  	"sync"
     7  
     8  	"github.com/hashicorp/consul/agent/consul/autopilot"
     9  	"github.com/hashicorp/nomad/helper/pool"
    10  	"github.com/hashicorp/serf/serf"
    11  )
    12  
// StatsFetcher has two functions for autopilot. First, it lets us fetch all
// the stats in parallel so we are taking a sample as close to the same time as
// possible, since we are comparing time-sensitive info for the health check.
// Second, it bounds the time so that one slow RPC can't hold up the health
// check loop; as a side effect of how it implements this, it also limits to
// a single in-flight RPC to any given server, so goroutines don't accumulate
// as we run the health check fairly frequently.
type StatsFetcher struct {
	logger *log.Logger    // destination for warning logs on failed/slow fetches
	pool   *pool.ConnPool // connection pool used to issue the Status.RaftStats RPCs
	region string         // region passed through to every RPC call

	// inflight tracks, by server ID, the servers that currently have an
	// outstanding stats RPC; guarded by inflightLock. Fetch skips any
	// server already present here.
	inflight     map[string]struct{}
	inflightLock sync.Mutex
}
    27  
    28  // NewStatsFetcher returns a stats fetcher.
    29  func NewStatsFetcher(logger *log.Logger, pool *pool.ConnPool, region string) *StatsFetcher {
    30  	return &StatsFetcher{
    31  		logger:   logger,
    32  		pool:     pool,
    33  		region:   region,
    34  		inflight: make(map[string]struct{}),
    35  	}
    36  }
    37  
    38  // fetch does the RPC to fetch the server stats from a single server. We don't
    39  // cancel this when the context is canceled because we only want one in-flight
    40  // RPC to each server, so we let it finish and then clean up the in-flight
    41  // tracking.
    42  func (f *StatsFetcher) fetch(server *serverParts, replyCh chan *autopilot.ServerStats) {
    43  	var args struct{}
    44  	var reply autopilot.ServerStats
    45  	err := f.pool.RPC(f.region, server.Addr, server.MajorVersion, "Status.RaftStats", &args, &reply)
    46  	if err != nil {
    47  		f.logger.Printf("[WARN] nomad: error getting server health from %q: %v",
    48  			server.Name, err)
    49  	} else {
    50  		replyCh <- &reply
    51  	}
    52  
    53  	f.inflightLock.Lock()
    54  	delete(f.inflight, server.ID)
    55  	f.inflightLock.Unlock()
    56  }
    57  
    58  // Fetch will attempt to query all the servers in parallel.
    59  func (f *StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[string]*autopilot.ServerStats {
    60  	type workItem struct {
    61  		server  *serverParts
    62  		replyCh chan *autopilot.ServerStats
    63  	}
    64  	var servers []*serverParts
    65  	for _, s := range members {
    66  		if ok, parts := isNomadServer(s); ok {
    67  			servers = append(servers, parts)
    68  		}
    69  	}
    70  
    71  	// Skip any servers that have inflight requests.
    72  	var work []*workItem
    73  	f.inflightLock.Lock()
    74  	for _, server := range servers {
    75  		if _, ok := f.inflight[server.ID]; ok {
    76  			f.logger.Printf("[WARN] nomad: error getting server health from %q: last request still outstanding",
    77  				server.Name)
    78  		} else {
    79  			workItem := &workItem{
    80  				server:  server,
    81  				replyCh: make(chan *autopilot.ServerStats, 1),
    82  			}
    83  			work = append(work, workItem)
    84  			f.inflight[server.ID] = struct{}{}
    85  			go f.fetch(workItem.server, workItem.replyCh)
    86  		}
    87  	}
    88  	f.inflightLock.Unlock()
    89  
    90  	// Now wait for the results to come in, or for the context to be
    91  	// canceled.
    92  	replies := make(map[string]*autopilot.ServerStats)
    93  	for _, workItem := range work {
    94  		select {
    95  		case reply := <-workItem.replyCh:
    96  			replies[workItem.server.ID] = reply
    97  
    98  		case <-ctx.Done():
    99  			f.logger.Printf("[WARN] nomad: error getting server health from %q: %v",
   100  				workItem.server.Name, ctx.Err())
   101  		}
   102  	}
   103  	return replies
   104  }