github.com/manicqin/nomad@v0.9.5/nomad/stats_fetcher.go

github.com/manicqin/nomad@v0.9.5/nomad/stats_fetcher.go (about)

     1  package nomad
     2  
     3  import (
     4  	"context"
     5  	"sync"
     6  
     7  	log "github.com/hashicorp/go-hclog"
     8  
     9  	"github.com/hashicorp/consul/agent/consul/autopilot"
    10  	"github.com/hashicorp/nomad/helper/pool"
    11  	"github.com/hashicorp/serf/serf"
    12  )
    13  
    14  // StatsFetcher has two functions for autopilot. First, lets us fetch all the
    15  // stats in parallel so we are taking a sample as close to the same time as
    16  // possible, since we are comparing time-sensitive info for the health check.
    17  // Second, it bounds the time so that one slow RPC can't hold up the health
    18  // check loop; as a side effect of how it implements this, it also limits to
    19  // a single in-flight RPC to any given server, so goroutines don't accumulate
    20  // as we run the health check fairly frequently.
    21  type StatsFetcher struct {
    22  	logger       log.Logger
    23  	pool         *pool.ConnPool
    24  	region       string
    25  	inflight     map[string]struct{}
    26  	inflightLock sync.Mutex
    27  }
    28  
    29  // NewStatsFetcher returns a stats fetcher.
    30  func NewStatsFetcher(logger log.Logger, pool *pool.ConnPool, region string) *StatsFetcher {
    31  	return &StatsFetcher{
    32  		logger:   logger.Named("stats_fetcher"),
    33  		pool:     pool,
    34  		region:   region,
    35  		inflight: make(map[string]struct{}),
    36  	}
    37  }
    38  
    39  // fetch does the RPC to fetch the server stats from a single server. We don't
    40  // cancel this when the context is canceled because we only want one in-flight
    41  // RPC to each server, so we let it finish and then clean up the in-flight
    42  // tracking.
    43  func (f *StatsFetcher) fetch(server *serverParts, replyCh chan *autopilot.ServerStats) {
    44  	var args struct{}
    45  	var reply autopilot.ServerStats
    46  	err := f.pool.RPC(f.region, server.Addr, server.MajorVersion, "Status.RaftStats", &args, &reply)
    47  	if err != nil {
    48  		f.logger.Warn("failed retrieving server health", "server", server.Name, "error", err)
    49  	} else {
    50  		replyCh <- &reply
    51  	}
    52  
    53  	f.inflightLock.Lock()
    54  	delete(f.inflight, server.ID)
    55  	f.inflightLock.Unlock()
    56  }
    57  
    58  // Fetch will attempt to query all the servers in parallel.
    59  func (f *StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[string]*autopilot.ServerStats {
    60  	type workItem struct {
    61  		server  *serverParts
    62  		replyCh chan *autopilot.ServerStats
    63  	}
    64  	var servers []*serverParts
    65  	for _, s := range members {
    66  		if ok, parts := isNomadServer(s); ok {
    67  			servers = append(servers, parts)
    68  		}
    69  	}
    70  
    71  	// Skip any servers that have inflight requests.
    72  	var work []*workItem
    73  	f.inflightLock.Lock()
    74  	for _, server := range servers {
    75  		if _, ok := f.inflight[server.ID]; ok {
    76  			f.logger.Warn("failed retrieving server health; last request still outstanding", "server", server.Name)
    77  		} else {
    78  			workItem := &workItem{
    79  				server:  server,
    80  				replyCh: make(chan *autopilot.ServerStats, 1),
    81  			}
    82  			work = append(work, workItem)
    83  			f.inflight[server.ID] = struct{}{}
    84  			go f.fetch(workItem.server, workItem.replyCh)
    85  		}
    86  	}
    87  	f.inflightLock.Unlock()
    88  
    89  	// Now wait for the results to come in, or for the context to be
    90  	// canceled.
    91  	replies := make(map[string]*autopilot.ServerStats)
    92  	for _, workItem := range work {
    93  		select {
    94  		case reply := <-workItem.replyCh:
    95  			replies[workItem.server.ID] = reply
    96  
    97  		case <-ctx.Done():
    98  			f.logger.Warn("failed retrieving server health", "server", workItem.server.Name, "error", ctx.Err())
    99  		}
   100  	}
   101  	return replies
   102  }