github.com/manicqin/nomad@v0.9.5/nomad/stats_fetcher.go (about) 1 package nomad 2 3 import ( 4 "context" 5 "sync" 6 7 log "github.com/hashicorp/go-hclog" 8 9 "github.com/hashicorp/consul/agent/consul/autopilot" 10 "github.com/hashicorp/nomad/helper/pool" 11 "github.com/hashicorp/serf/serf" 12 ) 13 14 // StatsFetcher has two functions for autopilot. First, lets us fetch all the 15 // stats in parallel so we are taking a sample as close to the same time as 16 // possible, since we are comparing time-sensitive info for the health check. 17 // Second, it bounds the time so that one slow RPC can't hold up the health 18 // check loop; as a side effect of how it implements this, it also limits to 19 // a single in-flight RPC to any given server, so goroutines don't accumulate 20 // as we run the health check fairly frequently. 21 type StatsFetcher struct { 22 logger log.Logger 23 pool *pool.ConnPool 24 region string 25 inflight map[string]struct{} 26 inflightLock sync.Mutex 27 } 28 29 // NewStatsFetcher returns a stats fetcher. 30 func NewStatsFetcher(logger log.Logger, pool *pool.ConnPool, region string) *StatsFetcher { 31 return &StatsFetcher{ 32 logger: logger.Named("stats_fetcher"), 33 pool: pool, 34 region: region, 35 inflight: make(map[string]struct{}), 36 } 37 } 38 39 // fetch does the RPC to fetch the server stats from a single server. We don't 40 // cancel this when the context is canceled because we only want one in-flight 41 // RPC to each server, so we let it finish and then clean up the in-flight 42 // tracking. 43 func (f *StatsFetcher) fetch(server *serverParts, replyCh chan *autopilot.ServerStats) { 44 var args struct{} 45 var reply autopilot.ServerStats 46 err := f.pool.RPC(f.region, server.Addr, server.MajorVersion, "Status.RaftStats", &args, &reply) 47 if err != nil { 48 f.logger.Warn("failed retrieving server health", "server", server.Name, "error", err) 49 } else { 50 replyCh <- &reply 51 } 52 53 f.inflightLock.Lock() 54 delete(f.inflight, server.ID) 55 f.inflightLock.Unlock() 56 } 57 58 // Fetch will attempt to query all the servers in parallel. 59 func (f *StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[string]*autopilot.ServerStats { 60 type workItem struct { 61 server *serverParts 62 replyCh chan *autopilot.ServerStats 63 } 64 var servers []*serverParts 65 for _, s := range members { 66 if ok, parts := isNomadServer(s); ok { 67 servers = append(servers, parts) 68 } 69 } 70 71 // Skip any servers that have inflight requests. 72 var work []*workItem 73 f.inflightLock.Lock() 74 for _, server := range servers { 75 if _, ok := f.inflight[server.ID]; ok { 76 f.logger.Warn("failed retrieving server health; last request still outstanding", "server", server.Name) 77 } else { 78 workItem := &workItem{ 79 server: server, 80 replyCh: make(chan *autopilot.ServerStats, 1), 81 } 82 work = append(work, workItem) 83 f.inflight[server.ID] = struct{}{} 84 go f.fetch(workItem.server, workItem.replyCh) 85 } 86 } 87 f.inflightLock.Unlock() 88 89 // Now wait for the results to come in, or for the context to be 90 // canceled. 91 replies := make(map[string]*autopilot.ServerStats) 92 for _, workItem := range work { 93 select { 94 case reply := <-workItem.replyCh: 95 replies[workItem.server.ID] = reply 96 97 case <-ctx.Done(): 98 f.logger.Warn("failed retrieving server health", "server", workItem.server.Name, "error", ctx.Err()) 99 } 100 } 101 return replies 102 }