github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/stats_fetcher.go (about) 1 package nomad 2 3 import ( 4 "context" 5 "log" 6 "sync" 7 8 "github.com/hashicorp/consul/agent/consul/autopilot" 9 "github.com/hashicorp/nomad/helper/pool" 10 "github.com/hashicorp/serf/serf" 11 ) 12 13 // StatsFetcher has two functions for autopilot. First, lets us fetch all the 14 // stats in parallel so we are taking a sample as close to the same time as 15 // possible, since we are comparing time-sensitive info for the health check. 16 // Second, it bounds the time so that one slow RPC can't hold up the health 17 // check loop; as a side effect of how it implements this, it also limits to 18 // a single in-flight RPC to any given server, so goroutines don't accumulate 19 // as we run the health check fairly frequently. 20 type StatsFetcher struct { 21 logger *log.Logger 22 pool *pool.ConnPool 23 region string 24 inflight map[string]struct{} 25 inflightLock sync.Mutex 26 } 27 28 // NewStatsFetcher returns a stats fetcher. 29 func NewStatsFetcher(logger *log.Logger, pool *pool.ConnPool, region string) *StatsFetcher { 30 return &StatsFetcher{ 31 logger: logger, 32 pool: pool, 33 region: region, 34 inflight: make(map[string]struct{}), 35 } 36 } 37 38 // fetch does the RPC to fetch the server stats from a single server. We don't 39 // cancel this when the context is canceled because we only want one in-flight 40 // RPC to each server, so we let it finish and then clean up the in-flight 41 // tracking. 42 func (f *StatsFetcher) fetch(server *serverParts, replyCh chan *autopilot.ServerStats) { 43 var args struct{} 44 var reply autopilot.ServerStats 45 err := f.pool.RPC(f.region, server.Addr, server.MajorVersion, "Status.RaftStats", &args, &reply) 46 if err != nil { 47 f.logger.Printf("[WARN] nomad: error getting server health from %q: %v", 48 server.Name, err) 49 } else { 50 replyCh <- &reply 51 } 52 53 f.inflightLock.Lock() 54 delete(f.inflight, server.ID) 55 f.inflightLock.Unlock() 56 } 57 58 // Fetch will attempt to query all the servers in parallel. 59 func (f *StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[string]*autopilot.ServerStats { 60 type workItem struct { 61 server *serverParts 62 replyCh chan *autopilot.ServerStats 63 } 64 var servers []*serverParts 65 for _, s := range members { 66 if ok, parts := isNomadServer(s); ok { 67 servers = append(servers, parts) 68 } 69 } 70 71 // Skip any servers that have inflight requests. 72 var work []*workItem 73 f.inflightLock.Lock() 74 for _, server := range servers { 75 if _, ok := f.inflight[server.ID]; ok { 76 f.logger.Printf("[WARN] nomad: error getting server health from %q: last request still outstanding", 77 server.Name) 78 } else { 79 workItem := &workItem{ 80 server: server, 81 replyCh: make(chan *autopilot.ServerStats, 1), 82 } 83 work = append(work, workItem) 84 f.inflight[server.ID] = struct{}{} 85 go f.fetch(workItem.server, workItem.replyCh) 86 } 87 } 88 f.inflightLock.Unlock() 89 90 // Now wait for the results to come in, or for the context to be 91 // canceled. 92 replies := make(map[string]*autopilot.ServerStats) 93 for _, workItem := range work { 94 select { 95 case reply := <-workItem.replyCh: 96 replies[workItem.server.ID] = reply 97 98 case <-ctx.Done(): 99 f.logger.Printf("[WARN] nomad: error getting server health from %q: %v", 100 workItem.server.Name, ctx.Err()) 101 } 102 } 103 return replies 104 }