github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/nomad/serf.go

package nomad

import (
	"sync/atomic"

	"github.com/hashicorp/serf/serf"
)

const (
	// StatusReap is used to update the status of a node if we
	// are handling an EventMemberReap
	StatusReap = serf.MemberStatus(-1)
)

// serfEventHandler is used to handle events from the serf cluster
func (s *Server) serfEventHandler() {
	for {
		select {
		case e := <-s.eventCh:
			switch e.EventType() {
			case serf.EventMemberJoin:
				s.nodeJoin(e.(serf.MemberEvent))
				s.localMemberEvent(e.(serf.MemberEvent))
			case serf.EventMemberLeave, serf.EventMemberFailed:
				s.nodeFailed(e.(serf.MemberEvent))
				s.localMemberEvent(e.(serf.MemberEvent))
			case serf.EventMemberUpdate, serf.EventMemberReap,
				serf.EventUser, serf.EventQuery: // Ignore
			default:
				s.logger.Printf("[WARN] nomad: unhandled serf event: %#v", e)
			}

		case <-s.shutdownCh:
			return
		}
	}
}

// nodeJoin is used to handle join events on the serf cluster
func (s *Server) nodeJoin(me serf.MemberEvent) {
	for _, m := range me.Members {
		ok, parts := isNomadServer(m)
		if !ok {
			s.logger.Printf("[WARN] nomad: non-server in gossip pool: %s", m.Name)
			continue
		}
		s.logger.Printf("[INFO] nomad: adding server %s", parts)

		// Check if this server is known
		found := false
		s.peerLock.Lock()
		existing := s.peers[parts.Region]
		for idx, e := range existing {
			if e.Name == parts.Name {
				existing[idx] = parts
				found = true
				break
			}
		}

		// Add to the list if not known
		if !found {
			s.peers[parts.Region] = append(existing, parts)
		}

		// Check if a local peer
		if parts.Region == s.config.Region {
			s.localPeers[parts.Addr.String()] = parts
		}
		s.peerLock.Unlock()

		// If we are still expecting to bootstrap, we may need to handle this
		if atomic.LoadInt32(&s.config.BootstrapExpect) != 0 {
			s.maybeBootstrap()
		}
	}
}

// maybeBootstrap is used to handle bootstrapping when a new server joins
func (s *Server) maybeBootstrap() {
	var index uint64
	var err error
	if s.raftStore != nil {
		index, err = s.raftStore.LastIndex()
	} else if s.raftInmem != nil {
		index, err = s.raftInmem.LastIndex()
	} else {
		panic("neither raftInmem nor raftStore is initialized")
	}
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to read last raft index: %v", err)
		return
	}

	// Bootstrap can only be done if there are no committed logs;
	// otherwise, remove our expectation of bootstrapping
	if index != 0 {
		atomic.StoreInt32(&s.config.BootstrapExpect, 0)
		return
	}

	// Scan for all the known servers
	members := s.serf.Members()
	addrs := make([]string, 0)
	for _, member := range members {
		valid, p := isNomadServer(member)
		if !valid {
			continue
		}
		if p.Region != s.config.Region {
			continue
		}
		if p.Expect != 0 && p.Expect != int(atomic.LoadInt32(&s.config.BootstrapExpect)) {
			s.logger.Printf("[ERR] nomad: peer %v has a conflicting expect value. All nodes should expect the same number.", member)
			return
		}
		if p.Bootstrap {
			s.logger.Printf("[ERR] nomad: peer %v has bootstrap mode. Expect disabled.", member)
			return
		}
		addrs = append(addrs, p.Addr.String())
	}

	// Skip if we haven't met the minimum expect count
	if len(addrs) < int(atomic.LoadInt32(&s.config.BootstrapExpect)) {
		return
	}

	// Update the peer set
	s.logger.Printf("[INFO] nomad: Attempting bootstrap with nodes: %v", addrs)
	if err := s.raft.SetPeers(addrs).Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to bootstrap peers: %v", err)
	}

	// Bootstrapping complete; don't enter this again
	atomic.StoreInt32(&s.config.BootstrapExpect, 0)
}

// nodeFailed is used to handle fail events on the serf cluster
func (s *Server) nodeFailed(me serf.MemberEvent) {
	for _, m := range me.Members {
		ok, parts := isNomadServer(m)
		if !ok {
			continue
		}
		s.logger.Printf("[INFO] nomad: removing server %s", parts)

		// Remove the server if known
		s.peerLock.Lock()
		existing := s.peers[parts.Region]
		n := len(existing)
		for i := 0; i < n; i++ {
			if existing[i].Name == parts.Name {
				// Swap with the last element and truncate the slice
				existing[i], existing[n-1] = existing[n-1], nil
				existing = existing[:n-1]
				n--
				break
			}
		}

		// Trim the list if there are no known servers in the region
		if n == 0 {
			delete(s.peers, parts.Region)
		} else {
			s.peers[parts.Region] = existing
		}

		// Check if local peer
		if parts.Region == s.config.Region {
			delete(s.localPeers, parts.Addr.String())
		}
		s.peerLock.Unlock()
	}
}

// localMemberEvent is used to reconcile Serf events with the
// consistent store if we are the current leader.
func (s *Server) localMemberEvent(me serf.MemberEvent) {
	// Do nothing if we are not the leader
	if !s.IsLeader() {
		return
	}

	// Check if this is a reap event
	isReap := me.EventType() == serf.EventMemberReap

	// Queue the members for reconciliation
	for _, m := range me.Members {
		// Change the status if this is a reap event
		if isReap {
			m.Status = StatusReap
		}
		// Non-blocking send: drop the event if the reconcile channel is full
		select {
		case s.reconcileCh <- m:
		default:
		}
	}
}