github.com/erda-project/erda-infra@v1.0.9/providers/etcd-election/election.go (about) 1 // Copyright (c) 2021 Terminus, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package election 16 17 import ( 18 "context" 19 "errors" 20 "path/filepath" 21 "reflect" 22 "sync" 23 "time" 24 25 "github.com/coreos/etcd/clientv3" 26 "github.com/coreos/etcd/clientv3/concurrency" 27 "github.com/coreos/etcd/mvcc/mvccpb" 28 "github.com/recallsong/go-utils/errorx" 29 uuid "github.com/satori/go.uuid" 30 31 "github.com/erda-project/erda-infra/base/logs" 32 "github.com/erda-project/erda-infra/base/servicehub" 33 ) 34 35 // Node . 36 type Node struct { 37 ID string 38 } 39 40 // Action . 41 type Action int32 42 43 // Action values 44 const ( 45 ActionPut = Action(iota + 1) 46 ActionDelete 47 ) 48 49 func (a Action) String() string { 50 switch a { 51 case ActionPut: 52 return "put" 53 case ActionDelete: 54 return "delete" 55 } 56 return "unknown" 57 } 58 59 // Event . 60 type Event struct { 61 Action Action 62 Node Node 63 } 64 65 // WatchOption . 66 type WatchOption interface{} 67 68 // Interface . 69 type Interface interface { 70 Node() Node 71 Nodes() ([]Node, error) 72 Leader() (*Node, error) 73 IsLeader() bool 74 ResignLeader() error 75 OnLeader(handler func(context.Context)) 76 Watch(ctx context.Context, opts ...WatchOption) <-chan Event 77 SetNonVoter(b bool) 78 } 79 80 type config struct { 81 Prefix string `file:"root_path" default:"etcd-election"` 82 NodeID string `file:"node_id"` 83 NonVoter bool `file:"non_voter"` 84 } 85 86 type provider struct { 87 Cfg *config 88 Log logs.Logger 89 Client *clientv3.Client `autowired:"etcd-client"` 90 prefix string 91 92 lock sync.RWMutex 93 leaderHandlers []func(ctx context.Context) 94 cancelHandler func() 95 election *concurrency.Election 96 session *concurrency.Session 97 iAmLeader bool 98 waitCh chan struct{} 99 } 100 101 // Init . 102 func (p *provider) Init(ctx servicehub.Context) error { 103 p.Cfg.Prefix = filepath.Clean("/" + p.Cfg.Prefix) 104 p.prefix = p.Cfg.Prefix + "/" 105 if len(p.Cfg.NodeID) <= 0 { 106 p.Cfg.NodeID = uuid.NewV4().String() 107 } 108 if p.Cfg.NonVoter { 109 p.waitCh = make(chan struct{}) 110 } 111 p.Log.Info("my node id: ", p.Cfg.NodeID) 112 return nil 113 } 114 115 func (p *provider) reset(session *concurrency.Session) { 116 session.Close() 117 p.lock.Lock() 118 p.session, p.election = nil, nil 119 p.iAmLeader = false 120 p.lock.Unlock() 121 } 122 123 func (p *provider) Run(ctx context.Context) error { 124 for { 125 select { 126 case <-ctx.Done(): 127 return nil 128 default: 129 } 130 131 p.lock.Lock() 132 waitCh := p.waitCh 133 p.lock.Unlock() 134 for waitCh != nil { 135 select { 136 case <-ctx.Done(): 137 return nil 138 case <-waitCh: 139 } 140 p.lock.Lock() 141 waitCh = p.waitCh 142 p.lock.Unlock() 143 } 144 145 session, err := p.newSession(ctx, 5*time.Second) 146 if err != nil { 147 if errors.Is(err, context.Canceled) { 148 return nil 149 } 150 p.Log.Errorf("fail to NewSession: %s", err) 151 time.Sleep(2 * time.Second) 152 continue 153 } 154 155 election := concurrency.NewElection(session, p.Cfg.Prefix) 156 p.lock.Lock() 157 p.session, p.election = session, election 158 p.lock.Unlock() 159 if err = election.Campaign(ctx, p.Cfg.NodeID); err != nil { 160 if errors.Is(err, context.Canceled) { 161 return nil 162 } 163 p.reset(session) 164 p.Log.Errorf("fail to Campaign: %s", err, reflect.TypeOf(err)) 165 time.Sleep(1 * time.Second) 166 continue 167 } 168 169 // Let's say A is leader and B is non-leader. 170 // The etcd server's stopped and it's restarted after a while like 10 seconds. 171 // The campaign of B exited with nil after connection was restored. 172 select { 173 case <-session.Done(): 174 p.reset(session) 175 continue 176 default: 177 } 178 179 p.Log.Infof("I am leader ! Node is %q", p.Cfg.NodeID) 180 181 p.runHandlers() 182 select { 183 case <-session.Done(): 184 p.resignLeader(nil) 185 continue 186 case <-ctx.Done(): 187 p.resignLeader(nil) 188 return nil 189 } 190 } 191 } 192 193 func (p *provider) newSession(ctx context.Context, ttl time.Duration) (*concurrency.Session, error) { 194 opts := []concurrency.SessionOption{concurrency.WithContext(ctx)} 195 seconds := int(ttl.Seconds()) 196 if seconds > 0 { 197 opts = append(opts, concurrency.WithTTL(seconds)) 198 } 199 return concurrency.NewSession(p.Client, opts...) 200 } 201 202 func (p *provider) runHandlers() { 203 ctx, cancel := context.WithCancel(context.Background()) 204 p.lock.Lock() 205 p.iAmLeader = true 206 p.cancelHandler = cancel 207 p.lock.Unlock() 208 for _, h := range p.leaderHandlers { 209 go func(h func(context.Context)) { 210 h(ctx) 211 }(h) 212 } 213 } 214 215 func (p *provider) Node() Node { 216 return Node{ID: p.Cfg.NodeID} 217 } 218 219 func (p *provider) Nodes() ([]Node, error) { 220 resp, err := p.Client.Get(context.Background(), p.prefix, clientv3.WithPrefix()) 221 if err != nil { 222 return nil, err 223 } 224 var nodes []Node 225 for _, kv := range resp.Kvs { 226 nodes = append(nodes, Node{ID: string(kv.Value)}) 227 } 228 return nodes, nil 229 } 230 231 func (p *provider) Leader() (*Node, error) { 232 if p.IsLeader() { 233 node := p.Node() 234 return &node, nil 235 } 236 clientv3.WithFirstCreate() 237 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 238 defer cancel() 239 resp, err := p.Client.Get(ctx, p.prefix, clientv3.WithFirstCreate()...) 240 if err != nil { 241 return nil, err 242 } 243 if len(resp.Kvs) == 0 { 244 return nil, nil 245 } 246 node := &Node{ID: string(resp.Kvs[0].Value)} 247 return node, nil 248 } 249 250 func (p *provider) IsLeader() bool { 251 p.lock.RLock() 252 defer p.lock.RUnlock() 253 return p.iAmLeader 254 } 255 256 func (p *provider) ResignLeader() error { 257 err := p.resignLeader(nil) 258 if err != nil { 259 p.Log.Warnf("fail to resign leader: %s", err) 260 } 261 return err 262 } 263 264 func (p *provider) resignLeader(wait *bool) error { 265 var election *concurrency.Election 266 var session *concurrency.Session 267 268 p.lock.Lock() 269 if wait != nil { 270 if *wait && p.waitCh == nil { 271 p.waitCh = make(chan struct{}) 272 } else if !*wait && p.waitCh != nil { 273 close(p.waitCh) 274 p.waitCh = nil 275 } 276 } 277 if !p.iAmLeader { 278 p.lock.Unlock() 279 return nil 280 } 281 p.iAmLeader = false 282 p.cancelHandler() 283 p.cancelHandler = nil 284 election = p.election 285 session = p.session 286 p.session, p.election = nil, nil 287 p.lock.Unlock() 288 289 ctx, cancel := context.WithTimeout(context.Background(), time.Second*1) 290 defer cancel() 291 var errs errorx.Errors 292 err := election.Resign(ctx) 293 if err != nil { 294 errs.Append(err) 295 } 296 err = session.Close() 297 if err != nil { 298 errs.Append(err) 299 } 300 p.Log.Infof("Resign leader ! Node is %q", p.Cfg.NodeID) 301 return errs.MaybeUnwrap() 302 } 303 304 func (p *provider) OnLeader(handler func(context.Context)) { 305 p.leaderHandlers = append(p.leaderHandlers, handler) 306 } 307 308 func (p *provider) Watch(ctx context.Context, opts ...WatchOption) <-chan Event { 309 notify := make(chan Event, 8) 310 go func() { 311 defer func() { 312 close(notify) 313 p.Log.Debug("election watcher exited") 314 }() 315 opts := []clientv3.OpOption{clientv3.WithPrefix()} 316 for func() bool { 317 wctx, wcancel := context.WithCancel(context.Background()) 318 defer wcancel() 319 wch := p.Client.Watch(wctx, p.prefix, opts...) 320 for { 321 select { 322 case wr, ok := <-wch: 323 if !ok { 324 return true 325 } else if wr.Err() != nil { 326 p.Log.Errorf("election watcher error: %s", wr.Err()) 327 return true 328 } 329 for _, ev := range wr.Events { 330 if ev.Kv == nil { 331 continue 332 } 333 switch ev.Type { 334 case mvccpb.PUT: 335 notify <- Event{ 336 Action: ActionDelete, 337 Node: Node{ID: string(ev.Kv.Value)}, 338 } 339 case mvccpb.DELETE: 340 notify <- Event{ 341 Action: ActionDelete, 342 Node: Node{ID: string(ev.Kv.Value)}, 343 } 344 } 345 } 346 case <-ctx.Done(): 347 return false 348 } 349 } 350 }() { 351 } 352 }() 353 return notify 354 } 355 356 // SetNonVoter . 357 func (p *provider) SetNonVoter(b bool) { 358 p.resignLeader(&b) 359 } 360 361 func init() { 362 servicehub.Register("etcd-election", &servicehub.Spec{ 363 Services: []string{"etcd-election"}, 364 Types: []reflect.Type{ 365 reflect.TypeOf((*Interface)(nil)).Elem(), 366 }, 367 Dependencies: []string{"etcd"}, 368 ConfigFunc: func() interface{} { return &config{} }, 369 Creator: func() servicehub.Provider { 370 return &provider{} 371 }, 372 }) 373 }