github.com/erda-project/erda-infra@v1.0.9/providers/etcd-election/election.go (about)

     1  // Copyright (c) 2021 Terminus, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package election
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"path/filepath"
    21  	"reflect"
    22  	"sync"
    23  	"time"
    24  
    25  	"github.com/coreos/etcd/clientv3"
    26  	"github.com/coreos/etcd/clientv3/concurrency"
    27  	"github.com/coreos/etcd/mvcc/mvccpb"
    28  	"github.com/recallsong/go-utils/errorx"
    29  	uuid "github.com/satori/go.uuid"
    30  
    31  	"github.com/erda-project/erda-infra/base/logs"
    32  	"github.com/erda-project/erda-infra/base/servicehub"
    33  )
    34  
    35  // Node .
    36  type Node struct {
    37  	ID string
    38  }
    39  
    40  // Action .
    41  type Action int32
    42  
    43  // Action values
    44  const (
    45  	ActionPut = Action(iota + 1)
    46  	ActionDelete
    47  )
    48  
    49  func (a Action) String() string {
    50  	switch a {
    51  	case ActionPut:
    52  		return "put"
    53  	case ActionDelete:
    54  		return "delete"
    55  	}
    56  	return "unknown"
    57  }
    58  
    59  // Event .
    60  type Event struct {
    61  	Action Action
    62  	Node   Node
    63  }
    64  
    65  // WatchOption .
    66  type WatchOption interface{}
    67  
    68  // Interface .
    69  type Interface interface {
    70  	Node() Node
    71  	Nodes() ([]Node, error)
    72  	Leader() (*Node, error)
    73  	IsLeader() bool
    74  	ResignLeader() error
    75  	OnLeader(handler func(context.Context))
    76  	Watch(ctx context.Context, opts ...WatchOption) <-chan Event
    77  	SetNonVoter(b bool)
    78  }
    79  
    80  type config struct {
    81  	Prefix   string `file:"root_path" default:"etcd-election"`
    82  	NodeID   string `file:"node_id"`
    83  	NonVoter bool   `file:"non_voter"`
    84  }
    85  
    86  type provider struct {
    87  	Cfg    *config
    88  	Log    logs.Logger
    89  	Client *clientv3.Client `autowired:"etcd-client"`
    90  	prefix string
    91  
    92  	lock           sync.RWMutex
    93  	leaderHandlers []func(ctx context.Context)
    94  	cancelHandler  func()
    95  	election       *concurrency.Election
    96  	session        *concurrency.Session
    97  	iAmLeader      bool
    98  	waitCh         chan struct{}
    99  }
   100  
   101  // Init .
   102  func (p *provider) Init(ctx servicehub.Context) error {
   103  	p.Cfg.Prefix = filepath.Clean("/" + p.Cfg.Prefix)
   104  	p.prefix = p.Cfg.Prefix + "/"
   105  	if len(p.Cfg.NodeID) <= 0 {
   106  		p.Cfg.NodeID = uuid.NewV4().String()
   107  	}
   108  	if p.Cfg.NonVoter {
   109  		p.waitCh = make(chan struct{})
   110  	}
   111  	p.Log.Info("my node id: ", p.Cfg.NodeID)
   112  	return nil
   113  }
   114  
   115  func (p *provider) reset(session *concurrency.Session) {
   116  	session.Close()
   117  	p.lock.Lock()
   118  	p.session, p.election = nil, nil
   119  	p.iAmLeader = false
   120  	p.lock.Unlock()
   121  }
   122  
   123  func (p *provider) Run(ctx context.Context) error {
   124  	for {
   125  		select {
   126  		case <-ctx.Done():
   127  			return nil
   128  		default:
   129  		}
   130  
   131  		p.lock.Lock()
   132  		waitCh := p.waitCh
   133  		p.lock.Unlock()
   134  		for waitCh != nil {
   135  			select {
   136  			case <-ctx.Done():
   137  				return nil
   138  			case <-waitCh:
   139  			}
   140  			p.lock.Lock()
   141  			waitCh = p.waitCh
   142  			p.lock.Unlock()
   143  		}
   144  
   145  		session, err := p.newSession(ctx, 5*time.Second)
   146  		if err != nil {
   147  			if errors.Is(err, context.Canceled) {
   148  				return nil
   149  			}
   150  			p.Log.Errorf("fail to NewSession: %s", err)
   151  			time.Sleep(2 * time.Second)
   152  			continue
   153  		}
   154  
   155  		election := concurrency.NewElection(session, p.Cfg.Prefix)
   156  		p.lock.Lock()
   157  		p.session, p.election = session, election
   158  		p.lock.Unlock()
   159  		if err = election.Campaign(ctx, p.Cfg.NodeID); err != nil {
   160  			if errors.Is(err, context.Canceled) {
   161  				return nil
   162  			}
   163  			p.reset(session)
   164  			p.Log.Errorf("fail to Campaign: %s", err, reflect.TypeOf(err))
   165  			time.Sleep(1 * time.Second)
   166  			continue
   167  		}
   168  
   169  		// Let's say A is leader and B is non-leader.
   170  		// The etcd server's stopped and it's restarted after a while like 10 seconds.
   171  		// The campaign of B exited with nil after connection was restored.
   172  		select {
   173  		case <-session.Done():
   174  			p.reset(session)
   175  			continue
   176  		default:
   177  		}
   178  
   179  		p.Log.Infof("I am leader ! Node is %q", p.Cfg.NodeID)
   180  
   181  		p.runHandlers()
   182  		select {
   183  		case <-session.Done():
   184  			p.resignLeader(nil)
   185  			continue
   186  		case <-ctx.Done():
   187  			p.resignLeader(nil)
   188  			return nil
   189  		}
   190  	}
   191  }
   192  
   193  func (p *provider) newSession(ctx context.Context, ttl time.Duration) (*concurrency.Session, error) {
   194  	opts := []concurrency.SessionOption{concurrency.WithContext(ctx)}
   195  	seconds := int(ttl.Seconds())
   196  	if seconds > 0 {
   197  		opts = append(opts, concurrency.WithTTL(seconds))
   198  	}
   199  	return concurrency.NewSession(p.Client, opts...)
   200  }
   201  
   202  func (p *provider) runHandlers() {
   203  	ctx, cancel := context.WithCancel(context.Background())
   204  	p.lock.Lock()
   205  	p.iAmLeader = true
   206  	p.cancelHandler = cancel
   207  	p.lock.Unlock()
   208  	for _, h := range p.leaderHandlers {
   209  		go func(h func(context.Context)) {
   210  			h(ctx)
   211  		}(h)
   212  	}
   213  }
   214  
   215  func (p *provider) Node() Node {
   216  	return Node{ID: p.Cfg.NodeID}
   217  }
   218  
   219  func (p *provider) Nodes() ([]Node, error) {
   220  	resp, err := p.Client.Get(context.Background(), p.prefix, clientv3.WithPrefix())
   221  	if err != nil {
   222  		return nil, err
   223  	}
   224  	var nodes []Node
   225  	for _, kv := range resp.Kvs {
   226  		nodes = append(nodes, Node{ID: string(kv.Value)})
   227  	}
   228  	return nodes, nil
   229  }
   230  
   231  func (p *provider) Leader() (*Node, error) {
   232  	if p.IsLeader() {
   233  		node := p.Node()
   234  		return &node, nil
   235  	}
   236  	clientv3.WithFirstCreate()
   237  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
   238  	defer cancel()
   239  	resp, err := p.Client.Get(ctx, p.prefix, clientv3.WithFirstCreate()...)
   240  	if err != nil {
   241  		return nil, err
   242  	}
   243  	if len(resp.Kvs) == 0 {
   244  		return nil, nil
   245  	}
   246  	node := &Node{ID: string(resp.Kvs[0].Value)}
   247  	return node, nil
   248  }
   249  
   250  func (p *provider) IsLeader() bool {
   251  	p.lock.RLock()
   252  	defer p.lock.RUnlock()
   253  	return p.iAmLeader
   254  }
   255  
   256  func (p *provider) ResignLeader() error {
   257  	err := p.resignLeader(nil)
   258  	if err != nil {
   259  		p.Log.Warnf("fail to resign leader: %s", err)
   260  	}
   261  	return err
   262  }
   263  
   264  func (p *provider) resignLeader(wait *bool) error {
   265  	var election *concurrency.Election
   266  	var session *concurrency.Session
   267  
   268  	p.lock.Lock()
   269  	if wait != nil {
   270  		if *wait && p.waitCh == nil {
   271  			p.waitCh = make(chan struct{})
   272  		} else if !*wait && p.waitCh != nil {
   273  			close(p.waitCh)
   274  			p.waitCh = nil
   275  		}
   276  	}
   277  	if !p.iAmLeader {
   278  		p.lock.Unlock()
   279  		return nil
   280  	}
   281  	p.iAmLeader = false
   282  	p.cancelHandler()
   283  	p.cancelHandler = nil
   284  	election = p.election
   285  	session = p.session
   286  	p.session, p.election = nil, nil
   287  	p.lock.Unlock()
   288  
   289  	ctx, cancel := context.WithTimeout(context.Background(), time.Second*1)
   290  	defer cancel()
   291  	var errs errorx.Errors
   292  	err := election.Resign(ctx)
   293  	if err != nil {
   294  		errs.Append(err)
   295  	}
   296  	err = session.Close()
   297  	if err != nil {
   298  		errs.Append(err)
   299  	}
   300  	p.Log.Infof("Resign leader ! Node is %q", p.Cfg.NodeID)
   301  	return errs.MaybeUnwrap()
   302  }
   303  
   304  func (p *provider) OnLeader(handler func(context.Context)) {
   305  	p.leaderHandlers = append(p.leaderHandlers, handler)
   306  }
   307  
   308  func (p *provider) Watch(ctx context.Context, opts ...WatchOption) <-chan Event {
   309  	notify := make(chan Event, 8)
   310  	go func() {
   311  		defer func() {
   312  			close(notify)
   313  			p.Log.Debug("election watcher exited")
   314  		}()
   315  		opts := []clientv3.OpOption{clientv3.WithPrefix()}
   316  		for func() bool {
   317  			wctx, wcancel := context.WithCancel(context.Background())
   318  			defer wcancel()
   319  			wch := p.Client.Watch(wctx, p.prefix, opts...)
   320  			for {
   321  				select {
   322  				case wr, ok := <-wch:
   323  					if !ok {
   324  						return true
   325  					} else if wr.Err() != nil {
   326  						p.Log.Errorf("election watcher error: %s", wr.Err())
   327  						return true
   328  					}
   329  					for _, ev := range wr.Events {
   330  						if ev.Kv == nil {
   331  							continue
   332  						}
   333  						switch ev.Type {
   334  						case mvccpb.PUT:
   335  							notify <- Event{
   336  								Action: ActionDelete,
   337  								Node:   Node{ID: string(ev.Kv.Value)},
   338  							}
   339  						case mvccpb.DELETE:
   340  							notify <- Event{
   341  								Action: ActionDelete,
   342  								Node:   Node{ID: string(ev.Kv.Value)},
   343  							}
   344  						}
   345  					}
   346  				case <-ctx.Done():
   347  					return false
   348  				}
   349  			}
   350  		}() {
   351  		}
   352  	}()
   353  	return notify
   354  }
   355  
   356  // SetNonVoter .
   357  func (p *provider) SetNonVoter(b bool) {
   358  	p.resignLeader(&b)
   359  }
   360  
   361  func init() {
   362  	servicehub.Register("etcd-election", &servicehub.Spec{
   363  		Services: []string{"etcd-election"},
   364  		Types: []reflect.Type{
   365  			reflect.TypeOf((*Interface)(nil)).Elem(),
   366  		},
   367  		Dependencies: []string{"etcd"},
   368  		ConfigFunc:   func() interface{} { return &config{} },
   369  		Creator: func() servicehub.Provider {
   370  			return &provider{}
   371  		},
   372  	})
   373  }