github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/pkg/election/election_test.go (about)

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package election
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"net/url"
    20  	"sync"
    21  	"testing"
    22  	"time"
    23  
    24  	. "github.com/pingcap/check"
    25  	"github.com/pingcap/failpoint"
    26  	"github.com/pingcap/tiflow/dm/pkg/etcdutil"
    27  	"github.com/pingcap/tiflow/dm/pkg/log"
    28  	"github.com/pingcap/tiflow/dm/pkg/terror"
    29  	"github.com/pingcap/tiflow/dm/pkg/utils"
    30  	"github.com/tikv/pd/pkg/utils/tempurl"
    31  	clientv3 "go.etcd.io/etcd/client/v3"
    32  	"go.etcd.io/etcd/server/v3/embed"
    33  )
    34  
// Register testElectionSuite with the check framework through SerialSuites.
var _ = SerialSuites(&testElectionSuite{})
    36  
// TestSuite is the `go test` entry point; it hands t over to the check
// framework via TestingT.
func TestSuite(t *testing.T) {
	TestingT(t)
}
    40  
// testElectionSuite holds the fixtures shared by the election tests.
type testElectionSuite struct {
	// etcd is the embedded etcd server started in SetUpTest and closed in TearDownTest.
	etcd     *embed.Etcd
	// endPoint is the client URL of the embedded etcd server; tests pass it to etcdutil.CreateClient.
	endPoint string

	// notifyBlockTime is handed to NewElection as its last argument by every test.
	notifyBlockTime time.Duration
}
    47  
    48  func (t *testElectionSuite) SetUpTest(c *C) {
    49  	c.Assert(log.InitLogger(&log.Config{}), IsNil)
    50  
    51  	cfg := embed.NewConfig()
    52  	cfg.Name = "election-test"
    53  	cfg.Dir = c.MkDir()
    54  	cfg.ZapLoggerBuilder = embed.NewZapCoreLoggerBuilder(log.L().Logger, log.L().Core(), log.Props().Syncer)
    55  	cfg.Logger = "zap"
    56  	err := cfg.Validate() // verify & trigger the builder
    57  	c.Assert(err, IsNil)
    58  
    59  	t.endPoint = tempurl.Alloc()
    60  	url2, err := url.Parse(t.endPoint)
    61  	c.Assert(err, IsNil)
    62  	cfg.ListenClientUrls = []url.URL{*url2}
    63  	cfg.AdvertiseClientUrls = cfg.ListenClientUrls
    64  
    65  	url2, err = url.Parse(tempurl.Alloc())
    66  	c.Assert(err, IsNil)
    67  	cfg.ListenPeerUrls = []url.URL{*url2}
    68  	cfg.AdvertisePeerUrls = cfg.ListenPeerUrls
    69  
    70  	cfg.InitialCluster = fmt.Sprintf("%s=%s", cfg.Name, url2)
    71  	cfg.ClusterState = embed.ClusterStateFlagNew
    72  
    73  	t.etcd, err = embed.StartEtcd(cfg)
    74  	c.Assert(err, IsNil)
    75  	select {
    76  	case <-t.etcd.Server.ReadyNotify():
    77  	case <-time.After(10 * time.Second):
    78  		c.Fatal("start embed etcd timeout")
    79  	}
    80  
    81  	// some notify leader information is not handled, just reduce the block time and ignore them
    82  	t.notifyBlockTime = 100 * time.Millisecond
    83  }
    84  
// TearDownTest closes the embedded etcd server that SetUpTest started.
func (t *testElectionSuite) TearDownTest(c *C) {
	t.etcd.Close()
}
    88  
    89  func testElection2After1(t *testElectionSuite, c *C, normalExit bool) {
    90  	var (
    91  		timeout    = 3 * time.Second
    92  		sessionTTL = 60
    93  		key        = "unit-test/election-2-after-1"
    94  		ID1        = "member1"
    95  		ID2        = "member2"
    96  		ID3        = "member3"
    97  		addr1      = "127.0.0.1:1"
    98  		addr2      = "127.0.0.1:2"
    99  		addr3      = "127.0.0.1:3"
   100  	)
   101  	cli, err := etcdutil.CreateClient([]string{t.endPoint}, nil)
   102  	c.Assert(err, IsNil)
   103  	defer cli.Close()
   104  	ctx0, cancel0 := context.WithCancel(context.Background())
   105  	defer cancel0()
   106  	_, err = cli.Delete(ctx0, key, clientv3.WithPrefix())
   107  	c.Assert(err, IsNil)
   108  
   109  	ctx1, cancel1 := context.WithCancel(context.Background())
   110  	defer cancel1()
   111  	if !normalExit {
   112  		c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/pkg/election/mockCampaignLoopExitedAbnormally", `return()`), IsNil)
   113  		//nolint:errcheck
   114  		defer failpoint.Disable("github.com/pingcap/tiflow/dm/pkg/election/mockCampaignLoopExitedAbnormally")
   115  	}
   116  	e1, err := NewElection(ctx1, cli, sessionTTL, key, ID1, addr1, t.notifyBlockTime)
   117  	c.Assert(err, IsNil)
   118  	defer e1.Close()
   119  
   120  	// e1 should become the leader
   121  	select {
   122  	case leader := <-e1.LeaderNotify():
   123  		c.Assert(leader.ID, Equals, ID1)
   124  	case <-time.After(timeout):
   125  		c.Fatal("leader campaign timeout")
   126  	}
   127  	c.Assert(e1.IsLeader(), IsTrue)
   128  	_, leaderID, leaderAddr, err := e1.LeaderInfo(ctx1)
   129  	c.Assert(err, IsNil)
   130  	c.Assert(leaderID, Equals, e1.ID())
   131  	c.Assert(leaderAddr, Equals, addr1)
   132  	if !normalExit {
   133  		c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/pkg/election/mockCampaignLoopExitedAbnormally"), IsNil)
   134  	}
   135  
   136  	// start e2
   137  	ctx2, cancel2 := context.WithCancel(context.Background())
   138  	defer cancel2()
   139  	e2, err := NewElection(ctx2, cli, sessionTTL, key, ID2, addr2, t.notifyBlockTime)
   140  	c.Assert(err, IsNil)
   141  	defer e2.Close()
   142  	select {
   143  	case leader := <-e2.leaderCh:
   144  		c.Assert(leader.ID, Equals, ID1)
   145  	case <-time.After(timeout):
   146  		c.Fatal("leader campaign timeout")
   147  	}
   148  	// but the leader should still be e1
   149  	_, leaderID, leaderAddr, err = e2.LeaderInfo(ctx2)
   150  	c.Assert(err, IsNil)
   151  	c.Assert(leaderID, Equals, e1.ID())
   152  	c.Assert(leaderAddr, Equals, addr1)
   153  	c.Assert(e2.IsLeader(), IsFalse)
   154  
   155  	var wg sync.WaitGroup
   156  	e1.Close() // stop the campaign for e1
   157  	c.Assert(e1.IsLeader(), IsFalse)
   158  
   159  	ctx3, cancel3 := context.WithTimeout(context.Background(), 3*time.Second)
   160  	defer cancel3()
   161  	deleted, err := e2.ClearSessionIfNeeded(ctx3, ID1)
   162  	c.Assert(err, IsNil)
   163  	if normalExit {
   164  		// for normally exited election, session has already been closed before
   165  		c.Assert(deleted, IsFalse)
   166  	} else {
   167  		// for abnormally exited election, session will be cleared here
   168  		c.Assert(deleted, IsTrue)
   169  	}
   170  
   171  	// e2 should become the leader
   172  	select {
   173  	case leader := <-e2.LeaderNotify():
   174  		c.Assert(leader.ID, Equals, ID2)
   175  	case <-time.After(timeout):
   176  		c.Fatal("leader campaign timeout")
   177  	}
   178  	c.Assert(e2.IsLeader(), IsTrue)
   179  	_, leaderID, leaderAddr, err = e2.LeaderInfo(ctx2)
   180  	c.Assert(err, IsNil)
   181  	c.Assert(leaderID, Equals, e2.ID())
   182  	c.Assert(leaderAddr, Equals, addr2)
   183  
   184  	// only e2's election info is left in etcd
   185  	ctx4, cancel4 := context.WithTimeout(context.Background(), 3*time.Second)
   186  	defer cancel4()
   187  	resp, err := cli.Get(ctx4, key, clientv3.WithPrefix())
   188  	c.Assert(err, IsNil)
   189  	c.Assert(resp.Kvs, HasLen, 1)
   190  
   191  	// if closing the client when campaigning, we should get an error
   192  	wg.Add(1)
   193  	go func() {
   194  		defer wg.Done()
   195  		select {
   196  		case err2 := <-e2.ErrorNotify():
   197  			c.Assert(terror.ErrElectionCampaignFail.Equal(err2), IsTrue)
   198  			// the old session is done, but we can't create a new one.
   199  			c.Assert(err2, ErrorMatches, ".*fail to campaign leader: create a new session.*")
   200  		case <-time.After(timeout):
   201  			c.Fatal("do not receive error for e2")
   202  		}
   203  	}()
   204  	cli.Close() // close the client
   205  	wg.Wait()
   206  
   207  	// can not elect with closed client.
   208  	ctx5, cancel5 := context.WithCancel(context.Background())
   209  	defer cancel5()
   210  	_, err = NewElection(ctx5, cli, sessionTTL, key, ID3, addr3, t.notifyBlockTime)
   211  	c.Assert(terror.ErrElectionCampaignFail.Equal(err), IsTrue)
   212  	c.Assert(err, ErrorMatches, ".*Message: fail to campaign leader: create the initial session, RawCause: context canceled.*")
   213  	cancel0()
   214  }
   215  
   216  func (t *testElectionSuite) TestElection2After1(c *C) {
   217  	testElection2After1(t, c, true)
   218  	testElection2After1(t, c, false)
   219  }
   220  
   221  func (t *testElectionSuite) TestElectionAlways1(c *C) {
   222  	var (
   223  		timeout    = 3 * time.Second
   224  		sessionTTL = 60
   225  		key        = "unit-test/election-always-1"
   226  		ID1        = "member1"
   227  		ID2        = "member2"
   228  		addr1      = "127.0.0.1:1234"
   229  		addr2      = "127.0.0.1:2345"
   230  	)
   231  	cli, err := etcdutil.CreateClient([]string{t.endPoint}, nil)
   232  	c.Assert(err, IsNil)
   233  	defer cli.Close()
   234  
   235  	ctx1, cancel1 := context.WithCancel(context.Background())
   236  	defer cancel1()
   237  	e1, err := NewElection(ctx1, cli, sessionTTL, key, ID1, addr1, t.notifyBlockTime)
   238  	c.Assert(err, IsNil)
   239  	defer e1.Close()
   240  
   241  	// e1 should become the leader
   242  	select {
   243  	case leader := <-e1.LeaderNotify():
   244  		c.Assert(leader.ID, Equals, ID1)
   245  	case <-time.After(timeout):
   246  		c.Fatal("leader campaign timeout")
   247  	}
   248  	c.Assert(e1.IsLeader(), IsTrue)
   249  	_, leaderID, leaderAddr, err := e1.LeaderInfo(ctx1)
   250  	c.Assert(err, IsNil)
   251  	c.Assert(leaderID, Equals, e1.ID())
   252  	c.Assert(leaderAddr, Equals, addr1)
   253  
   254  	// start e2
   255  	ctx2, cancel2 := context.WithCancel(context.Background())
   256  	defer cancel2()
   257  	e2, err := NewElection(ctx2, cli, sessionTTL, key, ID2, addr2, t.notifyBlockTime)
   258  	c.Assert(err, IsNil)
   259  	defer e2.Close()
   260  	time.Sleep(100 * time.Millisecond) // wait 100ms to start the campaign
   261  	// but the leader should still be e1
   262  	_, leaderID, leaderAddr, err = e2.LeaderInfo(ctx2)
   263  	c.Assert(err, IsNil)
   264  	c.Assert(leaderID, Equals, e1.ID())
   265  	c.Assert(leaderAddr, Equals, addr1)
   266  	c.Assert(e2.IsLeader(), IsFalse)
   267  
   268  	// cancel the campaign for e2, should get no errors
   269  	var wg sync.WaitGroup
   270  	wg.Add(1)
   271  	go func() {
   272  		defer wg.Done()
   273  		select {
   274  		case err2 := <-e2.ErrorNotify():
   275  			c.Fatalf("cancel the campaign should not get an error, %v", err2)
   276  		case <-time.After(timeout): // wait 3s
   277  		}
   278  	}()
   279  	cancel2()
   280  	wg.Wait()
   281  
   282  	// e1 is still the leader
   283  	c.Assert(e1.IsLeader(), IsTrue)
   284  	_, leaderID, leaderAddr, err = e1.LeaderInfo(ctx1)
   285  	c.Assert(err, IsNil)
   286  	c.Assert(leaderID, Equals, e1.ID())
   287  	c.Assert(leaderAddr, Equals, addr1)
   288  	c.Assert(e2.IsLeader(), IsFalse)
   289  }
   290  
   291  func (t *testElectionSuite) TestElectionEvictLeader(c *C) {
   292  	var (
   293  		timeout    = 3 * time.Second
   294  		sessionTTL = 60
   295  		key        = "unit-test/election-evict-leader"
   296  		ID1        = "member1"
   297  		ID2        = "member2"
   298  		addr1      = "127.0.0.1:1234"
   299  		addr2      = "127.0.0.1:2345"
   300  	)
   301  	cli, err := etcdutil.CreateClient([]string{t.endPoint}, nil)
   302  	c.Assert(err, IsNil)
   303  	defer cli.Close()
   304  
   305  	ctx1, cancel1 := context.WithCancel(context.Background())
   306  	defer cancel1()
   307  	e1, err := NewElection(ctx1, cli, sessionTTL, key, ID1, addr1, t.notifyBlockTime)
   308  	c.Assert(err, IsNil)
   309  	defer e1.Close()
   310  
   311  	// e1 should become the leader
   312  	select {
   313  	case leader := <-e1.LeaderNotify():
   314  		c.Assert(leader.ID, Equals, ID1)
   315  	case <-time.After(timeout):
   316  		c.Fatal("leader campaign timeout")
   317  	}
   318  	c.Assert(e1.IsLeader(), IsTrue)
   319  	_, leaderID, leaderAddr, err := e1.LeaderInfo(ctx1)
   320  	c.Assert(err, IsNil)
   321  	c.Assert(leaderID, Equals, e1.ID())
   322  	c.Assert(leaderAddr, Equals, addr1)
   323  
   324  	// start e2
   325  	ctx2, cancel2 := context.WithCancel(context.Background())
   326  	defer cancel2()
   327  	e2, err := NewElection(ctx2, cli, sessionTTL, key, ID2, addr2, t.notifyBlockTime)
   328  	c.Assert(err, IsNil)
   329  	defer e2.Close()
   330  	time.Sleep(100 * time.Millisecond) // wait 100ms to start the campaign
   331  	// but the leader should still be e1
   332  	_, leaderID, leaderAddr, err = e2.LeaderInfo(ctx2)
   333  	c.Assert(err, IsNil)
   334  	c.Assert(leaderID, Equals, e1.ID())
   335  	c.Assert(leaderAddr, Equals, addr1)
   336  	c.Assert(e2.IsLeader(), IsFalse)
   337  
   338  	// e1 evict leader, and e2 will be the leader
   339  	e1.EvictLeader()
   340  	utils.WaitSomething(8, 250*time.Millisecond, func() bool {
   341  		_, leaderID, _, _ = e2.LeaderInfo(ctx2)
   342  		return leaderID == e2.ID()
   343  	})
   344  	_, leaderID, leaderAddr, err = e2.LeaderInfo(ctx2)
   345  	c.Assert(err, IsNil)
   346  	c.Assert(leaderID, Equals, e2.ID())
   347  	c.Assert(leaderAddr, Equals, addr2)
   348  	utils.WaitSomething(10, 10*time.Millisecond, func() bool {
   349  		return e2.IsLeader()
   350  	})
   351  
   352  	// cancel evict of e1, and then evict e2, e1 will be the leader
   353  	e1.CancelEvictLeader()
   354  	e2.EvictLeader()
   355  	utils.WaitSomething(8, 250*time.Millisecond, func() bool {
   356  		_, leaderID, _, _ = e1.LeaderInfo(ctx1)
   357  		return leaderID == e1.ID()
   358  	})
   359  	_, leaderID, leaderAddr, err = e1.LeaderInfo(ctx1)
   360  	c.Assert(err, IsNil)
   361  	c.Assert(leaderID, Equals, e1.ID())
   362  	c.Assert(leaderAddr, Equals, addr1)
   363  	utils.WaitSomething(10, 10*time.Millisecond, func() bool {
   364  		return e1.IsLeader()
   365  	})
   366  }
   367  
   368  func (t *testElectionSuite) TestElectionDeleteKey(c *C) {
   369  	var (
   370  		timeout    = 3 * time.Second
   371  		sessionTTL = 60
   372  		key        = "unit-test/election-delete-key"
   373  		ID         = "member"
   374  		addr       = "127.0.0.1:1234"
   375  	)
   376  	cli, err := etcdutil.CreateClient([]string{t.endPoint}, nil)
   377  	c.Assert(err, IsNil)
   378  	defer cli.Close()
   379  
   380  	ctx, cancel := context.WithCancel(context.Background())
   381  	defer cancel()
   382  	e, err := NewElection(ctx, cli, sessionTTL, key, ID, addr, t.notifyBlockTime)
   383  	c.Assert(err, IsNil)
   384  	defer e.Close()
   385  
   386  	// should become the leader
   387  	select {
   388  	case leader := <-e.LeaderNotify():
   389  		c.Assert(leader.ID, Equals, ID)
   390  	case <-time.After(timeout):
   391  		c.Fatal("leader campaign timeout")
   392  	}
   393  	c.Assert(e.IsLeader(), IsTrue)
   394  	leaderKey, leaderID, leaderAddr, err := e.LeaderInfo(ctx)
   395  	c.Assert(err, IsNil)
   396  	c.Assert(leaderID, Equals, e.ID())
   397  	c.Assert(leaderAddr, Equals, addr)
   398  
   399  	// the leader retired after deleted the key
   400  	var wg sync.WaitGroup
   401  	wg.Add(1)
   402  	go func() {
   403  		wg.Done()
   404  		select {
   405  		case err2 := <-e.ErrorNotify():
   406  			c.Fatalf("delete the leader key should not get an error, %v", err2)
   407  		case leader := <-e.LeaderNotify():
   408  			c.Assert(leader, IsNil)
   409  		}
   410  	}()
   411  	_, err = cli.Delete(ctx, leaderKey)
   412  	c.Assert(err, IsNil)
   413  	wg.Wait()
   414  }