go.etcd.io/etcd@v3.3.27+incompatible/etcdserver/raft.go (about) 1 // Copyright 2015 The etcd Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package etcdserver 16 17 import ( 18 "encoding/json" 19 "expvar" 20 "sort" 21 "sync" 22 "sync/atomic" 23 "time" 24 25 pb "github.com/coreos/etcd/etcdserver/etcdserverpb" 26 "github.com/coreos/etcd/etcdserver/membership" 27 "github.com/coreos/etcd/pkg/contention" 28 "github.com/coreos/etcd/pkg/pbutil" 29 "github.com/coreos/etcd/pkg/types" 30 "github.com/coreos/etcd/raft" 31 "github.com/coreos/etcd/raft/raftpb" 32 "github.com/coreos/etcd/rafthttp" 33 "github.com/coreos/etcd/wal" 34 "github.com/coreos/etcd/wal/walpb" 35 "github.com/coreos/pkg/capnslog" 36 ) 37 38 const ( 39 // Number of entries for slow follower to catch-up after compacting 40 // the raft storage entries. 41 // We expect the follower has a millisecond level latency with the leader. 42 // The max throughput is around 10K. Keep a 5K entries is enough for helping 43 // follower to catch up. 44 numberOfCatchUpEntries = 5000 45 46 // The max throughput of etcd will not exceed 100MB/s (100K * 1KB value). 47 // Assuming the RTT is around 10ms, 1MB max size is large enough. 48 maxSizePerMsg = 1 * 1024 * 1024 49 // Never overflow the rafthttp buffer, which is 4096. 50 // TODO: a better const? 51 maxInflightMsgs = 4096 / 8 52 ) 53 54 var ( 55 // protects raftStatus 56 raftStatusMu sync.Mutex 57 // indirection for expvar func interface 58 // expvar panics when publishing duplicate name 59 // expvar does not support remove a registered name 60 // so only register a func that calls raftStatus 61 // and change raftStatus as we need. 62 raftStatus func() raft.Status 63 ) 64 65 func init() { 66 raft.SetLogger(capnslog.NewPackageLogger("github.com/coreos/etcd", "raft")) 67 expvar.Publish("raft.status", expvar.Func(func() interface{} { 68 raftStatusMu.Lock() 69 defer raftStatusMu.Unlock() 70 return raftStatus() 71 })) 72 } 73 74 type RaftTimer interface { 75 Index() uint64 76 Term() uint64 77 } 78 79 // apply contains entries, snapshot to be applied. Once 80 // an apply is consumed, the entries will be persisted to 81 // to raft storage concurrently; the application must read 82 // raftDone before assuming the raft messages are stable. 83 type apply struct { 84 entries []raftpb.Entry 85 snapshot raftpb.Snapshot 86 // notifyc synchronizes etcd server applies with the raft node 87 notifyc chan struct{} 88 } 89 90 type raftNode struct { 91 // Cache of the latest raft index and raft term the server has seen. 92 // These three unit64 fields must be the first elements to keep 64-bit 93 // alignment for atomic access to the fields. 94 index uint64 95 term uint64 96 lead uint64 97 98 tickMu *sync.Mutex 99 raftNodeConfig 100 101 // a chan to send/receive snapshot 102 msgSnapC chan raftpb.Message 103 104 // a chan to send out apply 105 applyc chan apply 106 107 // a chan to send out readState 108 readStateC chan raft.ReadState 109 110 // utility 111 ticker *time.Ticker 112 // contention detectors for raft heartbeat message 113 td *contention.TimeoutDetector 114 115 stopped chan struct{} 116 done chan struct{} 117 } 118 119 type raftNodeConfig struct { 120 // to check if msg receiver is removed from cluster 121 isIDRemoved func(id uint64) bool 122 raft.Node 123 raftStorage *raft.MemoryStorage 124 storage Storage 125 heartbeat time.Duration // for logging 126 // transport specifies the transport to send and receive msgs to members. 127 // Sending messages MUST NOT block. It is okay to drop messages, since 128 // clients should timeout and reissue their messages. 129 // If transport is nil, server will panic. 130 transport rafthttp.Transporter 131 } 132 133 func newRaftNode(cfg raftNodeConfig) *raftNode { 134 r := &raftNode{ 135 tickMu: new(sync.Mutex), 136 raftNodeConfig: cfg, 137 // set up contention detectors for raft heartbeat message. 138 // expect to send a heartbeat within 2 heartbeat intervals. 139 td: contention.NewTimeoutDetector(2 * cfg.heartbeat), 140 readStateC: make(chan raft.ReadState, 1), 141 msgSnapC: make(chan raftpb.Message, maxInFlightMsgSnap), 142 applyc: make(chan apply), 143 stopped: make(chan struct{}), 144 done: make(chan struct{}), 145 } 146 if r.heartbeat == 0 { 147 r.ticker = &time.Ticker{} 148 } else { 149 r.ticker = time.NewTicker(r.heartbeat) 150 } 151 return r 152 } 153 154 // raft.Node does not have locks in Raft package 155 func (r *raftNode) tick() { 156 r.tickMu.Lock() 157 r.Tick() 158 r.tickMu.Unlock() 159 } 160 161 // start prepares and starts raftNode in a new goroutine. It is no longer safe 162 // to modify the fields after it has been started. 163 func (r *raftNode) start(rh *raftReadyHandler) { 164 internalTimeout := time.Second 165 166 go func() { 167 defer r.onStop() 168 islead := false 169 170 for { 171 select { 172 case <-r.ticker.C: 173 r.tick() 174 case rd := <-r.Ready(): 175 if rd.SoftState != nil { 176 newLeader := rd.SoftState.Lead != raft.None && atomic.LoadUint64(&r.lead) != rd.SoftState.Lead 177 if newLeader { 178 leaderChanges.Inc() 179 } 180 181 if rd.SoftState.Lead == raft.None { 182 hasLeader.Set(0) 183 } else { 184 hasLeader.Set(1) 185 } 186 187 atomic.StoreUint64(&r.lead, rd.SoftState.Lead) 188 islead = rd.RaftState == raft.StateLeader 189 if islead { 190 isLeader.Set(1) 191 } else { 192 isLeader.Set(0) 193 } 194 rh.updateLeadership(newLeader) 195 r.td.Reset() 196 } 197 198 if len(rd.ReadStates) != 0 { 199 select { 200 case r.readStateC <- rd.ReadStates[len(rd.ReadStates)-1]: 201 case <-time.After(internalTimeout): 202 plog.Warningf("timed out sending read state") 203 case <-r.stopped: 204 return 205 } 206 } 207 208 notifyc := make(chan struct{}, 1) 209 ap := apply{ 210 entries: rd.CommittedEntries, 211 snapshot: rd.Snapshot, 212 notifyc: notifyc, 213 } 214 215 updateCommittedIndex(&ap, rh) 216 217 select { 218 case r.applyc <- ap: 219 case <-r.stopped: 220 return 221 } 222 223 // the leader can write to its disk in parallel with replicating to the followers and them 224 // writing to their disks. 225 // For more details, check raft thesis 10.2.1 226 if islead { 227 // gofail: var raftBeforeLeaderSend struct{} 228 r.transport.Send(r.processMessages(rd.Messages)) 229 } 230 231 // Must save the snapshot file and WAL snapshot entry before saving any other entries or hardstate to 232 // ensure that recovery after a snapshot restore is possible. 233 if !raft.IsEmptySnap(rd.Snapshot) { 234 // gofail: var raftBeforeSaveSnap struct{} 235 if err := r.storage.SaveSnap(rd.Snapshot); err != nil { 236 plog.Fatalf("failed to save Raft snapshot %v", err) 237 } 238 // gofail: var raftAfterSaveSnap struct{} 239 } 240 241 // gofail: var raftBeforeSave struct{} 242 if err := r.storage.Save(rd.HardState, rd.Entries); err != nil { 243 plog.Fatalf("failed to raft save state and entries %v", err) 244 } 245 if !raft.IsEmptyHardState(rd.HardState) { 246 proposalsCommitted.Set(float64(rd.HardState.Commit)) 247 } 248 // gofail: var raftAfterSave struct{} 249 250 if !raft.IsEmptySnap(rd.Snapshot) { 251 // Force WAL to fsync its hard state before Release() releases 252 // old data from the WAL. Otherwise could get an error like: 253 // panic: tocommit(107) is out of range [lastIndex(84)]. Was the raft log corrupted, truncated, or lost? 254 // See https://github.com/etcd-io/etcd/issues/10219 for more details. 255 if err := r.storage.Sync(); err != nil { 256 plog.Fatalf("failed to sync Raft snapshot %v", err) 257 } 258 259 // etcdserver now claim the snapshot has been persisted onto the disk 260 notifyc <- struct{}{} 261 262 // gofail: var raftAfterSaveSnap struct{} 263 r.raftStorage.ApplySnapshot(rd.Snapshot) 264 plog.Infof("raft applied incoming snapshot at index %d", rd.Snapshot.Metadata.Index) 265 // gofail: var raftAfterApplySnap struct{} 266 267 if err := r.storage.Release(rd.Snapshot); err != nil { 268 plog.Fatalf("failed to release Raft wal %v", err) 269 } 270 } 271 272 r.raftStorage.Append(rd.Entries) 273 274 if !islead { 275 // finish processing incoming messages before we signal raftdone chan 276 msgs := r.processMessages(rd.Messages) 277 278 // now unblocks 'applyAll' that waits on Raft log disk writes before triggering snapshots 279 notifyc <- struct{}{} 280 281 // Candidate or follower needs to wait for all pending configuration 282 // changes to be applied before sending messages. 283 // Otherwise we might incorrectly count votes (e.g. votes from removed members). 284 // Also slow machine's follower raft-layer could proceed to become the leader 285 // on its own single-node cluster, before apply-layer applies the config change. 286 // We simply wait for ALL pending entries to be applied for now. 287 // We might improve this later on if it causes unnecessary long blocking issues. 288 waitApply := false 289 for _, ent := range rd.CommittedEntries { 290 if ent.Type == raftpb.EntryConfChange { 291 waitApply = true 292 break 293 } 294 } 295 if waitApply { 296 // blocks until 'applyAll' calls 'applyWait.Trigger' 297 // to be in sync with scheduled config-change job 298 // (assume notifyc has cap of 1) 299 select { 300 case notifyc <- struct{}{}: 301 case <-r.stopped: 302 return 303 } 304 } 305 306 // gofail: var raftBeforeFollowerSend struct{} 307 r.transport.Send(msgs) 308 } else { 309 // leader already processed 'MsgSnap' and signaled 310 notifyc <- struct{}{} 311 } 312 313 r.Advance() 314 case <-r.stopped: 315 return 316 } 317 } 318 }() 319 } 320 321 func updateCommittedIndex(ap *apply, rh *raftReadyHandler) { 322 var ci uint64 323 if len(ap.entries) != 0 { 324 ci = ap.entries[len(ap.entries)-1].Index 325 } 326 if ap.snapshot.Metadata.Index > ci { 327 ci = ap.snapshot.Metadata.Index 328 } 329 if ci != 0 { 330 rh.updateCommittedIndex(ci) 331 } 332 } 333 334 func (r *raftNode) processMessages(ms []raftpb.Message) []raftpb.Message { 335 sentAppResp := false 336 for i := len(ms) - 1; i >= 0; i-- { 337 if r.isIDRemoved(ms[i].To) { 338 ms[i].To = 0 339 } 340 341 if ms[i].Type == raftpb.MsgAppResp { 342 if sentAppResp { 343 ms[i].To = 0 344 } else { 345 sentAppResp = true 346 } 347 } 348 349 if ms[i].Type == raftpb.MsgSnap { 350 // There are two separate data store: the store for v2, and the KV for v3. 351 // The msgSnap only contains the most recent snapshot of store without KV. 352 // So we need to redirect the msgSnap to etcd server main loop for merging in the 353 // current store snapshot and KV snapshot. 354 select { 355 case r.msgSnapC <- ms[i]: 356 default: 357 // drop msgSnap if the inflight chan if full. 358 } 359 ms[i].To = 0 360 } 361 if ms[i].Type == raftpb.MsgHeartbeat { 362 ok, exceed := r.td.Observe(ms[i].To) 363 if !ok { 364 // TODO: limit request rate. 365 plog.Warningf("failed to send out heartbeat on time (exceeded the %v timeout for %v, to %x)", r.heartbeat, exceed, ms[i].To) 366 plog.Warningf("server is likely overloaded") 367 heartbeatSendFailures.Inc() 368 } 369 } 370 } 371 return ms 372 } 373 374 func (r *raftNode) apply() chan apply { 375 return r.applyc 376 } 377 378 func (r *raftNode) stop() { 379 r.stopped <- struct{}{} 380 <-r.done 381 } 382 383 func (r *raftNode) onStop() { 384 r.Stop() 385 r.ticker.Stop() 386 r.transport.Stop() 387 if err := r.storage.Close(); err != nil { 388 plog.Panicf("raft close storage error: %v", err) 389 } 390 close(r.done) 391 } 392 393 // for testing 394 func (r *raftNode) pauseSending() { 395 p := r.transport.(rafthttp.Pausable) 396 p.Pause() 397 } 398 399 func (r *raftNode) resumeSending() { 400 p := r.transport.(rafthttp.Pausable) 401 p.Resume() 402 } 403 404 // advanceTicks advances ticks of Raft node. 405 // This can be used for fast-forwarding election 406 // ticks in multi data-center deployments, thus 407 // speeding up election process. 408 func (r *raftNode) advanceTicks(ticks int) { 409 for i := 0; i < ticks; i++ { 410 r.tick() 411 } 412 } 413 414 func startNode(cfg ServerConfig, cl *membership.RaftCluster, ids []types.ID) (id types.ID, n raft.Node, s *raft.MemoryStorage, w *wal.WAL) { 415 var err error 416 member := cl.MemberByName(cfg.Name) 417 metadata := pbutil.MustMarshal( 418 &pb.Metadata{ 419 NodeID: uint64(member.ID), 420 ClusterID: uint64(cl.ID()), 421 }, 422 ) 423 if w, err = wal.Create(cfg.WALDir(), metadata); err != nil { 424 plog.Panicf("create wal error: %v", err) 425 } 426 peers := make([]raft.Peer, len(ids)) 427 for i, id := range ids { 428 ctx, err := json.Marshal((*cl).Member(id)) 429 if err != nil { 430 plog.Panicf("marshal member should never fail: %v", err) 431 } 432 peers[i] = raft.Peer{ID: uint64(id), Context: ctx} 433 } 434 id = member.ID 435 plog.Infof("starting member %s in cluster %s", id, cl.ID()) 436 s = raft.NewMemoryStorage() 437 c := &raft.Config{ 438 ID: uint64(id), 439 ElectionTick: cfg.ElectionTicks, 440 HeartbeatTick: 1, 441 Storage: s, 442 MaxSizePerMsg: maxSizePerMsg, 443 MaxInflightMsgs: maxInflightMsgs, 444 CheckQuorum: true, 445 } 446 447 n = raft.StartNode(c, peers) 448 raftStatusMu.Lock() 449 raftStatus = n.Status 450 raftStatusMu.Unlock() 451 return id, n, s, w 452 } 453 454 func restartNode(cfg ServerConfig, snapshot *raftpb.Snapshot) (types.ID, *membership.RaftCluster, raft.Node, *raft.MemoryStorage, *wal.WAL) { 455 var walsnap walpb.Snapshot 456 if snapshot != nil { 457 walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term 458 } 459 w, id, cid, st, ents := readWAL(cfg.WALDir(), walsnap) 460 461 plog.Infof("restarting member %s in cluster %s at commit index %d", id, cid, st.Commit) 462 cl := membership.NewCluster("") 463 cl.SetID(cid) 464 s := raft.NewMemoryStorage() 465 if snapshot != nil { 466 s.ApplySnapshot(*snapshot) 467 } 468 s.SetHardState(st) 469 s.Append(ents) 470 c := &raft.Config{ 471 ID: uint64(id), 472 ElectionTick: cfg.ElectionTicks, 473 HeartbeatTick: 1, 474 Storage: s, 475 MaxSizePerMsg: maxSizePerMsg, 476 MaxInflightMsgs: maxInflightMsgs, 477 CheckQuorum: true, 478 } 479 480 n := raft.RestartNode(c) 481 raftStatusMu.Lock() 482 raftStatus = n.Status 483 raftStatusMu.Unlock() 484 return id, cl, n, s, w 485 } 486 487 func restartAsStandaloneNode(cfg ServerConfig, snapshot *raftpb.Snapshot) (types.ID, *membership.RaftCluster, raft.Node, *raft.MemoryStorage, *wal.WAL) { 488 var walsnap walpb.Snapshot 489 if snapshot != nil { 490 walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term 491 } 492 w, id, cid, st, ents := readWAL(cfg.WALDir(), walsnap) 493 494 // discard the previously uncommitted entries 495 for i, ent := range ents { 496 if ent.Index > st.Commit { 497 plog.Infof("discarding %d uncommitted WAL entries ", len(ents)-i) 498 ents = ents[:i] 499 break 500 } 501 } 502 503 // force append the configuration change entries 504 toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), uint64(id), st.Term, st.Commit) 505 ents = append(ents, toAppEnts...) 506 507 // force commit newly appended entries 508 err := w.Save(raftpb.HardState{}, toAppEnts) 509 if err != nil { 510 plog.Fatalf("%v", err) 511 } 512 if len(ents) != 0 { 513 st.Commit = ents[len(ents)-1].Index 514 } 515 516 plog.Printf("forcing restart of member %s in cluster %s at commit index %d", id, cid, st.Commit) 517 cl := membership.NewCluster("") 518 cl.SetID(cid) 519 s := raft.NewMemoryStorage() 520 if snapshot != nil { 521 s.ApplySnapshot(*snapshot) 522 } 523 s.SetHardState(st) 524 s.Append(ents) 525 c := &raft.Config{ 526 ID: uint64(id), 527 ElectionTick: cfg.ElectionTicks, 528 HeartbeatTick: 1, 529 Storage: s, 530 MaxSizePerMsg: maxSizePerMsg, 531 MaxInflightMsgs: maxInflightMsgs, 532 CheckQuorum: true, 533 } 534 n := raft.RestartNode(c) 535 raftStatus = n.Status 536 return id, cl, n, s, w 537 } 538 539 // getIDs returns an ordered set of IDs included in the given snapshot and 540 // the entries. The given snapshot/entries can contain two kinds of 541 // ID-related entry: 542 // - ConfChangeAddNode, in which case the contained ID will be added into the set. 543 // - ConfChangeRemoveNode, in which case the contained ID will be removed from the set. 544 func getIDs(snap *raftpb.Snapshot, ents []raftpb.Entry) []uint64 { 545 ids := make(map[uint64]bool) 546 if snap != nil { 547 for _, id := range snap.Metadata.ConfState.Nodes { 548 ids[id] = true 549 } 550 } 551 for _, e := range ents { 552 if e.Type != raftpb.EntryConfChange { 553 continue 554 } 555 var cc raftpb.ConfChange 556 pbutil.MustUnmarshal(&cc, e.Data) 557 switch cc.Type { 558 case raftpb.ConfChangeAddNode: 559 ids[cc.NodeID] = true 560 case raftpb.ConfChangeRemoveNode: 561 delete(ids, cc.NodeID) 562 case raftpb.ConfChangeUpdateNode: 563 // do nothing 564 default: 565 plog.Panicf("ConfChange Type should be either ConfChangeAddNode or ConfChangeRemoveNode!") 566 } 567 } 568 sids := make(types.Uint64Slice, 0, len(ids)) 569 for id := range ids { 570 sids = append(sids, id) 571 } 572 sort.Sort(sids) 573 return []uint64(sids) 574 } 575 576 // createConfigChangeEnts creates a series of Raft entries (i.e. 577 // EntryConfChange) to remove the set of given IDs from the cluster. The ID 578 // `self` is _not_ removed, even if present in the set. 579 // If `self` is not inside the given ids, it creates a Raft entry to add a 580 // default member with the given `self`. 581 func createConfigChangeEnts(ids []uint64, self uint64, term, index uint64) []raftpb.Entry { 582 ents := make([]raftpb.Entry, 0) 583 next := index + 1 584 found := false 585 for _, id := range ids { 586 if id == self { 587 found = true 588 continue 589 } 590 cc := &raftpb.ConfChange{ 591 Type: raftpb.ConfChangeRemoveNode, 592 NodeID: id, 593 } 594 e := raftpb.Entry{ 595 Type: raftpb.EntryConfChange, 596 Data: pbutil.MustMarshal(cc), 597 Term: term, 598 Index: next, 599 } 600 ents = append(ents, e) 601 next++ 602 } 603 if !found { 604 m := membership.Member{ 605 ID: types.ID(self), 606 RaftAttributes: membership.RaftAttributes{PeerURLs: []string{"http://localhost:2380"}}, 607 } 608 ctx, err := json.Marshal(m) 609 if err != nil { 610 plog.Panicf("marshal member should never fail: %v", err) 611 } 612 cc := &raftpb.ConfChange{ 613 Type: raftpb.ConfChangeAddNode, 614 NodeID: self, 615 Context: ctx, 616 } 617 e := raftpb.Entry{ 618 Type: raftpb.EntryConfChange, 619 Data: pbutil.MustMarshal(cc), 620 Term: term, 621 Index: next, 622 } 623 ents = append(ents, e) 624 } 625 return ents 626 }