go.etcd.io/etcd@v3.3.27+incompatible/etcdserver/server.go

// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package etcdserver

import (
	"context"
	"encoding/json"
	"expvar"
	"fmt"
	"math"
	"math/rand"
	"net/http"
	"os"
	"path"
	"regexp"
	"sync"
	"sync/atomic"
	"time"

	"github.com/coreos/etcd/alarm"
	"github.com/coreos/etcd/auth"
	"github.com/coreos/etcd/compactor"
	"github.com/coreos/etcd/discovery"
	"github.com/coreos/etcd/etcdserver/api"
	"github.com/coreos/etcd/etcdserver/api/v2http/httptypes"
	pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
	"github.com/coreos/etcd/etcdserver/membership"
	"github.com/coreos/etcd/etcdserver/stats"
	"github.com/coreos/etcd/lease"
	"github.com/coreos/etcd/lease/leasehttp"
	"github.com/coreos/etcd/mvcc"
	"github.com/coreos/etcd/mvcc/backend"
	"github.com/coreos/etcd/pkg/fileutil"
	"github.com/coreos/etcd/pkg/idutil"
	"github.com/coreos/etcd/pkg/pbutil"
	"github.com/coreos/etcd/pkg/runtime"
	"github.com/coreos/etcd/pkg/schedule"
	"github.com/coreos/etcd/pkg/types"
	"github.com/coreos/etcd/pkg/wait"
	"github.com/coreos/etcd/raft"
	"github.com/coreos/etcd/raft/raftpb"
	"github.com/coreos/etcd/rafthttp"
	"github.com/coreos/etcd/snap"
	"github.com/coreos/etcd/store"
	"github.com/coreos/etcd/version"
	"github.com/coreos/etcd/wal"

	"github.com/coreos/go-semver/semver"
	"github.com/coreos/pkg/capnslog"
	"github.com/prometheus/client_golang/prometheus"
)

const (
	DefaultSnapCount = 100000

	StoreClusterPrefix = "/0"
	StoreKeysPrefix    = "/1"

	// HealthInterval is the minimum time the cluster should be healthy
	// before accepting add member requests.
	HealthInterval = 5 * time.Second

	purgeFileInterval = 30 * time.Second
	// monitorVersionInterval should be smaller than the timeout
	// on the connection. Otherwise we will not be able to reuse the
	// connection (since it will time out).
	monitorVersionInterval = rafthttp.ConnWriteTimeout - time.Second

	// max number of in-flight snapshot messages etcdserver allows to have.
	// This number is more than enough for most clusters with 5 machines.
	maxInFlightMsgSnap = 16

	releaseDelayAfterSnapshot = 30 * time.Second

	// maxPendingRevokes is the maximum number of outstanding expired lease revocations.
	maxPendingRevokes = 16

	recommendedMaxRequestBytes = 10 * 1024 * 1024
)

var (
	plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "etcdserver")

	storeMemberAttributeRegexp = regexp.MustCompile(path.Join(membership.StoreMembersPrefix, "[[:xdigit:]]{1,16}", "attributes"))
)

func init() {
	rand.Seed(time.Now().UnixNano())

	expvar.Publish(
		"file_descriptor_limit",
		expvar.Func(
			func() interface{} {
				n, _ := runtime.FDLimit()
				return n
			},
		),
	)
}

type Response struct {
	Term    uint64
	Index   uint64
	Event   *store.Event
	Watcher store.Watcher
	Err     error
}

type ServerV2 interface {
	Server
	// Do takes a V2 request and attempts to fulfill it, returning a Response.
	Do(ctx context.Context, r pb.Request) (Response, error)
	stats.Stats
	ClientCertAuthEnabled() bool
}

type ServerV3 interface {
	Server
	ID() types.ID
	RaftTimer
}

func (s *EtcdServer) ClientCertAuthEnabled() bool { return s.Cfg.ClientCertAuthEnabled }

type Server interface {
	// Leader returns the ID of the leader Server.
	Leader() types.ID

	// AddMember attempts to add a member into the cluster. It will return
	// ErrIDRemoved if the member ID has been removed from the cluster, or
	// ErrIDExists if the member ID already exists in the cluster.
	AddMember(ctx context.Context, memb membership.Member) ([]*membership.Member, error)
	// RemoveMember attempts to remove a member from the cluster. It will
	// return ErrIDRemoved if the member ID has been removed from the cluster,
	// or ErrIDNotFound if the member ID is not in the cluster.
	RemoveMember(ctx context.Context, id uint64) ([]*membership.Member, error)
	// UpdateMember attempts to update an existing member in the cluster. It will
	// return ErrIDNotFound if the member ID does not exist.
	UpdateMember(ctx context.Context, updateMemb membership.Member) ([]*membership.Member, error)

	// ClusterVersion is the cluster-wide minimum major.minor version.
	// The cluster version is set to the minimum version that an etcd member
	// is compatible with when it first bootstraps.
	//
	// ClusterVersion is nil until the cluster is bootstrapped (has a quorum).
	//
	// During a rolling upgrade, the ClusterVersion will be updated
	// automatically after a sync (5 seconds by default).
	//
	// The API/raft component can utilize ClusterVersion to determine if
	// it can accept a client request or a raft RPC.
	// NOTE: ClusterVersion might be nil when etcd 2.1 works with etcd 2.0 and
	// the leader is etcd 2.0. The etcd 2.0 leader will not update the cluster
	// version, since this feature was introduced post 2.0.
	ClusterVersion() *semver.Version
	Cluster() api.Cluster
	Alarms() []*pb.AlarmMember
}

// EtcdServer is the production implementation of the Server interface
type EtcdServer struct {
	// inflightSnapshots holds the number of snapshots currently in flight.
	inflightSnapshots int64  // must use atomic operations to access; keep 64-bit aligned.
	appliedIndex      uint64 // must use atomic operations to access; keep 64-bit aligned.
	committedIndex    uint64 // must use atomic operations to access; keep 64-bit aligned.
	// consistIndex is used to hold the offset of the currently executing entry.
	// It is initialized to 0 before executing any entry.
	consistIndex consistentIndex // must use atomic operations to access; keep 64-bit aligned.
	r raftNode // uses 64-bit atomics; keep 64-bit aligned.

	readych chan struct{}
	Cfg     ServerConfig

	w wait.Wait

	readMu sync.RWMutex
	// read routine notifies etcd server that it waits for reading by sending
	// an empty struct to readwaitc
	readwaitc chan struct{}
	// readNotifier is used to notify the read routine that it can process the
	// request when there is no error
	readNotifier *notifier

	// stop signals the run goroutine should shut down.
	stop chan struct{}
	// stopping is closed by the run goroutine on shutdown.
	stopping chan struct{}
	// done is closed when all goroutines from start() complete.
	done            chan struct{}
	leaderChanged   chan struct{}
	leaderChangedMu sync.RWMutex

	errorc     chan error
	id         types.ID
	attributes membership.Attributes

	cluster *membership.RaftCluster

	store       store.Store
	snapshotter *snap.Snapshotter

	applyV2 ApplierV2

	// applyV3 is the applier with auth and quotas
	applyV3 applierV3
	// applyV3Base is the core applier without auth or quotas
	applyV3Base applierV3
	applyWait   wait.WaitTime

	kv         mvcc.ConsistentWatchableKV
	lessor     lease.Lessor
	bemu       sync.Mutex
	be         backend.Backend
	authStore  auth.AuthStore
	alarmStore *alarm.AlarmStore

	stats  *stats.ServerStats
	lstats *stats.LeaderStats

	SyncTicker *time.Ticker
	// compactor is used to auto-compact the KV.
	compactor compactor.Compactor

	// peerRt is used to send requests (version, lease) to peers.
	peerRt   http.RoundTripper
	reqIDGen *idutil.Generator

	// forceVersionC is used to force the version monitor loop
	// to detect the cluster version immediately.
	forceVersionC chan struct{}

	// wgMu blocks concurrent waitgroup mutation while the server is stopping.
	wgMu sync.RWMutex
	// wg is used to wait for the goroutines that depend on the server state
	// to exit when stopping the server.
	wg sync.WaitGroup

	// ctx is used for etcd-initiated requests that may need to be canceled
	// on etcd server shutdown.
	ctx    context.Context
	cancel context.CancelFunc

	leadTimeMu      sync.RWMutex
	leadElectedTime time.Time
}
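
// The atomic fields at the head of EtcdServer (inflightSnapshots,
// appliedIndex, committedIndex, consistIndex, and the atomics inside r)
// are declared first to keep them 64-bit aligned: on 32-bit platforms,
// sync/atomic's 64-bit operations panic on unaligned addresses, and only
// the first word of an allocated struct is guaranteed to be aligned.
// A minimal illustration of the same pattern (hypothetical type, not part
// of this package):
//
//	type progress struct {
//		applied uint64 // first field: safe for atomic.AddUint64 on 386/ARM
//		mu      sync.Mutex
//	}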

// NewServer creates a new EtcdServer from the supplied configuration. The
// configuration is considered static for the lifetime of the EtcdServer.
func NewServer(cfg ServerConfig) (srv *EtcdServer, err error) {
	st := store.New(StoreClusterPrefix, StoreKeysPrefix)

	var (
		w  *wal.WAL
		n  raft.Node
		s  *raft.MemoryStorage
		id types.ID
		cl *membership.RaftCluster
	)

	if cfg.MaxRequestBytes > recommendedMaxRequestBytes {
		plog.Warningf("MaxRequestBytes %v exceeds maximum recommended size %v", cfg.MaxRequestBytes, recommendedMaxRequestBytes)
	}

	if terr := fileutil.TouchDirAll(cfg.DataDir); terr != nil {
		return nil, fmt.Errorf("cannot access data directory: %v", terr)
	}

	haveWAL := wal.Exist(cfg.WALDir())

	if err = fileutil.TouchDirAll(cfg.SnapDir()); err != nil {
		plog.Fatalf("create snapshot directory error: %v", err)
	}
	ss := snap.New(cfg.SnapDir())

	bepath := cfg.backendPath()
	beExist := fileutil.Exist(bepath)
	be := openBackend(cfg)

	defer func() {
		if err != nil {
			be.Close()
		}
	}()

	prt, err := rafthttp.NewRoundTripper(cfg.PeerTLSInfo, cfg.peerDialTimeout())
	if err != nil {
		return nil, err
	}
	var (
		remotes  []*membership.Member
		snapshot *raftpb.Snapshot
	)

	switch {
	case !haveWAL && !cfg.NewCluster:
		if err = cfg.VerifyJoinExisting(); err != nil {
			return nil, err
		}
		cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap)
		if err != nil {
			return nil, err
		}
		existingCluster, gerr := GetClusterFromRemotePeers(getRemotePeerURLs(cl, cfg.Name), prt)
		if gerr != nil {
			return nil, fmt.Errorf("cannot fetch cluster info from peer urls: %v", gerr)
		}
		if err = membership.ValidateClusterAndAssignIDs(cl, existingCluster); err != nil {
			return nil, fmt.Errorf("error validating peerURLs %s: %v", existingCluster, err)
		}
		if !isCompatibleWithCluster(cl, cl.MemberByName(cfg.Name).ID, prt) {
			return nil, fmt.Errorf("incompatible with current running cluster")
		}

		remotes = existingCluster.Members()
		cl.SetID(existingCluster.ID())
		cl.SetStore(st)
		cl.SetBackend(be)
		cfg.Print()
		id, n, s, w = startNode(cfg, cl, nil)
	case !haveWAL && cfg.NewCluster:
		if err = cfg.VerifyBootstrap(); err != nil {
			return nil, err
		}
		cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap)
		if err != nil {
			return nil, err
		}
		m := cl.MemberByName(cfg.Name)
		if isMemberBootstrapped(cl, cfg.Name, prt, cfg.bootstrapTimeout()) {
			return nil, fmt.Errorf("member %s has already been bootstrapped", m.ID)
		}
		if cfg.ShouldDiscover() {
			var str string
			str, err = discovery.JoinCluster(cfg.DiscoveryURL, cfg.DiscoveryProxy, m.ID, cfg.InitialPeerURLsMap.String())
			if err != nil {
				return nil, &DiscoveryError{Op: "join", Err: err}
			}
			var urlsmap types.URLsMap
			urlsmap, err = types.NewURLsMap(str)
			if err != nil {
				return nil, err
			}
			if checkDuplicateURL(urlsmap) {
				return nil, fmt.Errorf("discovery cluster %s has duplicate url", urlsmap)
			}
			if cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, urlsmap); err != nil {
				return nil, err
			}
		}
		cl.SetStore(st)
		cl.SetBackend(be)
		cfg.PrintWithInitial()
		id, n, s, w = startNode(cfg, cl, cl.MemberIDs())
	case haveWAL:
		if err = fileutil.IsDirWriteable(cfg.MemberDir()); err != nil {
			return nil, fmt.Errorf("cannot write to member directory: %v", err)
		}

		if err = fileutil.IsDirWriteable(cfg.WALDir()); err != nil {
			return nil, fmt.Errorf("cannot write to WAL directory: %v", err)
		}

		if cfg.ShouldDiscover() {
			plog.Warningf("discovery token ignored since a cluster has already been initialized. Valid log found at %q", cfg.WALDir())
		}

		// Find a snapshot to start/restart a raft node
		walSnaps, serr := wal.ValidSnapshotEntries(cfg.WALDir())
		if serr != nil {
			return nil, serr
		}
		// snapshot files can be orphaned if etcd crashes after writing them but
		// before writing the corresponding wal log entries
		snapshot, err = ss.LoadNewestAvailable(walSnaps)
		if err != nil && err != snap.ErrNoSnapshot {
			return nil, err
		}
		if snapshot != nil {
			if err = st.Recovery(snapshot.Data); err != nil {
				plog.Panicf("recovered store from snapshot error: %v", err)
			}
			plog.Infof("recovered store from snapshot at index %d", snapshot.Metadata.Index)
			if be, err = recoverSnapshotBackend(cfg, be, *snapshot); err != nil {
				plog.Panicf("recovering backend from snapshot error: %v", err)
			}
		}
		cfg.Print()
		if !cfg.ForceNewCluster {
			id, cl, n, s, w = restartNode(cfg, snapshot)
		} else {
			id, cl, n, s, w = restartAsStandaloneNode(cfg, snapshot)
		}
		cl.SetStore(st)
		cl.SetBackend(be)
		cl.Recover(api.UpdateCapability)
		if cl.Version() != nil && !cl.Version().LessThan(semver.Version{Major: 3}) && !beExist {
			os.RemoveAll(bepath)
			return nil, fmt.Errorf("database file (%v) of the backend is missing", bepath)
		}
	default:
		return nil, fmt.Errorf("unsupported bootstrap config")
	}

	if terr := fileutil.TouchDirAll(cfg.MemberDir()); terr != nil {
		return nil, fmt.Errorf("cannot access member directory: %v", terr)
	}

	sstats := stats.NewServerStats(cfg.Name, id.String())
	lstats := stats.NewLeaderStats(id.String())

	heartbeat := time.Duration(cfg.TickMs) * time.Millisecond
	srv = &EtcdServer{
		readych:     make(chan struct{}),
		Cfg:         cfg,
		errorc:      make(chan error, 1),
		store:       st,
		snapshotter: ss,
		r: *newRaftNode(
			raftNodeConfig{
				isIDRemoved: func(id uint64) bool { return cl.IsIDRemoved(types.ID(id)) },
				Node:        n,
				heartbeat:   heartbeat,
				raftStorage: s,
				storage:     NewStorage(w, ss),
			},
		),
		id:            id,
		attributes:    membership.Attributes{Name: cfg.Name, ClientURLs: cfg.ClientURLs.StringSlice()},
		cluster:       cl,
		stats:         sstats,
		lstats:        lstats,
		SyncTicker:    time.NewTicker(500 * time.Millisecond),
		peerRt:        prt,
		reqIDGen:      idutil.NewGenerator(uint16(id), time.Now()),
		forceVersionC: make(chan struct{}),
	}
	serverID.With(prometheus.Labels{"server_id": id.String()}).Set(1)

	srv.applyV2 = &applierV2store{store: srv.store, cluster: srv.cluster}

	srv.be = be
	minTTL := time.Duration((3*cfg.ElectionTicks)/2) * heartbeat
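
	// Worked example: with the default TickMs=100 and ElectionTicks=10, the
	// heartbeat is 100ms and minTTL = (3*10/2)*100ms = 1.5s, 1.5x the election
	// timeout. The lessor below treats this as the minimum lease TTL, so a
	// lease should not expire before a new leader has had a chance to be
	// elected and promote the lessor.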

	// always recover lessor before kv. When we recover the mvcc.KV it will reattach keys to its leases.
	// If we recover mvcc.KV first, it will attach the keys to the wrong lessor before it recovers.
	srv.lessor = lease.NewLessor(srv.be, int64(math.Ceil(minTTL.Seconds())))

	tp, err := auth.NewTokenProvider(cfg.AuthToken,
		func(index uint64) <-chan struct{} {
			return srv.applyWait.Wait(index)
		},
		time.Duration(cfg.TokenTTL)*time.Second,
	)
	if err != nil {
		plog.Warningf("failed to create token provider: %v", err)
		return nil, err
	}
	srv.authStore = auth.NewAuthStore(srv.be, tp)

	srv.kv = mvcc.New(srv.be, srv.lessor, srv.authStore, &srv.consistIndex)
	if beExist {
		kvindex := srv.kv.ConsistentIndex()
		// TODO: remove the kvindex != 0 check when we no longer expect users
		// to upgrade etcd from a pre-3.0 release.
		if snapshot != nil && kvindex < snapshot.Metadata.Index {
			if kvindex != 0 {
				return nil, fmt.Errorf("database file (%v index %d) does not match with snapshot (index %d)", bepath, kvindex, snapshot.Metadata.Index)
			}
			plog.Warningf("consistent index never saved (snapshot index=%d)", snapshot.Metadata.Index)
		}
	}
	newSrv := srv // since srv == nil in defer if srv is returned as nil
	defer func() {
		// closing backend without first closing kv can cause
		// resumed compactions to fail with closed tx errors
		if err != nil {
			newSrv.kv.Close()
		}
	}()

	srv.consistIndex.setConsistentIndex(srv.kv.ConsistentIndex())
	if num := cfg.AutoCompactionRetention; num != 0 {
		srv.compactor, err = compactor.New(cfg.AutoCompactionMode, num, srv.kv, srv)
		if err != nil {
			return nil, err
		}
		srv.compactor.Run()
	}

	srv.applyV3Base = srv.newApplierV3Backend()
	if err = srv.restoreAlarms(); err != nil {
		return nil, err
	}

	// TODO: move transport initialization near the definition of remote
	tr := &rafthttp.Transport{
		TLSInfo:     cfg.PeerTLSInfo,
		DialTimeout: cfg.peerDialTimeout(),
		ID:          id,
		URLs:        cfg.PeerURLs,
		ClusterID:   cl.ID(),
		Raft:        srv,
		Snapshotter: ss,
		ServerStats: sstats,
		LeaderStats: lstats,
		ErrorC:      srv.errorc,
	}
	if err = tr.Start(); err != nil {
		return nil, err
	}
	// add all remotes into transport
	for _, m := range remotes {
		if m.ID != id {
			tr.AddRemote(m.ID, m.PeerURLs)
		}
	}
	for _, m := range cl.Members() {
		if m.ID != id {
			tr.AddPeer(m.ID, m.PeerURLs)
		}
	}
	srv.r.transport = tr

	return srv, nil
}
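
// A typical embedding flow, as a sketch (error handling and ServerConfig
// population elided; see Start, ReadyNotify and Stop below):
//
//	srv, err := etcdserver.NewServer(cfg)
//	if err != nil {
//		// handle the bootstrap error
//	}
//	srv.Start()
//	<-srv.ReadyNotify() // closed once this member has published its attributes
//	defer srv.Stop()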

func (s *EtcdServer) adjustTicks() {
	clusterN := len(s.cluster.Members())

	// single-node fresh start, or single-node recovers from snapshot
	if clusterN == 1 {
		ticks := s.Cfg.ElectionTicks - 1
		plog.Infof("%s as single-node; fast-forwarding %d ticks (election ticks %d)", s.ID(), ticks, s.Cfg.ElectionTicks)
		s.r.advanceTicks(ticks)
		return
	}

	if !s.Cfg.InitialElectionTickAdvance {
		plog.Infof("skipping initial election tick advance (election tick %d)", s.Cfg.ElectionTicks)
		return
	}

	// retry up to "rafthttp.ConnReadTimeout" (5 sec) until a peer connection
	// reports; otherwise:
	// 1. all connections failed, or
	// 2. no active peers, or
	// 3. restarted single-node with no snapshot
	// then do nothing, because advancing ticks would have no effect
	waitTime := rafthttp.ConnReadTimeout
	itv := 50 * time.Millisecond
	for i := int64(0); i < int64(waitTime/itv); i++ {
		select {
		case <-time.After(itv):
		case <-s.stopping:
			return
		}

		peerN := s.r.transport.ActivePeers()
		if peerN > 1 {
			// multi-node received peer connection reports;
			// adjust ticks in case leader messages arrive slowly
			ticks := s.Cfg.ElectionTicks - 2
			plog.Infof("%s initialized peer connection; fast-forwarding %d ticks (election ticks %d) with %d active peer(s)", s.ID(), ticks, s.Cfg.ElectionTicks, peerN)
			s.r.advanceTicks(ticks)
			return
		}
	}
}
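
// Worked example for adjustTicks above: with ElectionTicks=10 and TickMs=100,
// a fresh single-node server advances 9 ticks, so its first election fires
// after the one remaining tick (~100ms) instead of a full 1s election
// timeout; a multi-node server that sees active peers advances 8 ticks,
// leaving two ticks of slack for slow leader heartbeats.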

// Start performs any initialization of the Server necessary for it to
// begin serving requests. It must be called before Do or Process.
// Start must be non-blocking; any long-running server functionality
// should be implemented in goroutines.
func (s *EtcdServer) Start() {
	s.start()
	s.goAttach(func() { s.adjustTicks() })
	s.goAttach(func() { s.publish(s.Cfg.ReqTimeout()) })
	s.goAttach(s.purgeFile)
	s.goAttach(func() { monitorFileDescriptor(s.stopping) })
	s.goAttach(s.monitorVersions)
	s.goAttach(s.linearizableReadLoop)
	s.goAttach(s.monitorKVHash)
}

// start prepares and starts the server in a new goroutine. It is no longer
// safe to modify a server's fields after it has been sent to Start.
// This function is just used for testing.
func (s *EtcdServer) start() {
	if s.Cfg.SnapCount == 0 {
		plog.Infof("set snapshot count to default %d", DefaultSnapCount)
		s.Cfg.SnapCount = DefaultSnapCount
	}
	s.w = wait.New()
	s.applyWait = wait.NewTimeList()
	s.done = make(chan struct{})
	s.stop = make(chan struct{})
	s.stopping = make(chan struct{})
	s.ctx, s.cancel = context.WithCancel(context.Background())
	s.readwaitc = make(chan struct{}, 1)
	s.readNotifier = newNotifier()
	s.leaderChanged = make(chan struct{})
	if s.ClusterVersion() != nil {
		plog.Infof("starting server... [version: %v, cluster version: %v]", version.Version, version.Cluster(s.ClusterVersion().String()))
		membership.ClusterVersionMetrics.With(prometheus.Labels{"cluster_version": version.Cluster(s.ClusterVersion().String())}).Set(1)
	} else {
		plog.Infof("starting server... [version: %v, cluster version: to_be_decided]", version.Version)
	}
	// TODO: if this is an empty log, write all peer info
	// into the first entry
	go s.run()
}

func (s *EtcdServer) purgeFile() {
	var dberrc, serrc, werrc <-chan error
	var dbdonec, sdonec, wdonec <-chan struct{}
	if s.Cfg.MaxSnapFiles > 0 {
		dbdonec, dberrc = fileutil.PurgeFileWithDoneNotify(s.Cfg.SnapDir(), "snap.db", s.Cfg.MaxSnapFiles, purgeFileInterval, s.stopping)
		sdonec, serrc = fileutil.PurgeFileWithDoneNotify(s.Cfg.SnapDir(), "snap", s.Cfg.MaxSnapFiles, purgeFileInterval, s.stopping)
	}
	if s.Cfg.MaxWALFiles > 0 {
		wdonec, werrc = fileutil.PurgeFileWithDoneNotify(s.Cfg.WALDir(), "wal", s.Cfg.MaxWALFiles, purgeFileInterval, s.stopping)
	}
	select {
	case e := <-dberrc:
		plog.Fatalf("failed to purge snap db file %v", e)
	case e := <-serrc:
		plog.Fatalf("failed to purge snap file %v", e)
	case e := <-werrc:
		plog.Fatalf("failed to purge wal file %v", e)
	case <-s.stopping:
		if dbdonec != nil {
			<-dbdonec
		}
		if sdonec != nil {
			<-sdonec
		}
		if wdonec != nil {
			<-wdonec
		}
		return
	}
}

func (s *EtcdServer) ID() types.ID { return s.id }

func (s *EtcdServer) Cluster() api.Cluster { return s.cluster }

func (s *EtcdServer) ApplyWait() <-chan struct{} { return s.applyWait.Wait(s.getCommittedIndex()) }

type ServerPeer interface {
	ServerV2
	RaftHandler() http.Handler
	LeaseHandler() http.Handler
}

func (s *EtcdServer) LeaseHandler() http.Handler {
	if s.lessor == nil {
		return nil
	}
	return leasehttp.NewHandler(s.lessor, s.ApplyWait)
}

func (s *EtcdServer) RaftHandler() http.Handler { return s.r.transport.Handler() }
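
// The following four methods (Process, IsIDRemoved, ReportUnreachable,
// ReportSnapshot) make EtcdServer satisfy the rafthttp.Raft interface;
// the peer transport delivers incoming raft traffic through them:
//
//	type Raft interface {
//		Process(ctx context.Context, m raftpb.Message) error
//		IsIDRemoved(id uint64) bool
//		ReportUnreachable(id uint64)
//		ReportSnapshot(id uint64, status raft.SnapshotStatus)
//	}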

// Process takes a raft message and applies it to the server's raft state
// machine, respecting any timeout of the given context.
func (s *EtcdServer) Process(ctx context.Context, m raftpb.Message) error {
	if s.cluster.IsIDRemoved(types.ID(m.From)) {
		plog.Warningf("reject message from removed member %s", types.ID(m.From).String())
		return httptypes.NewHTTPError(http.StatusForbidden, "cannot process message from removed member")
	}
	if m.Type == raftpb.MsgApp {
		s.stats.RecvAppendReq(types.ID(m.From).String(), m.Size())
	}
	return s.r.Step(ctx, m)
}

func (s *EtcdServer) IsIDRemoved(id uint64) bool { return s.cluster.IsIDRemoved(types.ID(id)) }

func (s *EtcdServer) ReportUnreachable(id uint64) { s.r.ReportUnreachable(id) }

// ReportSnapshot reports snapshot sent status to the raft state machine,
// and clears the used snapshot from the snapshot store.
func (s *EtcdServer) ReportSnapshot(id uint64, status raft.SnapshotStatus) {
	s.r.ReportSnapshot(id, status)
}

type etcdProgress struct {
	confState raftpb.ConfState
	snapi     uint64
	appliedt  uint64
	appliedi  uint64
}

// raftReadyHandler contains a set of EtcdServer operations to be called by raftNode,
// and helps decouple state machine logic from Raft algorithms.
// TODO: add a state machine interface to apply the commit entries and do snapshot/recover
type raftReadyHandler struct {
	updateLeadership     func(newLeader bool)
	updateCommittedIndex func(uint64)
}

func (s *EtcdServer) run() {
	sn, err := s.r.raftStorage.Snapshot()
	if err != nil {
		plog.Panicf("get snapshot from raft storage error: %v", err)
	}

	// asynchronously accept apply packets, dispatch progress in-order
	sched := schedule.NewFIFOScheduler()

	var (
		smu   sync.RWMutex
		syncC <-chan time.Time
	)
	setSyncC := func(ch <-chan time.Time) {
		smu.Lock()
		syncC = ch
		smu.Unlock()
	}
	getSyncC := func() (ch <-chan time.Time) {
		smu.RLock()
		ch = syncC
		smu.RUnlock()
		return
	}
	rh := &raftReadyHandler{
		updateLeadership: func(newLeader bool) {
			if !s.isLeader() {
				if s.lessor != nil {
					s.lessor.Demote()
				}
				if s.compactor != nil {
					s.compactor.Pause()
				}
				setSyncC(nil)
			} else {
				if newLeader {
					t := time.Now()
					s.leadTimeMu.Lock()
					s.leadElectedTime = t
					s.leadTimeMu.Unlock()
				}
				setSyncC(s.SyncTicker.C)
				if s.compactor != nil {
					s.compactor.Resume()
				}
			}
			if newLeader {
				select {
				case s.leaderChanged <- struct{}{}:
				default:
				}
				s.leaderChangedMu.Lock()
				lc := s.leaderChanged
				s.leaderChanged = make(chan struct{})
				s.leaderChangedMu.Unlock()
				close(lc)
			}

			// TODO: remove the nil checking
			// current test utility does not provide the stats
			if s.stats != nil {
				s.stats.BecomeLeader()
			}
		},
		updateCommittedIndex: func(ci uint64) {
			cci := s.getCommittedIndex()
			if ci > cci {
				s.setCommittedIndex(ci)
			}
		},
	}
	s.r.start(rh)

	ep := etcdProgress{
		confState: sn.Metadata.ConfState,
		snapi:     sn.Metadata.Index,
		appliedt:  sn.Metadata.Term,
		appliedi:  sn.Metadata.Index,
	}

	defer func() {
		s.wgMu.Lock() // block concurrent waitgroup adds in goAttach while stopping
		close(s.stopping)
		s.wgMu.Unlock()
		s.cancel()

		sched.Stop()

		// wait for goroutines before closing raft so wal stays open
		s.wg.Wait()

		s.SyncTicker.Stop()

		// must stop raft after scheduler; etcdserver can leak rafthttp pipelines
		// by adding a peer after raft stops the transport
		s.r.stop()

		// kv, lessor and backend can be nil if running without v3 enabled
		// or running unit tests.
		if s.lessor != nil {
			s.lessor.Stop()
		}
		if s.kv != nil {
			s.kv.Close()
		}
		if s.authStore != nil {
			s.authStore.Close()
		}
		if s.be != nil {
			s.be.Close()
		}
		if s.compactor != nil {
			s.compactor.Stop()
		}
		close(s.done)
	}()

	var expiredLeaseC <-chan []*lease.Lease
	if s.lessor != nil {
		expiredLeaseC = s.lessor.ExpiredLeasesC()
	}

	for {
		select {
		case ap := <-s.r.apply():
			f := func(context.Context) { s.applyAll(&ep, &ap) }
			sched.Schedule(f)
		case leases := <-expiredLeaseC:
			s.goAttach(func() {
				// increase throughput of the expired-lease deletion process
				// through parallelization
				c := make(chan struct{}, maxPendingRevokes)
				for _, lease := range leases {
					select {
					case c <- struct{}{}:
					case <-s.stopping:
						return
					}
					lid := lease.ID
					s.goAttach(func() {
						ctx := s.authStore.WithRoot(s.ctx)
						_, lerr := s.LeaseRevoke(ctx, &pb.LeaseRevokeRequest{ID: int64(lid)})
						if lerr == nil {
							leaseExpired.Inc()
						} else {
							plog.Warningf("failed to revoke %016x (%q)", lid, lerr.Error())
						}

						<-c
					})
				}
			})
		case err := <-s.errorc:
			plog.Errorf("%s", err)
			plog.Infof("the data-dir used by this member must be removed.")
			return
		case <-getSyncC():
			if s.store.HasTTLKeys() {
				s.sync(s.Cfg.ReqTimeout())
			}
		case <-s.stop:
			return
		}
	}
}
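
// The expired-lease branch above bounds revocation fan-out with a buffered
// channel used as a counting semaphore: a send acquires one of the
// maxPendingRevokes slots, and each revoke goroutine frees its slot when it
// finishes. The same pattern in isolation (Job and process are hypothetical
// stand-ins):
//
//	sem := make(chan struct{}, maxPendingRevokes)
//	for _, job := range jobs {
//		sem <- struct{}{} // blocks while all slots are taken
//		go func(j Job) {
//			defer func() { <-sem }()
//			process(j)
//		}(job)
//	}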

func (s *EtcdServer) leaderChangedNotify() <-chan struct{} {
	s.leaderChangedMu.RLock()
	defer s.leaderChangedMu.RUnlock()
	return s.leaderChanged
}

func (s *EtcdServer) applyAll(ep *etcdProgress, apply *apply) {
	s.applySnapshot(ep, apply)
	s.applyEntries(ep, apply)

	proposalsApplied.Set(float64(ep.appliedi))
	s.applyWait.Trigger(ep.appliedi)
	// wait for the raft routine to finish the disk writes before triggering a
	// snapshot, or the applied index might be greater than the last index in
	// raft storage, since the raft routine might be slower than the apply routine.
	<-apply.notifyc

	s.triggerSnapshot(ep)
	select {
	// snapshot requested via send()
	case m := <-s.r.msgSnapC:
		merged := s.createMergedSnapshotMessage(m, ep.appliedt, ep.appliedi, ep.confState)
		s.sendMergedSnap(merged)
	default:
	}
}

func (s *EtcdServer) applySnapshot(ep *etcdProgress, apply *apply) {
	if raft.IsEmptySnap(apply.snapshot) {
		return
	}
	applySnapshotInProgress.Inc()
	plog.Infof("applying snapshot at index %d...", ep.snapi)
	defer func() {
		plog.Infof("finished applying incoming snapshot at index %d", ep.snapi)
		applySnapshotInProgress.Dec()
	}()

	if apply.snapshot.Metadata.Index <= ep.appliedi {
		plog.Panicf("snapshot index [%d] should be greater than applied index [%d]",
			apply.snapshot.Metadata.Index, ep.appliedi)
	}

	// wait for raftNode to persist snapshot onto the disk
	<-apply.notifyc

	newbe, err := openSnapshotBackend(s.Cfg, s.snapshotter, apply.snapshot)
	if err != nil {
		plog.Panic(err)
	}

	// always recover lessor before kv. When we recover the mvcc.KV it will reattach keys to its leases.
	// If we recover mvcc.KV first, it will attach the keys to the wrong lessor before it recovers.
	if s.lessor != nil {
		plog.Info("recovering lessor...")
		s.lessor.Recover(newbe, func() lease.TxnDelete { return s.kv.Write() })
		plog.Info("finished recovering lessor")
	}

	plog.Info("restoring mvcc store...")

	if err := s.kv.Restore(newbe); err != nil {
		plog.Panicf("restore KV error: %v", err)
	}
	s.consistIndex.setConsistentIndex(s.kv.ConsistentIndex())

	plog.Info("finished restoring mvcc store")

	// Closing the old backend might block until all the txns
	// on the backend are finished.
	// We do not want to wait on closing the old backend.
	s.bemu.Lock()
	oldbe := s.be
	go func() {
		plog.Info("closing old backend...")
		defer plog.Info("finished closing old backend")

		if err := oldbe.Close(); err != nil {
			plog.Panicf("close backend error: %v", err)
		}
	}()

	s.be = newbe
	s.bemu.Unlock()

	plog.Info("recovering alarms...")
	if err := s.restoreAlarms(); err != nil {
		plog.Panicf("restore alarms error: %v", err)
	}
	plog.Info("finished recovering alarms")

	if s.authStore != nil {
		plog.Info("recovering auth store...")
		s.authStore.Recover(newbe)
		plog.Info("finished recovering auth store")
	}

	plog.Info("recovering store v2...")
	if err := s.store.Recovery(apply.snapshot.Data); err != nil {
		plog.Panicf("recovery store error: %v", err)
	}
	plog.Info("finished recovering store v2")

	s.cluster.SetBackend(s.be)
	plog.Info("recovering cluster configuration...")
	s.cluster.Recover(api.UpdateCapability)
	plog.Info("finished recovering cluster configuration")

	plog.Info("removing old peers from network...")
	// recover raft transport
	s.r.transport.RemoveAllPeers()
	plog.Info("finished removing old peers from network")

	plog.Info("adding peers from new cluster configuration into network...")
	for _, m := range s.cluster.Members() {
		if m.ID == s.ID() {
			continue
		}
		s.r.transport.AddPeer(m.ID, m.PeerURLs)
	}
	plog.Info("finished adding peers from new cluster configuration into network")

	ep.appliedt = apply.snapshot.Metadata.Term
	ep.appliedi = apply.snapshot.Metadata.Index
	ep.snapi = ep.appliedi
	ep.confState = apply.snapshot.Metadata.ConfState
}

func (s *EtcdServer) applyEntries(ep *etcdProgress, apply *apply) {
	if len(apply.entries) == 0 {
		return
	}
	firsti := apply.entries[0].Index
	if firsti > ep.appliedi+1 {
		plog.Panicf("first index of committed entry[%d] should <= appliedi[%d] + 1", firsti, ep.appliedi)
	}
	var ents []raftpb.Entry
	if ep.appliedi+1-firsti < uint64(len(apply.entries)) {
		ents = apply.entries[ep.appliedi+1-firsti:]
	}
	if len(ents) == 0 {
		return
	}
	var shouldstop bool
	if ep.appliedt, ep.appliedi, shouldstop = s.apply(ents, &ep.confState); shouldstop {
		go s.stopWithDelay(10*100*time.Millisecond, fmt.Errorf("the member has been permanently removed from the cluster"))
	}
}

func (s *EtcdServer) triggerSnapshot(ep *etcdProgress) {
	if ep.appliedi-ep.snapi <= s.Cfg.SnapCount {
		return
	}

	plog.Infof("start to snapshot (applied: %d, lastsnap: %d)", ep.appliedi, ep.snapi)
	s.snapshot(ep.appliedi, ep.confState)
	ep.snapi = ep.appliedi
}
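
// With the default SnapCount of 100000, triggerSnapshot above starts a new
// snapshot once more than 100000 entries have been applied since the last
// snapshot index; ep.snapi then catches up to ep.appliedi, so the next
// window is counted from the new snapshot.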

func (s *EtcdServer) isMultiNode() bool {
	return s.cluster != nil && len(s.cluster.MemberIDs()) > 1
}

func (s *EtcdServer) isLeader() bool {
	return uint64(s.ID()) == s.Lead()
}

// MoveLeader transfers the leader to the given transferee.
func (s *EtcdServer) MoveLeader(ctx context.Context, lead, transferee uint64) error {
	now := time.Now()
	interval := time.Duration(s.Cfg.TickMs) * time.Millisecond

	plog.Infof("%s starts leadership transfer from %s to %s", s.ID(), types.ID(lead), types.ID(transferee))
	s.r.TransferLeadership(ctx, lead, transferee)
	for s.Lead() != transferee {
		select {
		case <-ctx.Done(): // time out
			return ErrTimeoutLeaderTransfer
		case <-time.After(interval):
		}
	}

	// TODO: drain all requests, or drop all messages to the old leader

	plog.Infof("%s finished leadership transfer from %s to %s (took %v)", s.ID(), types.ID(lead), types.ID(transferee), time.Since(now))
	return nil
}

// TransferLeadership transfers the leader to the chosen transferee.
func (s *EtcdServer) TransferLeadership() error {
	if !s.isLeader() {
		plog.Printf("skipped leadership transfer for stopping non-leader member")
		return nil
	}

	if !s.isMultiNode() {
		plog.Printf("skipped leadership transfer for single member cluster")
		return nil
	}

	transferee, ok := longestConnected(s.r.transport, s.cluster.MemberIDs())
	if !ok {
		return ErrUnhealthy
	}

	tm := s.Cfg.ReqTimeout()
	ctx, cancel := context.WithTimeout(s.ctx, tm)
	err := s.MoveLeader(ctx, s.Lead(), uint64(transferee))
	cancel()
	return err
}

// HardStop stops the server without coordination with other members in the cluster.
func (s *EtcdServer) HardStop() {
	select {
	case s.stop <- struct{}{}:
	case <-s.done:
		return
	}
	<-s.done
}

// Stop stops the server gracefully, and shuts down the running goroutine.
// Stop should be called after a Start(s), otherwise it will block forever.
// When stopping leader, Stop transfers its leadership to one of its peers
// before stopping the server.
// Stop terminates the Server and performs any necessary finalization.
// Do and Process cannot be called after Stop has been invoked.
func (s *EtcdServer) Stop() {
	if err := s.TransferLeadership(); err != nil {
		plog.Warningf("%s failed to transfer leadership (%v)", s.ID(), err)
	}
	s.HardStop()
}

// ReadyNotify returns a channel that will be closed when the server
// is ready to serve client requests
func (s *EtcdServer) ReadyNotify() <-chan struct{} { return s.readych }
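
// An embedder that does not want to block forever on startup can combine
// ReadyNotify with a timeout (sketch; the one-minute bound is arbitrary):
//
//	select {
//	case <-srv.ReadyNotify():
//		// ready to serve client requests
//	case <-time.After(time.Minute):
//		srv.Stop() // give up and trigger a shutdown
//	}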

func (s *EtcdServer) stopWithDelay(d time.Duration, err error) {
	select {
	case <-time.After(d):
	case <-s.done:
	}
	select {
	case s.errorc <- err:
	default:
	}
}

// StopNotify returns a channel that receives an empty struct
// when the server is stopped.
func (s *EtcdServer) StopNotify() <-chan struct{} { return s.done }

func (s *EtcdServer) SelfStats() []byte { return s.stats.JSON() }

func (s *EtcdServer) LeaderStats() []byte {
	lead := atomic.LoadUint64(&s.r.lead)
	if lead != uint64(s.id) {
		return nil
	}
	return s.lstats.JSON()
}

func (s *EtcdServer) StoreStats() []byte { return s.store.JsonStats() }

func (s *EtcdServer) checkMembershipOperationPermission(ctx context.Context) error {
	if s.authStore == nil {
		// In the context of an ordinary etcd process, s.authStore will never be nil.
		// This branch is for handling cases in server_test.go
		return nil
	}

	// Note that this permission check is done in the API layer,
	// so a TOCTOU problem could potentially arise with a schedule like this:
	// update membership with user A -> revoke root role of A -> apply membership
	// change in the state machine layer.
	// However, both membership changes and role management require the root
	// privilege, so careful operation by admins can prevent the problem.
	authInfo, err := s.AuthInfoFromCtx(ctx)
	if err != nil {
		return err
	}

	return s.AuthStore().IsAdminPermitted(authInfo)
}

func (s *EtcdServer) AddMember(ctx context.Context, memb membership.Member) ([]*membership.Member, error) {
	if err := s.checkMembershipOperationPermission(ctx); err != nil {
		return nil, err
	}

	if s.Cfg.StrictReconfigCheck {
		// by default StrictReconfigCheck is enabled; reject new members if unhealthy
		if !s.cluster.IsReadyToAddNewMember() {
			plog.Warningf("not enough started members, rejecting member add %+v", memb)
			return nil, ErrNotEnoughStartedMembers
		}
		if !isConnectedFullySince(s.r.transport, time.Now().Add(-HealthInterval), s.ID(), s.cluster.Members()) {
			plog.Warningf("not healthy for reconfigure, rejecting member add %+v", memb)
			return nil, ErrUnhealthy
		}
	}

	// TODO: move Member to protobuf type
	b, err := json.Marshal(memb)
	if err != nil {
		return nil, err
	}
	cc := raftpb.ConfChange{
		Type:    raftpb.ConfChangeAddNode,
		NodeID:  uint64(memb.ID),
		Context: b,
	}
	return s.configure(ctx, cc)
}

func (s *EtcdServer) RemoveMember(ctx context.Context, id uint64) ([]*membership.Member, error) {
	if err := s.checkMembershipOperationPermission(ctx); err != nil {
		return nil, err
	}

	// by default StrictReconfigCheck is enabled; reject removal if it leads to quorum loss
	if err := s.mayRemoveMember(types.ID(id)); err != nil {
		return nil, err
	}

	cc := raftpb.ConfChange{
		Type:   raftpb.ConfChangeRemoveNode,
		NodeID: id,
	}
	return s.configure(ctx, cc)
}
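
// Worked example for mayRemoveMember below: in a 5-member cluster
// (quorum threshold 1+((5-1)/2) = 3) with only 3 members active, removing
// an active member leaves active-1 = 2 < 3, so the removal is rejected
// with ErrUnhealthy; with all 5 active, 4 >= 3 and the removal proceeds.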

func (s *EtcdServer) mayRemoveMember(id types.ID) error {
	if !s.Cfg.StrictReconfigCheck {
		return nil
	}

	if !s.cluster.IsReadyToRemoveMember(uint64(id)) {
		plog.Warningf("not enough started members, rejecting remove member %s", id)
		return ErrNotEnoughStartedMembers
	}

	// a downed member is safe to remove since it's not part of the active quorum
	if t := s.r.transport.ActiveSince(id); id != s.ID() && t.IsZero() {
		return nil
	}

	// protect quorum if some members are down
	m := s.cluster.Members()
	active := numConnectedSince(s.r.transport, time.Now().Add(-HealthInterval), s.ID(), m)
	if (active - 1) < 1+((len(m)-1)/2) {
		plog.Warningf("reconfigure breaks active quorum, rejecting remove member %s", id)
		return ErrUnhealthy
	}

	return nil
}

func (s *EtcdServer) UpdateMember(ctx context.Context, memb membership.Member) ([]*membership.Member, error) {
	b, merr := json.Marshal(memb)
	if merr != nil {
		return nil, merr
	}

	if err := s.checkMembershipOperationPermission(ctx); err != nil {
		return nil, err
	}
	cc := raftpb.ConfChange{
		Type:    raftpb.ConfChangeUpdateNode,
		NodeID:  uint64(memb.ID),
		Context: b,
	}
	return s.configure(ctx, cc)
}

// Implement the RaftTimer interface

func (s *EtcdServer) Index() uint64 { return atomic.LoadUint64(&s.r.index) }

func (s *EtcdServer) Term() uint64 { return atomic.LoadUint64(&s.r.term) }

// Lead is only for testing purposes.
// TODO: add Raft server interface to expose raft related info:
// Index, Term, Lead, Committed, Applied, LastIndex, etc.
func (s *EtcdServer) Lead() uint64 { return atomic.LoadUint64(&s.r.lead) }

func (s *EtcdServer) Leader() types.ID { return types.ID(s.Lead()) }

type confChangeResponse struct {
	membs []*membership.Member
	err   error
}

// configure sends a configuration change through consensus and
// then waits for it to be applied to the server. It
// will block until the change is performed or there is an error.
func (s *EtcdServer) configure(ctx context.Context, cc raftpb.ConfChange) ([]*membership.Member, error) {
	cc.ID = s.reqIDGen.Next()
	ch := s.w.Register(cc.ID)
	start := time.Now()
	if err := s.r.ProposeConfChange(ctx, cc); err != nil {
		s.w.Trigger(cc.ID, nil)
		return nil, err
	}
	select {
	case x := <-ch:
		if x == nil {
			plog.Panicf("configure trigger value should never be nil")
		}
		resp := x.(*confChangeResponse)
		return resp.membs, resp.err
	case <-ctx.Done():
		s.w.Trigger(cc.ID, nil) // GC wait
		return nil, s.parseProposeCtxErr(ctx.Err(), start)
	case <-s.stopping:
		return nil, ErrStopped
	}
}
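
// configure above relies on the pkg/wait registry: the proposer registers
// the request ID before proposing, and the apply path later calls Trigger
// with that ID and the outcome, waking the waiter. The pattern in isolation
// (propose is a hypothetical stand-in):
//
//	w := wait.New()
//	ch := w.Register(id) // register before proposing to avoid a lost wakeup
//	propose(id)          // replicated through raft, applied on every member
//	resp := <-ch         // the apply loop called w.Trigger(id, resp) locally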

// sync proposes a SYNC request and is non-blocking.
// This makes no guarantee that the request will be proposed or performed.
// The request will be canceled after the given timeout.
func (s *EtcdServer) sync(timeout time.Duration) {
	req := pb.Request{
		Method: "SYNC",
		ID:     s.reqIDGen.Next(),
		Time:   time.Now().UnixNano(),
	}
	data := pbutil.MustMarshal(&req)
	// There is no promise that the node has a leader when doing a SYNC
	// request, so it uses a goroutine to propose.
	ctx, cancel := context.WithTimeout(s.ctx, timeout)
	s.goAttach(func() {
		s.r.Propose(ctx, data)
		cancel()
	})
}

// publish registers server information into the cluster. The information
// is the JSON representation of this server's member struct, updated with the
// static clientURLs of the server.
// The function keeps attempting to register until it succeeds,
// or its server is stopped.
func (s *EtcdServer) publish(timeout time.Duration) {
	b, err := json.Marshal(s.attributes)
	if err != nil {
		plog.Panicf("json marshal error: %v", err)
		return
	}
	req := pb.Request{
		Method: "PUT",
		Path:   membership.MemberAttributesStorePath(s.id),
		Val:    string(b),
	}

	for {
		ctx, cancel := context.WithTimeout(s.ctx, timeout)
		_, err := s.Do(ctx, req)
		cancel()
		switch err {
		case nil:
			close(s.readych)
			plog.Infof("published %+v to cluster %s", s.attributes, s.cluster.ID())
			return
		case ErrStopped:
			plog.Infof("aborting publish because server is stopped")
			return
		default:
			plog.Errorf("publish error: %v", err)
		}
	}
}

func (s *EtcdServer) sendMergedSnap(merged snap.Message) {
	atomic.AddInt64(&s.inflightSnapshots, 1)

	s.r.transport.SendSnapshot(merged)
	s.goAttach(func() {
		select {
		case ok := <-merged.CloseNotify():
			// delay releasing inflight snapshot for another 30 seconds to
			// block log compaction.
			// If the follower still fails to catch up, it is probably just too slow
			// to catch up. We cannot avoid the snapshot cycle anyway.
			if ok {
				select {
				case <-time.After(releaseDelayAfterSnapshot):
				case <-s.stopping:
				}
			}
			atomic.AddInt64(&s.inflightSnapshots, -1)
		case <-s.stopping:
			return
		}
	})
}

// apply takes entries received from Raft (after they have been committed)
// and applies them to the current state of the EtcdServer.
// The given entries should not be empty.
func (s *EtcdServer) apply(es []raftpb.Entry, confState *raftpb.ConfState) (appliedt uint64, appliedi uint64, shouldStop bool) {
	for i := range es {
		e := es[i]
		switch e.Type {
		case raftpb.EntryNormal:
			s.applyEntryNormal(&e)
		case raftpb.EntryConfChange:
			// set the consistent index of the currently executing entry
			if e.Index > s.consistIndex.ConsistentIndex() {
				s.consistIndex.setConsistentIndex(e.Index)
			}
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, e.Data)
			removedSelf, err := s.applyConfChange(cc, confState)
			s.setAppliedIndex(e.Index)
			shouldStop = shouldStop || removedSelf
			s.w.Trigger(cc.ID, &confChangeResponse{s.cluster.Members(), err})
		default:
			plog.Panicf("entry type should be either EntryNormal or EntryConfChange")
		}
		atomic.StoreUint64(&s.r.index, e.Index)
		atomic.StoreUint64(&s.r.term, e.Term)
		appliedt = e.Term
		appliedi = e.Index
	}
	return appliedt, appliedi, shouldStop
}
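
// applyEntryNormal below guards the v3 backend against replays: after a
// restart, WAL entries are re-applied starting from the last snapshot, and
// any entry whose index is at or below the backend's consistent index has
// already had its v3 effects persisted by a previous run, so it must be
// skipped (shouldApplyV3 stays false) rather than applied twice.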

// applyEntryNormal applies an EntryNormal type raftpb request to the EtcdServer
func (s *EtcdServer) applyEntryNormal(e *raftpb.Entry) {
	shouldApplyV3 := false
	if e.Index > s.consistIndex.ConsistentIndex() {
		// set the consistent index of the currently executing entry
		s.consistIndex.setConsistentIndex(e.Index)
		shouldApplyV3 = true
	}
	defer s.setAppliedIndex(e.Index)

	// the raft state machine may generate a noop entry when confirming
	// leadership. Skip it in advance to avoid some potential bug in the future.
	if len(e.Data) == 0 {
		select {
		case s.forceVersionC <- struct{}{}:
		default:
		}
		// promote lessor when the local member is leader and finished
		// applying all entries from the last term.
		if s.isLeader() {
			s.lessor.Promote(s.Cfg.electionTimeout())
		}
		return
	}

	var raftReq pb.InternalRaftRequest
	if !pbutil.MaybeUnmarshal(&raftReq, e.Data) { // backward compatible
		var r pb.Request
		rp := &r
		pbutil.MustUnmarshal(rp, e.Data)
		s.w.Trigger(r.ID, s.applyV2Request((*RequestV2)(rp)))
		return
	}
	if raftReq.V2 != nil {
		req := (*RequestV2)(raftReq.V2)
		s.w.Trigger(req.ID, s.applyV2Request(req))
		return
	}

	// do not re-apply applied entries.
	if !shouldApplyV3 {
		return
	}

	id := raftReq.ID
	if id == 0 {
		id = raftReq.Header.ID
	}

	var ar *applyResult
	needResult := s.w.IsRegistered(id)
	if needResult || !noSideEffect(&raftReq) {
		if !needResult && raftReq.Txn != nil {
			removeNeedlessRangeReqs(raftReq.Txn)
		}
		ar = s.applyV3.Apply(&raftReq)
	}

	if ar == nil {
		return
	}

	if ar.err != ErrNoSpace || len(s.alarmStore.Get(pb.AlarmType_NOSPACE)) > 0 {
		s.w.Trigger(id, ar)
		return
	}

	plog.Errorf("applying raft message exceeded backend quota")
	s.goAttach(func() {
		a := &pb.AlarmRequest{
			MemberID: uint64(s.ID()),
			Action:   pb.AlarmRequest_ACTIVATE,
			Alarm:    pb.AlarmType_NOSPACE,
		}
		s.raftRequest(s.ctx, pb.InternalRaftRequest{Alarm: a})
		s.w.Trigger(id, ar)
	})
}

// applyConfChange applies a ConfChange to the server. It is only
// invoked with a ConfChange that has already passed through Raft
func (s *EtcdServer) applyConfChange(cc raftpb.ConfChange, confState *raftpb.ConfState) (bool, error) {
	if err := s.cluster.ValidateConfigurationChange(cc); err != nil {
		cc.NodeID = raft.None
		s.r.ApplyConfChange(cc)
		return false, err
	}
	*confState = *s.r.ApplyConfChange(cc)
	switch cc.Type {
	case raftpb.ConfChangeAddNode:
		m := new(membership.Member)
		if err := json.Unmarshal(cc.Context, m); err != nil {
			plog.Panicf("unmarshal member should never fail: %v", err)
		}
		if cc.NodeID != uint64(m.ID) {
			plog.Panicf("nodeID should always be equal to member ID")
		}
		s.cluster.AddMember(m)
		if m.ID != s.id {
			s.r.transport.AddPeer(m.ID, m.PeerURLs)
		}
	case raftpb.ConfChangeRemoveNode:
		id := types.ID(cc.NodeID)
		s.cluster.RemoveMember(id)
		if id == s.id {
			return true, nil
		}
		s.r.transport.RemovePeer(id)
	case raftpb.ConfChangeUpdateNode:
		m := new(membership.Member)
		if err := json.Unmarshal(cc.Context, m); err != nil {
			plog.Panicf("unmarshal member should never fail: %v", err)
		}
		if cc.NodeID != uint64(m.ID) {
			plog.Panicf("nodeID should always be equal to member ID")
		}
		s.cluster.UpdateRaftAttributes(m.ID, m.RaftAttributes)
		if m.ID != s.id {
			s.r.transport.UpdatePeer(m.ID, m.PeerURLs)
		}
	}
	return false, nil
}

// TODO: non-blocking snapshot
func (s *EtcdServer) snapshot(snapi uint64, confState raftpb.ConfState) {
	clone := s.store.Clone()
	// commit kv to write metadata (for example: consistent index) to disk.
	// KV().commit() updates the consistent index in backend.
	// All operations that update consistent index must be called sequentially
	// from the applyAll function.
	// So KV().Commit() cannot run in parallel with apply. It has to be called
	// outside the goroutine created below.
	s.KV().Commit()

	s.goAttach(func() {
		d, err := clone.SaveNoCopy()
		// TODO: current store will never fail to do a snapshot
		// what should we do if the store might fail?
		if err != nil {
			plog.Panicf("store save should never fail: %v", err)
		}
		snap, err := s.r.raftStorage.CreateSnapshot(snapi, &confState, d)
		if err != nil {
			// the snapshot was done asynchronously with the progress of raft.
			// raft might have already got a newer snapshot.
			if err == raft.ErrSnapOutOfDate {
				return
			}
			plog.Panicf("unexpected create snapshot error %v", err)
		}
		// SaveSnap saves the snapshot and releases the locked wal files
		// to the snapshot index.
		if err = s.r.storage.SaveSnap(snap); err != nil {
			plog.Fatalf("save snapshot error: %v", err)
		}
		plog.Infof("saved snapshot at index %d", snap.Metadata.Index)

		if err = s.r.storage.Release(snap); err != nil {
			plog.Panicf("failed to release wal %v", err)
		}

		// When sending a snapshot, etcd will pause compaction.
		// After receiving a snapshot, the slow follower needs to get all the
		// entries right after the snapshot's index to catch up. If we do not
		// pause compaction, the log entries right after the sent snapshot might
		// already be compacted. This happens when the snapshot takes a long
		// time to send and save. Pausing compaction avoids triggering a
		// snapshot-sending cycle.
		if atomic.LoadInt64(&s.inflightSnapshots) != 0 {
			plog.Infof("skip compaction since there is an inflight snapshot")
			return
		}

		// keep some in-memory log entries for slow followers.
		compacti := uint64(1)
		if snapi > numberOfCatchUpEntries {
			compacti = snapi - numberOfCatchUpEntries
		}
		err = s.r.raftStorage.Compact(compacti)
		if err != nil {
			// the compaction was done asynchronously with the progress of raft.
			// the raft log might already have been compacted.
			if err == raft.ErrCompacted {
				return
			}
			plog.Panicf("unexpected compaction error %v", err)
		}
		plog.Infof("compacted raft log at %d", compacti)
	})
}
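
// Worked example for the compaction above: with numberOfCatchUpEntries set
// to 5000 (see raft.go in this package), a snapshot at index 100000 compacts
// the in-memory raft log up to index 95000, keeping the most recent 5000
// entries for followers that are only slightly behind; anyone further behind
// must be served a full snapshot instead.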

// CutPeer drops messages to the specified peer.
func (s *EtcdServer) CutPeer(id types.ID) {
	tr, ok := s.r.transport.(*rafthttp.Transport)
	if ok {
		tr.CutPeer(id)
	}
}

// MendPeer recovers the message dropping behavior of the given peer.
func (s *EtcdServer) MendPeer(id types.ID) {
	tr, ok := s.r.transport.(*rafthttp.Transport)
	if ok {
		tr.MendPeer(id)
	}
}

func (s *EtcdServer) PauseSending() { s.r.pauseSending() }

func (s *EtcdServer) ResumeSending() { s.r.resumeSending() }

func (s *EtcdServer) ClusterVersion() *semver.Version {
	if s.cluster == nil {
		return nil
	}
	return s.cluster.Version()
}

// monitorVersions checks the member's version every monitorVersionInterval.
// It updates the cluster version if all members agree on a higher one.
// It logs a message if there is a member with a higher version than the
// local version.
func (s *EtcdServer) monitorVersions() {
	for {
		select {
		case <-s.forceVersionC:
		case <-time.After(monitorVersionInterval):
		case <-s.stopping:
			return
		}

		if s.Leader() != s.ID() {
			continue
		}

		v := decideClusterVersion(getVersions(s.cluster, s.id, s.peerRt))
		if v != nil {
			// only keep major.minor version for comparison
			v = &semver.Version{
				Major: v.Major,
				Minor: v.Minor,
			}
		}

		// if the current version is nil:
		// 1. use the decided version if possible
		// 2. or use the min cluster version
		if s.cluster.Version() == nil {
			verStr := version.MinClusterVersion
			if v != nil {
				verStr = v.String()
			}
			s.goAttach(func() { s.updateClusterVersion(verStr) })
			continue
		}

		// update cluster version only if the decided version is greater than
		// the current cluster version
		if v != nil && s.cluster.Version().LessThan(*v) {
			s.goAttach(func() { s.updateClusterVersion(v.String()) })
		}
	}
}

func (s *EtcdServer) updateClusterVersion(ver string) {
	if s.cluster.Version() == nil {
		plog.Infof("setting up the initial cluster version to %s", version.Cluster(ver))
	} else {
		plog.Infof("updating the cluster version from %s to %s", version.Cluster(s.cluster.Version().String()), version.Cluster(ver))
	}
	req := pb.Request{
		Method: "PUT",
		Path:   membership.StoreClusterVersionKey(),
		Val:    ver,
	}
	ctx, cancel := context.WithTimeout(s.ctx, s.Cfg.ReqTimeout())
	_, err := s.Do(ctx, req)
	cancel()
	switch err {
	case nil:
		return
	case ErrStopped:
		plog.Infof("aborting update cluster version because server is stopped")
		return
	default:
		plog.Errorf("error updating cluster version (%v)", err)
	}
}

func (s *EtcdServer) parseProposeCtxErr(err error, start time.Time) error {
	switch err {
	case context.Canceled:
		return ErrCanceled
	case context.DeadlineExceeded:
		s.leadTimeMu.RLock()
		curLeadElected := s.leadElectedTime
		s.leadTimeMu.RUnlock()
		prevLeadLost := curLeadElected.Add(-2 * time.Duration(s.Cfg.ElectionTicks) * time.Duration(s.Cfg.TickMs) * time.Millisecond)
		if start.After(prevLeadLost) && start.Before(curLeadElected) {
			return ErrTimeoutDueToLeaderFail
		}

		lead := types.ID(atomic.LoadUint64(&s.r.lead))
		switch lead {
		case types.ID(raft.None):
			// TODO: return error to specify it happens because the cluster does not have a leader now
		case s.ID():
			if !isConnectedToQuorumSince(s.r.transport, start, s.ID(), s.cluster.Members()) {
				return ErrTimeoutDueToConnectionLost
			}
		default:
			if !isConnectedSince(s.r.transport, start, lead) {
				return ErrTimeoutDueToConnectionLost
			}
		}

		return ErrTimeout
	default:
		return err
	}
}

func (s *EtcdServer) KV() mvcc.ConsistentWatchableKV { return s.kv }

func (s *EtcdServer) Backend() backend.Backend {
	s.bemu.Lock()
	defer s.bemu.Unlock()
	return s.be
}

func (s *EtcdServer) AuthStore() auth.AuthStore { return s.authStore }

func (s *EtcdServer) restoreAlarms() error {
	s.applyV3 = s.newApplierV3()
	as, err := alarm.NewAlarmStore(s)
	if err != nil {
		return err
	}
	s.alarmStore = as
	if len(as.Get(pb.AlarmType_NOSPACE)) > 0 {
		s.applyV3 = newApplierV3Capped(s.applyV3)
	}
	if len(as.Get(pb.AlarmType_CORRUPT)) > 0 {
		s.applyV3 = newApplierV3Corrupt(s.applyV3)
	}
	return nil
}

func (s *EtcdServer) getAppliedIndex() uint64 {
	return atomic.LoadUint64(&s.appliedIndex)
}

func (s *EtcdServer) setAppliedIndex(v uint64) {
	atomic.StoreUint64(&s.appliedIndex, v)
}

func (s *EtcdServer) getCommittedIndex() uint64 {
	return atomic.LoadUint64(&s.committedIndex)
}

func (s *EtcdServer) setCommittedIndex(v uint64) {
	atomic.StoreUint64(&s.committedIndex, v)
}
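
// goAttach below is the only sanctioned way to spawn server-scoped
// goroutines: it refuses new work once stopping is closed, and wgMu makes
// the stopping check and wg.Add atomic with respect to shutdown, so run()
// can wg.Wait without racing Add against Wait. Typical use, as in Start:
//
//	s.goAttach(func() { monitorFileDescriptor(s.stopping) })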

// goAttach creates a goroutine on a given function and tracks it using
// the etcdserver waitgroup.
func (s *EtcdServer) goAttach(f func()) {
	s.wgMu.RLock() // this blocks with ongoing close(s.stopping)
	defer s.wgMu.RUnlock()
	select {
	case <-s.stopping:
		plog.Warning("server has stopped (skipping goAttach)")
		return
	default:
	}

	// now safe to add since waitgroup wait has not started yet
	s.wg.Add(1)
	go func() {
		defer s.wg.Done()
		f()
	}()
}

func (s *EtcdServer) Alarms() []*pb.AlarmMember {
	return s.alarmStore.Get(pb.AlarmType_NONE)
}