// go.etcd.io/etcd@v3.3.27+incompatible/functional/rpcpb/rpc.proto

syntax = "proto3";
package rpcpb;

import "github.com/gogo/protobuf/gogoproto/gogo.proto";

option (gogoproto.marshaler_all) = true;
option (gogoproto.sizer_all) = true;
option (gogoproto.unmarshaler_all) = true;
option (gogoproto.goproto_getters_all) = false;

message Request {
  Operation Operation = 1;
  // Member contains the same Member object from tester configuration.
  Member Member = 2;
  // Tester contains tester configuration.
  Tester Tester = 3;
}

// SnapshotInfo contains SAVE_SNAPSHOT request results.
message SnapshotInfo {
  string MemberName = 1;
  repeated string MemberClientURLs = 2;
  string SnapshotPath = 3;
  string SnapshotFileSize = 4;
  string SnapshotTotalSize = 5;
  int64 SnapshotTotalKey = 6;
  int64 SnapshotHash = 7;
  int64 SnapshotRevision = 8;
  string Took = 9;
}

message Response {
  bool Success = 1;
  string Status = 2;

  // Member contains the same Member object from tester request.
  Member Member = 3;

  // SnapshotInfo contains SAVE_SNAPSHOT request results.
  SnapshotInfo SnapshotInfo = 4;
}

service Transport {
  rpc Transport(stream Request) returns (stream Response) {}
}

message Member {
  // EtcdExecPath is the executable etcd binary path in agent server.
  string EtcdExecPath = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec-path\""];
  // TODO: support embedded etcd

  // AgentAddr is the agent HTTP server address.
  string AgentAddr = 11 [(gogoproto.moretags) = "yaml:\"agent-addr\""];
  // FailpointHTTPAddr is the agent's failpoints HTTP server address.
  string FailpointHTTPAddr = 12 [(gogoproto.moretags) = "yaml:\"failpoint-http-addr\""];

  // BaseDir is the base directory where all logs and etcd data are stored.
  string BaseDir = 101 [(gogoproto.moretags) = "yaml:\"base-dir\""];
  // EtcdLogPath is the log file to store current etcd server logs.
  string EtcdLogPath = 102 [(gogoproto.moretags) = "yaml:\"etcd-log-path\""];

  // EtcdClientProxy is true when client traffic needs to be proxied.
  // If true, the listen client URL port must be different from the
  // advertise client URL port.
  bool EtcdClientProxy = 201 [(gogoproto.moretags) = "yaml:\"etcd-client-proxy\""];
  // EtcdPeerProxy is true when peer traffic needs to be proxied.
  // If true, the listen peer URL port must be different from the
  // advertise peer URL port.
  bool EtcdPeerProxy = 202 [(gogoproto.moretags) = "yaml:\"etcd-peer-proxy\""];

  // EtcdClientEndpoint is the etcd client endpoint.
  string EtcdClientEndpoint = 301 [(gogoproto.moretags) = "yaml:\"etcd-client-endpoint\""];
  // Etcd defines etcd binary configuration flags.
  Etcd Etcd = 302 [(gogoproto.moretags) = "yaml:\"etcd\""];
  // EtcdOnSnapshotRestore defines one-time use configuration during the
  // etcd snapshot recovery process.
  Etcd EtcdOnSnapshotRestore = 303;

  // ClientCertData contains cert file contents from this member's etcd server.
  string ClientCertData = 401 [(gogoproto.moretags) = "yaml:\"client-cert-data\""];
  string ClientCertPath = 402 [(gogoproto.moretags) = "yaml:\"client-cert-path\""];
  // ClientKeyData contains key file contents from this member's etcd server.
  string ClientKeyData = 403 [(gogoproto.moretags) = "yaml:\"client-key-data\""];
  string ClientKeyPath = 404 [(gogoproto.moretags) = "yaml:\"client-key-path\""];
  // ClientTrustedCAData contains trusted CA file contents from this member's etcd server.
  string ClientTrustedCAData = 405 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-data\""];
  string ClientTrustedCAPath = 406 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-path\""];

  // PeerCertData contains cert file contents from this member's etcd server.
  string PeerCertData = 501 [(gogoproto.moretags) = "yaml:\"peer-cert-data\""];
  string PeerCertPath = 502 [(gogoproto.moretags) = "yaml:\"peer-cert-path\""];
  // PeerKeyData contains key file contents from this member's etcd server.
  string PeerKeyData = 503 [(gogoproto.moretags) = "yaml:\"peer-key-data\""];
  string PeerKeyPath = 504 [(gogoproto.moretags) = "yaml:\"peer-key-path\""];
  // PeerTrustedCAData contains trusted CA file contents from this member's etcd server.
  string PeerTrustedCAData = 505 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-data\""];
  string PeerTrustedCAPath = 506 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-path\""];

  // SnapshotPath is the snapshot file path to store or restore from.
  string SnapshotPath = 601 [(gogoproto.moretags) = "yaml:\"snapshot-path\""];
  // SnapshotInfo contains the last SAVE_SNAPSHOT request results.
  SnapshotInfo SnapshotInfo = 602;
}

message Tester {
  string DataDir = 1 [(gogoproto.moretags) = "yaml:\"data-dir\""];
  string Network = 2 [(gogoproto.moretags) = "yaml:\"network\""];
  string Addr = 3 [(gogoproto.moretags) = "yaml:\"addr\""];

  // DelayLatencyMs is the delay latency in milliseconds
  // to inject into the simulated slow network.
  uint32 DelayLatencyMs = 11 [(gogoproto.moretags) = "yaml:\"delay-latency-ms\""];
  // DelayLatencyMsRv is the delay latency random variable in milliseconds.
  uint32 DelayLatencyMsRv = 12 [(gogoproto.moretags) = "yaml:\"delay-latency-ms-rv\""];
  // UpdatedDelayLatencyMs is the updated delay latency in milliseconds
  // to inject into the simulated slow network. It is the final latency
  // to apply, in case the latency numbers are randomly generated from
  // the given delay latency field.
  uint32 UpdatedDelayLatencyMs = 13 [(gogoproto.moretags) = "yaml:\"updated-delay-latency-ms\""];

  // RoundLimit is the limit of rounds to run the failure set (-1 to run without limits).
  int32 RoundLimit = 21 [(gogoproto.moretags) = "yaml:\"round-limit\""];
  // If ExitOnCaseFail is true, the tester exits on the first case failure.
  bool ExitOnCaseFail = 22 [(gogoproto.moretags) = "yaml:\"exit-on-failure\""];
  // EnablePprof is true to enable the profiler.
  bool EnablePprof = 23 [(gogoproto.moretags) = "yaml:\"enable-pprof\""];

  // CaseDelayMs is the delay duration after a failure is injected.
  // Useful when triggering snapshot or no-op failure cases.
  uint32 CaseDelayMs = 31 [(gogoproto.moretags) = "yaml:\"case-delay-ms\""];
  // CaseShuffle is true to randomize the failure injection order.
  bool CaseShuffle = 32 [(gogoproto.moretags) = "yaml:\"case-shuffle\""];
  // Cases is the selected test cases to schedule.
  // If empty, run all failure cases.
  repeated string Cases = 33 [(gogoproto.moretags) = "yaml:\"cases\""];
  // FailpointCommands is the list of "gofail" commands
  // (e.g. panic("etcd-tester"), 1*sleep(1000)).
  repeated string FailpointCommands = 34 [(gogoproto.moretags) = "yaml:\"failpoint-commands\""];

  // RunnerExecPath is the path of the etcd-runner binary.
  string RunnerExecPath = 41 [(gogoproto.moretags) = "yaml:\"runner-exec-path\""];
  // ExternalExecPath is the path of a script for enabling/disabling an external fault injector.
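  // This script is invoked by the EXTERNAL failure case defined below.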
  string ExternalExecPath = 42 [(gogoproto.moretags) = "yaml:\"external-exec-path\""];

  // Stressers is the list of stresser types:
  // KV, LEASE, ELECTION_RUNNER, WATCH_RUNNER, LOCK_RACER_RUNNER, LEASE_RUNNER.
  repeated string Stressers = 101 [(gogoproto.moretags) = "yaml:\"stressers\""];
  // Checkers is the list of consistency checker types:
  // KV_HASH, LEASE_EXPIRE, NO_CHECK, RUNNER.
  // Leave empty to skip consistency checks.
  repeated string Checkers = 102 [(gogoproto.moretags) = "yaml:\"checkers\""];

  // StressKeySize is the size of each small key written into etcd.
  int32 StressKeySize = 201 [(gogoproto.moretags) = "yaml:\"stress-key-size\""];
  // StressKeySizeLarge is the size of each large key written into etcd.
  int32 StressKeySizeLarge = 202 [(gogoproto.moretags) = "yaml:\"stress-key-size-large\""];
  // StressKeySuffixRange is the count of the key range written into etcd.
  // Stress keys are created with fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)).
  int32 StressKeySuffixRange = 203 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range\""];
  // StressKeySuffixRangeTxn is the count of the key range written into etcd txn (max 100).
  // Stress keys are created with fmt.Sprintf("/k%03d", i).
  int32 StressKeySuffixRangeTxn = 204 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range-txn\""];
  // StressKeyTxnOps is the number of operations per transaction (max 64).
  int32 StressKeyTxnOps = 205 [(gogoproto.moretags) = "yaml:\"stress-key-txn-ops\""];

  // StressClients is the number of concurrent stressing clients
  // with one shared TCP connection.
  int32 StressClients = 301 [(gogoproto.moretags) = "yaml:\"stress-clients\""];
  // StressQPS is the maximum number of stresser requests per second.
  int32 StressQPS = 302 [(gogoproto.moretags) = "yaml:\"stress-qps\""];
}

message Etcd {
  string Name = 1 [(gogoproto.moretags) = "yaml:\"name\""];
  string DataDir = 2 [(gogoproto.moretags) = "yaml:\"data-dir\""];
  string WALDir = 3 [(gogoproto.moretags) = "yaml:\"wal-dir\""];

  // HeartbeatIntervalMs is the time (in milliseconds) of a heartbeat interval.
  // Default value is 100, which is 100ms.
  int64 HeartbeatIntervalMs = 11 [(gogoproto.moretags) = "yaml:\"heartbeat-interval\""];
  // ElectionTimeoutMs is the time (in milliseconds) for an election to timeout.
  // Default value is 1000, which is 1s.
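  // Per etcd's tuning guidance, the election timeout should be roughly
  // 5-10x the heartbeat interval; the defaults above (100ms heartbeat,
  // 1000ms election timeout) keep that ratio.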
  int64 ElectionTimeoutMs = 12 [(gogoproto.moretags) = "yaml:\"election-timeout\""];

  repeated string ListenClientURLs = 21 [(gogoproto.moretags) = "yaml:\"listen-client-urls\""];
  repeated string AdvertiseClientURLs = 22 [(gogoproto.moretags) = "yaml:\"advertise-client-urls\""];
  bool ClientAutoTLS = 23 [(gogoproto.moretags) = "yaml:\"auto-tls\""];
  bool ClientCertAuth = 24 [(gogoproto.moretags) = "yaml:\"client-cert-auth\""];
  string ClientCertFile = 25 [(gogoproto.moretags) = "yaml:\"cert-file\""];
  string ClientKeyFile = 26 [(gogoproto.moretags) = "yaml:\"key-file\""];
  string ClientTrustedCAFile = 27 [(gogoproto.moretags) = "yaml:\"trusted-ca-file\""];

  repeated string ListenPeerURLs = 31 [(gogoproto.moretags) = "yaml:\"listen-peer-urls\""];
  repeated string AdvertisePeerURLs = 32 [(gogoproto.moretags) = "yaml:\"initial-advertise-peer-urls\""];
  bool PeerAutoTLS = 33 [(gogoproto.moretags) = "yaml:\"peer-auto-tls\""];
  bool PeerClientCertAuth = 34 [(gogoproto.moretags) = "yaml:\"peer-client-cert-auth\""];
  string PeerCertFile = 35 [(gogoproto.moretags) = "yaml:\"peer-cert-file\""];
  string PeerKeyFile = 36 [(gogoproto.moretags) = "yaml:\"peer-key-file\""];
  string PeerTrustedCAFile = 37 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-file\""];

  string InitialCluster = 41 [(gogoproto.moretags) = "yaml:\"initial-cluster\""];
  string InitialClusterState = 42 [(gogoproto.moretags) = "yaml:\"initial-cluster-state\""];
  string InitialClusterToken = 43 [(gogoproto.moretags) = "yaml:\"initial-cluster-token\""];

  int64 SnapshotCount = 51 [(gogoproto.moretags) = "yaml:\"snapshot-count\""];
  int64 QuotaBackendBytes = 52 [(gogoproto.moretags) = "yaml:\"quota-backend-bytes\""];

  bool PreVote = 63 [(gogoproto.moretags) = "yaml:\"pre-vote\""];
  bool InitialCorruptCheck = 64 [(gogoproto.moretags) = "yaml:\"initial-corrupt-check\""];
}

enum Operation {
  // NOT_STARTED is the agent status before etcd first starts.
  NOT_STARTED = 0;

  // INITIAL_START_ETCD is sent to start etcd for the very first time.
  INITIAL_START_ETCD = 10;
  // RESTART_ETCD is sent to restart a killed etcd process.
  RESTART_ETCD = 11;

  // SIGTERM_ETCD pauses the etcd process while keeping its data
  // directories and previous etcd configuration.
  SIGTERM_ETCD = 20;
  // SIGQUIT_ETCD_AND_REMOVE_DATA kills the etcd process and removes all
  // data directories to simulate destroying the whole machine.
  SIGQUIT_ETCD_AND_REMOVE_DATA = 21;

  // SAVE_SNAPSHOT is sent to trigger the local member to download its
  // snapshot onto its local disk at the path specified by the tester.
  SAVE_SNAPSHOT = 30;
  // RESTORE_RESTART_FROM_SNAPSHOT is sent to trigger the local member to
  // restore a cluster from an existing snapshot on disk, and restart
  // an etcd instance from the recovered data.
  RESTORE_RESTART_FROM_SNAPSHOT = 31;
  // RESTART_FROM_SNAPSHOT is sent to trigger the local member to restart
  // and join an existing cluster that has been recovered from a snapshot.
  // The local member joins this cluster with fresh data.
  RESTART_FROM_SNAPSHOT = 32;

  // SIGQUIT_ETCD_AND_ARCHIVE_DATA is sent when a consistency check has
  // failed, in order to archive the etcd data directories.
  SIGQUIT_ETCD_AND_ARCHIVE_DATA = 40;
  // SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT destroys the etcd
  // process, etcd data, and the agent server.
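  // Since the agent server itself is stopped, this is typically the last
  // operation delivered to a member.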
  SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT = 41;

  // BLACKHOLE_PEER_PORT_TX_RX drops all outgoing/incoming packets from/to
  // the target member's peer port.
  BLACKHOLE_PEER_PORT_TX_RX = 100;
  // UNBLACKHOLE_PEER_PORT_TX_RX removes outgoing/incoming packet dropping.
  UNBLACKHOLE_PEER_PORT_TX_RX = 101;

  // DELAY_PEER_PORT_TX_RX delays all outgoing/incoming packets from/to
  // the target member's peer port.
  DELAY_PEER_PORT_TX_RX = 200;
  // UNDELAY_PEER_PORT_TX_RX removes all outgoing/incoming delays.
  UNDELAY_PEER_PORT_TX_RX = 201;
}

// Case defines various system faults or test cases in distributed systems,
// in order to verify correct behavior of etcd servers and clients.
enum Case {
  // SIGTERM_ONE_FOLLOWER stops a randomly chosen follower (non-leader)
  // but does not delete its data directories on disk, for the next
  // restart. It waits "delay-ms" before recovering this failure.
  // The expected behavior is that the follower comes back online
  // and rejoins the cluster, and then each member continues to process
  // client requests (e.g. a 'Put' request that requires Raft consensus).
  SIGTERM_ONE_FOLLOWER = 0;

  // SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly chosen
  // follower but does not delete its data directories on disk, for the
  // next restart. It then waits until the most up-to-date node (the
  // leader) has applied the snapshot count of entries since the stop
  // operation.
  // The expected behavior is that the follower comes back online and
  // rejoins the cluster, and then the active leader sends a snapshot
  // to the follower to force it to follow the leader's log.
  // As always, after recovery, each member must be able to process
  // client requests.
  SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 1;

  // SIGTERM_LEADER stops the active leader node but does not delete its
  // data directories on disk, for the next restart. Then it waits
  // "delay-ms" before recovering this failure, in order to trigger
  // election timeouts.
  // The expected behavior is that a new leader gets elected, and the
  // old leader comes back online and rejoins the cluster as a follower.
  // As always, after recovery, each member must be able to process
  // client requests.
  SIGTERM_LEADER = 2;

  // SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader node
  // but does not delete its data directories on disk, for the next
  // restart. It then waits until the most up-to-date node (the "new"
  // leader) has applied the snapshot count of entries since the stop
  // operation.
  // The expected behavior is that the cluster elects a new leader, and
  // the old leader comes back online and rejoins the cluster as a
  // follower. It then receives the snapshot from the new leader to
  // overwrite its store. As always, after recovery, each member must be
  // able to process client requests.
  SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT = 3;

  // SIGTERM_QUORUM stops a majority of nodes to make the whole cluster
  // inoperable, but does not delete the data directories on the stopped
  // nodes, for the next restart. It waits "delay-ms" before recovering
  // the failure.
  // The expected behavior is that the nodes come back online, thus the
  // cluster comes back operative as well. As always, after recovery,
  // each member must be able to process client requests.
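  // For example, in a 3-node cluster, two nodes are stopped.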
  SIGTERM_QUORUM = 4;

  // SIGTERM_ALL stops the whole cluster but does not delete the data
  // directories on disk, for the next restart. It waits "delay-ms"
  // before recovering this failure.
  // The expected behavior is that the nodes come back online, thus the
  // cluster comes back operative as well. As always, after recovery,
  // each member must be able to process client requests.
  SIGTERM_ALL = 5;

  // SIGQUIT_AND_REMOVE_ONE_FOLLOWER stops a randomly chosen follower
  // (non-leader), deletes its data directories on disk, and removes
  // this member from the cluster (membership reconfiguration). On
  // recovery, the tester adds a new member, and this member joins the
  // existing cluster with fresh data. It waits "delay-ms" before
  // recovering this failure. This simulates destroying one follower
  // machine, where the operator needs to add a new member from a fresh
  // machine.
  // The expected behavior is that a new member joins the existing
  // cluster, and then each member continues to process client requests.
  SIGQUIT_AND_REMOVE_ONE_FOLLOWER = 10;

  // SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a
  // randomly chosen follower, deletes its data directories on disk, and
  // removes this member from the cluster (membership reconfiguration).
  // On recovery, the tester adds a new member, and this member joins the
  // existing cluster with fresh data. On member removal, the cluster
  // waits until the most up-to-date node (the leader) has applied the
  // snapshot count of entries since the stop operation.
  // This simulates destroying a follower machine, where the operator
  // needs to add a new member from a fresh machine.
  // The expected behavior is that a new member joins the existing
  // cluster, and receives a snapshot from the active leader. As always,
  // after recovery, each member must be able to process client requests.
  SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 11;

  // SIGQUIT_AND_REMOVE_LEADER stops the active leader node, deletes its
  // data directories on disk, and removes this member from the cluster.
  // On recovery, the tester adds a new member, and this member joins the
  // existing cluster with fresh data. It waits "delay-ms" before
  // recovering this failure. This simulates destroying a leader machine,
  // where the operator needs to add a new member from a fresh machine.
  // The expected behavior is that a new member joins the existing
  // cluster, and then each member continues to process client requests.
  SIGQUIT_AND_REMOVE_LEADER = 12;

  // SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active
  // leader, deletes its data directories on disk, and removes this
  // member from the cluster (membership reconfiguration). On recovery,
  // the tester adds a new member, and this member joins the existing
  // cluster with fresh data. On member removal, the cluster waits until
  // the most up-to-date node (the new leader) has applied the snapshot
  // count of entries since the stop operation. This simulates destroying
  // a leader machine, where the operator needs to add a new member from
  // a fresh machine.
  // The expected behavior is that on member removal, the cluster elects
  // a new leader, and a new member joins the existing cluster and
  // receives a snapshot from the newly elected leader. As always, after
  // recovery, each member must be able to process client requests.
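  // Unlike SIGQUIT_AND_REMOVE_LEADER, recovery here is gated on the
  // snapshot count of applied entries rather than a fixed "delay-ms" wait.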
  SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT = 13;

  // SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH
  // first stops a majority of nodes and deletes the data directories on
  // those quorum nodes, to make the whole cluster inoperable. Now that
  // the quorum nodes and their data are totally destroyed, the cluster
  // cannot even remove the unavailable nodes (e.g. 2 out of 3 are lost,
  // so no leader can be elected).
  // Let's assume a 3-node cluster of nodes A, B, and C. One day, nodes A
  // and B are destroyed and all their data are gone. The only viable
  // solution is to recover from C's latest snapshot.
  //
  // To simulate:
  //  1. Assume node C is the current leader with the most up-to-date data.
  //  2. Download a snapshot from node C, before destroying nodes A and B.
  //  3. Destroy nodes A and B, making the whole cluster inoperable.
  //  4. Now node C cannot operate either.
  //  5. SIGTERM node C and remove its data directories.
  //  6. Restore a new seed member from node C's latest snapshot file.
  //  7. Add another member to establish a 2-node cluster.
  //  8. Add another member to establish a 3-node cluster.
  //  9. Add more members, if any.
  //
  // The expected behavior is that etcd successfully recovers from such a
  // disastrous situation, where only 1 node survives out of a 3-node
  // cluster: new members join the existing cluster, and the previous
  // data from the snapshot are still preserved after the recovery
  // process. As always, after recovery, each member must be able to
  // process client requests.
  SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH = 14;

  // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER drops all outgoing/incoming
  // packets from/to the peer port on a randomly chosen follower
  // (non-leader), and waits for "delay-ms" until recovery.
  // The expected behavior is that once the dropping operation is undone,
  // each member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER = 100;

  // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT drops
  // all outgoing/incoming packets from/to the peer port on a randomly
  // chosen follower (non-leader), and waits until the most up-to-date
  // node (the leader) has applied the snapshot count of entries since
  // the blackhole operation.
  // The expected behavior is that once the packet drop operation is
  // undone, the slow follower tries to catch up, possibly receiving the
  // snapshot from the active leader. As always, after recovery, each
  // member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 101;

  // BLACKHOLE_PEER_PORT_TX_RX_LEADER drops all outgoing/incoming packets
  // from/to the peer port on the active leader (isolated), and waits for
  // "delay-ms" until recovery, in order to trigger an election timeout.
  // The expected behavior is that after the election timeout, a new
  // leader gets elected, and once the dropping operation is undone, the
  // old leader comes back and rejoins the cluster as a follower. As
  // always, after recovery, each member must be able to process client
  // requests.
  BLACKHOLE_PEER_PORT_TX_RX_LEADER = 102;

  // BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT drops all
  // outgoing/incoming packets from/to the peer port on the active
  // leader, and waits until the most up-to-date node (the leader) has
  // applied the snapshot count of entries since the blackhole operation.
  // The expected behavior is that the cluster elects a new leader, and
  // once the dropping operation is undone, the old leader comes back and
  // rejoins the cluster as a follower. The slow follower tries to catch
  // up, likely receiving the snapshot from the new active leader. As
  // always, after recovery, each member must be able to process client
  // requests.
  BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 103;

  // BLACKHOLE_PEER_PORT_TX_RX_QUORUM drops all outgoing/incoming packets
  // from/to the peer ports on a majority of nodes of the cluster, thus
  // losing its leader and making the cluster inoperable. It waits for
  // "delay-ms" until recovery.
  // The expected behavior is that once the packet drop operation is
  // undone, the nodes come back online, thus the cluster comes back
  // operative. As always, after recovery, each member must be able to
  // process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_QUORUM = 104;

  // BLACKHOLE_PEER_PORT_TX_RX_ALL drops all outgoing/incoming packets
  // from/to the peer ports on all nodes, thus making the cluster totally
  // inoperable. It waits for "delay-ms" until recovery.
  // The expected behavior is that once the packet drop operation is
  // undone, the nodes come back online, thus the cluster comes back
  // operative. As always, after recovery, each member must be able to
  // process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_ALL = 105;

  // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming packets
  // from/to the peer port on a randomly chosen follower (non-leader).
  // It waits for "delay-ms" until recovery.
  // The expected behavior is that once the packet delay operation is
  // undone, the follower comes back and tries to catch up with the
  // latest changes from the cluster. And as always, after recovery, each
  // member must be able to process client requests.
  DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 200;

  // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming
  // packets from/to the peer port on a randomly chosen follower
  // (non-leader) with a randomized time duration (thus isolated). It
  // waits for "delay-ms" until recovery.
  // The expected behavior is that once the packet delay operation is
  // undone, each member must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 201;

  // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on a randomly chosen
  // follower (non-leader), and waits until the most up-to-date node (the
  // leader) has applied the snapshot count of entries since the delay
  // operation.
  // The expected behavior is that the delayed follower gets isolated and
  // falls behind the current active leader, and once the delay operation
  // is undone, the slow follower comes back and catches up, possibly
  // receiving a snapshot from the active leader. As always, after
  // recovery, each member must be able to process client requests.
  DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 202;

  // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
  // delays outgoing/incoming packets from/to the peer port on a randomly
  // chosen follower (non-leader) with a randomized time duration, and
  // waits until the most up-to-date node (the leader) has applied the
  // snapshot count of entries since the delay operation.
  // The expected behavior is that the delayed follower gets isolated and
  // falls behind the current active leader, and once the delay operation
  // is undone, the slow follower comes back and catches up, possibly
  // receiving a snapshot from the active leader. As always, after
  // recovery, each member must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 203;

  // DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets
  // from/to the peer port on the active leader. It waits for "delay-ms"
  // until recovery.
  // The expected behavior is that the cluster may elect a new leader,
  // and once the packet delay operation is undone, the (old) leader
  // comes back and tries to catch up with the latest changes from the
  // cluster. As always, after recovery, each member must be able to
  // process client requests.
  DELAY_PEER_PORT_TX_RX_LEADER = 204;

  // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets
  // from/to the peer port on the active leader with a randomized time
  // duration. It waits for "delay-ms" until recovery.
  // The expected behavior is that the cluster may elect a new leader,
  // and once the packet delay operation is undone, the (old) leader
  // comes back and tries to catch up with the latest changes from the
  // cluster. As always, after recovery, each member must be able to
  // process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_LEADER = 205;

  // DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on the active
  // leader, and waits until the most up-to-date node (the current or new
  // leader) has applied the snapshot count of entries since the delay
  // operation.
  // The expected behavior is that the cluster may elect a new leader,
  // the old leader gets isolated and falls behind the current active
  // leader, and once the delay operation is undone, the slow follower
  // comes back and catches up, likely receiving a snapshot from the
  // active leader. As always, after recovery, each member must be able
  // to process client requests.
  DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 206;

  // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on the active
  // leader, with a randomized time duration, and waits until the most
  // up-to-date node (the current or new leader) has applied the snapshot
  // count of entries since the delay operation.
  // The expected behavior is that the cluster may elect a new leader,
  // the old leader gets isolated and falls behind the current active
  // leader, and once the delay operation is undone, the slow follower
  // comes back and catches up, likely receiving a snapshot from the
  // active leader. As always, after recovery, each member must be able
  // to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 207;

  // DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets
  // from/to the peer ports on a majority of nodes of the cluster. It
  // waits for "delay-ms" until recovery, likely triggering election
  // timeouts.
  // The expected behavior is that the cluster may elect a new leader,
  // while a quorum of nodes struggles with the slow network, and once
  // the delay operation is undone, the nodes come back and the cluster
  // comes back operative. As always, after recovery, each member must be
  // able to process client requests.
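  // Because Raft commits require a quorum, delaying a majority can stall
  // writes cluster-wide, not just on the delayed members.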
  DELAY_PEER_PORT_TX_RX_QUORUM = 208;

  // RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets
  // from/to the peer ports on a majority of nodes of the cluster, with
  // randomized time durations. It waits for "delay-ms" until recovery,
  // likely triggering election timeouts.
  // The expected behavior is that the cluster may elect a new leader,
  // while a quorum of nodes struggles with the slow network, and once
  // the delay operation is undone, the nodes come back and the cluster
  // comes back operative. As always, after recovery, each member must be
  // able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM = 209;

  // DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets from/to
  // the peer ports on all nodes. It waits for "delay-ms" until recovery,
  // likely triggering election timeouts.
  // The expected behavior is that the cluster may become totally
  // inoperable, struggling with slow networks across the whole cluster.
  // Once the delay operation is undone, the nodes come back and the
  // cluster comes back operative. As always, after recovery, each member
  // must be able to process client requests.
  DELAY_PEER_PORT_TX_RX_ALL = 210;

  // RANDOM_DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets
  // from/to the peer ports on all nodes, with randomized time durations.
  // It waits for "delay-ms" until recovery, likely triggering election
  // timeouts.
  // The expected behavior is that the cluster may become totally
  // inoperable, struggling with slow networks across the whole cluster.
  // Once the delay operation is undone, the nodes come back and the
  // cluster comes back operative. As always, after recovery, each member
  // must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_ALL = 211;

  // NO_FAIL_WITH_STRESS stops injecting failures while testing
  // consistency and correctness under pressure loads, for the duration
  // of "delay-ms". The goal is to ensure the cluster is still making
  // progress on recovery, and to verify that the system does not
  // deadlock following a sequence of failure injections.
  // The expected behavior is that the cluster remains fully operative in
  // a healthy condition. As always, after recovery, each member must be
  // able to process client requests.
  NO_FAIL_WITH_STRESS = 300;

  // NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS neither injects failures nor
  // sends stressing client requests to the cluster, for the duration of
  // "delay-ms". The goal is to ensure the cluster is still making
  // progress on recovery, and to verify that the system does not
  // deadlock following a sequence of failure injections.
  // The expected behavior is that the cluster remains fully operative in
  // a healthy condition, and client requests during the liveness period
  // succeed without errors.
  // Note: this is how Google Chubby does failure injection testing; see
  // https://static.googleusercontent.com/media/research.google.com/en//archive/paxos_made_live.pdf.
  NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS = 301;

  // FAILPOINTS injects failpoints into the etcd server runtime,
  // triggering panics in critical code paths.
  FAILPOINTS = 400;

  // EXTERNAL runs external failure injection scripts.
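  // The script path is taken from Tester.ExternalExecPath above.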
  EXTERNAL = 500;
}

enum Stresser {
  KV = 0;
  LEASE = 1;
  ELECTION_RUNNER = 2;
  WATCH_RUNNER = 3;
  LOCK_RACER_RUNNER = 4;
  LEASE_RUNNER = 5;
}

enum Checker {
  KV_HASH = 0;
  LEASE_EXPIRE = 1;
  RUNNER = 2;
  NO_CHECK = 3;
}
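
// Usage sketch (not part of the schema): a minimal, hedged Go example of
// how a tester process could drive an agent over the bidirectional
// Transport stream, assuming the standard gogo/protobuf + grpc-go
// generated bindings for this file. "agentAddr" and "restartMember" are
// illustrative names, not part of the etcd functional framework; a real
// request would also populate the Member and Tester fields.
//
//    package main
//
//    import (
//        "context"
//
//        "go.etcd.io/etcd/functional/rpcpb"
//        "google.golang.org/grpc"
//    )
//
//    // restartMember sends a single RESTART_ETCD request and waits for
//    // the agent's response over the stream.
//    func restartMember(agentAddr string) (*rpcpb.Response, error) {
//        conn, err := grpc.Dial(agentAddr, grpc.WithInsecure())
//        if err != nil {
//            return nil, err
//        }
//        defer conn.Close()
//        stream, err := rpcpb.NewTransportClient(conn).Transport(context.Background())
//        if err != nil {
//            return nil, err
//        }
//        if err := stream.Send(&rpcpb.Request{Operation: rpcpb.Operation_RESTART_ETCD}); err != nil {
//            return nil, err
//        }
//        return stream.Recv() // Response.Success reports the agent-side result
//    }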