// Source: github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/functional/rpcpb/rpc.proto
//
// Package rpcpb defines the messages, enums, and the streaming Transport
// service declared in this file: Request/Response envelopes, Member and
// Tester configuration, etcd flags, agent Operations, and failure Cases.
syntax = "proto3";
package rpcpb;

import "gogoproto/gogo.proto";

// File-wide gogoproto code generation options: generate Marshal, Size and
// Unmarshal methods for every message, and do not generate getters.
option (gogoproto.marshaler_all) = true;
option (gogoproto.sizer_all) = true;
option (gogoproto.unmarshaler_all) = true;
option (gogoproto.goproto_getters_all) = false;

// Request is the message streamed into the Transport service. It carries
// the Operation to perform together with Member and Tester configuration.
message Request {
  Operation Operation = 1;
  // Member contains the same Member object from tester configuration.
  Member Member = 2;
  // Tester contains tester configuration.
  Tester Tester = 3;
}

// SnapshotInfo contains SAVE_SNAPSHOT request results.
message SnapshotInfo {
  string MemberName = 1;
  repeated string MemberClientURLs = 2;
  string SnapshotPath = 3;
  string SnapshotFileSize = 4;
  string SnapshotTotalSize = 5;
  int64 SnapshotTotalKey = 6;
  int64 SnapshotHash = 7;
  int64 SnapshotRevision = 8;
  string Took = 9;
  string Version = 10;
}

// Response is the message streamed back from the Transport service.
message Response {
  bool Success = 1;
  string Status = 2;

  // Member contains the same Member object from tester request.
  Member Member = 3;

  // SnapshotInfo contains SAVE_SNAPSHOT request results.
  SnapshotInfo SnapshotInfo = 4;
}

// Transport is a bidirectional-streaming service: a stream of Request
// messages in, a stream of Response messages out.
service Transport {
  rpc Transport(stream Request) returns (stream Response) {}
}

// Member holds per-member configuration: agent addresses, etcd flags,
// TLS material, snapshot settings, and failpoints.
message Member {
  // EtcdExec is the executable etcd binary path in agent server.
  string EtcdExec = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec\""];

  // AgentAddr is the agent HTTP server address.
  string AgentAddr = 11 [(gogoproto.moretags) = "yaml:\"agent-addr\""];
  // FailpointHTTPAddr is the agent's failpoints HTTP server address.
  string FailpointHTTPAddr = 12 [(gogoproto.moretags) = "yaml:\"failpoint-http-addr\""];

  // BaseDir is the base directory where all logs and etcd data are stored.
  string BaseDir = 101 [(gogoproto.moretags) = "yaml:\"base-dir\""];

  // EtcdClientProxy is true when client traffic needs to be proxied.
  // If true, listen client URL port must be different than advertise client URL port.
  bool EtcdClientProxy = 201 [(gogoproto.moretags) = "yaml:\"etcd-client-proxy\""];
  // EtcdPeerProxy is true when peer traffic needs to be proxied.
  // If true, listen peer URL port must be different than advertise peer URL port.
  bool EtcdPeerProxy = 202 [(gogoproto.moretags) = "yaml:\"etcd-peer-proxy\""];

  // EtcdClientEndpoint is the etcd client endpoint.
  string EtcdClientEndpoint = 301 [(gogoproto.moretags) = "yaml:\"etcd-client-endpoint\""];
  // Etcd defines etcd binary configuration flags.
  Etcd Etcd = 302 [(gogoproto.moretags) = "yaml:\"etcd\""];
  // EtcdOnSnapshotRestore defines one-time use configuration during etcd
  // snapshot recovery process.
  Etcd EtcdOnSnapshotRestore = 303;

  // ClientCertData contains cert file contents from this member's etcd server.
  string ClientCertData = 401 [(gogoproto.moretags) = "yaml:\"client-cert-data\""];
  string ClientCertPath = 402 [(gogoproto.moretags) = "yaml:\"client-cert-path\""];
  // ClientKeyData contains key file contents from this member's etcd server.
  string ClientKeyData = 403 [(gogoproto.moretags) = "yaml:\"client-key-data\""];
  string ClientKeyPath = 404 [(gogoproto.moretags) = "yaml:\"client-key-path\""];
  // ClientTrustedCAData contains trusted CA file contents from this member's etcd server.
  string ClientTrustedCAData = 405 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-data\""];
  string ClientTrustedCAPath = 406 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-path\""];

  // PeerCertData contains cert file contents from this member's etcd server.
  string PeerCertData = 501 [(gogoproto.moretags) = "yaml:\"peer-cert-data\""];
  string PeerCertPath = 502 [(gogoproto.moretags) = "yaml:\"peer-cert-path\""];
  // PeerKeyData contains key file contents from this member's etcd server.
  string PeerKeyData = 503 [(gogoproto.moretags) = "yaml:\"peer-key-data\""];
  string PeerKeyPath = 504 [(gogoproto.moretags) = "yaml:\"peer-key-path\""];
  // PeerTrustedCAData contains trusted CA file contents from this member's etcd server.
  string PeerTrustedCAData = 505 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-data\""];
  string PeerTrustedCAPath = 506 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-path\""];

  // SnapshotPath is the snapshot file path to store or restore from.
  string SnapshotPath = 601 [(gogoproto.moretags) = "yaml:\"snapshot-path\""];
  // SnapshotInfo contains last SAVE_SNAPSHOT request results.
  SnapshotInfo SnapshotInfo = 602;

  // Failpoints is the GOFAIL_FAILPOINTS environment variable value to use when starting etcd.
  string Failpoints = 701 [(gogoproto.moretags) = "yaml:\"failpoints\""];
}

// Tester contains tester configuration.
message Tester {
  string DataDir = 1 [(gogoproto.moretags) = "yaml:\"data-dir\""];
  string Network = 2 [(gogoproto.moretags) = "yaml:\"network\""];
  string Addr = 3 [(gogoproto.moretags) = "yaml:\"addr\""];

  // DelayLatencyMs is the delay latency in milliseconds,
  // to inject to simulated slow network.
  uint32 DelayLatencyMs = 11 [(gogoproto.moretags) = "yaml:\"delay-latency-ms\""];
  // DelayLatencyMsRv is the delay latency random variable in milliseconds.
  uint32 DelayLatencyMsRv = 12 [(gogoproto.moretags) = "yaml:\"delay-latency-ms-rv\""];
  // UpdatedDelayLatencyMs is the updated delay latency in milliseconds,
  // to inject to simulated slow network. It's the final latency to apply,
  // in case the latency numbers are randomly generated from given delay latency field.
  uint32 UpdatedDelayLatencyMs = 13 [(gogoproto.moretags) = "yaml:\"updated-delay-latency-ms\""];

  // RoundLimit is the limit of rounds to run failure set (-1 to run without limits).
  int32 RoundLimit = 21 [(gogoproto.moretags) = "yaml:\"round-limit\""];
  // If ExitOnCaseFail is true, then exit tester on first failure.
  // Note that the YAML key is "exit-on-failure".
  bool ExitOnCaseFail = 22 [(gogoproto.moretags) = "yaml:\"exit-on-failure\""];
  // EnablePprof is true to enable profiler.
  bool EnablePprof = 23 [(gogoproto.moretags) = "yaml:\"enable-pprof\""];

  // CaseDelayMs is the delay duration after failure is injected.
  // Useful when triggering snapshot or no-op failure cases.
  uint32 CaseDelayMs = 31 [(gogoproto.moretags) = "yaml:\"case-delay-ms\""];
  // CaseShuffle is true to randomize failure injecting order.
  bool CaseShuffle = 32 [(gogoproto.moretags) = "yaml:\"case-shuffle\""];
  // Cases is the selected test cases to schedule.
  // If empty, run all failure cases.
  repeated string Cases = 33 [(gogoproto.moretags) = "yaml:\"cases\""];
  // FailpointCommands is the list of "gofail" commands
  // (e.g. panic("etcd-tester"),1*sleep(1000)).
  repeated string FailpointCommands = 34 [(gogoproto.moretags) = "yaml:\"failpoint-commands\""];

  // RunnerExecPath is a path of etcd-runner binary.
  string RunnerExecPath = 41 [(gogoproto.moretags) = "yaml:\"runner-exec-path\""];
  // ExternalExecPath is a path of script for enabling/disabling an external fault injector.
  string ExternalExecPath = 42 [(gogoproto.moretags) = "yaml:\"external-exec-path\""];

  // Stressers is the list of stresser types:
  // KV, LEASE, ELECTION_RUNNER, WATCH_RUNNER, LOCK_RACER_RUNNER, LEASE_RUNNER.
  repeated Stresser Stressers = 101 [(gogoproto.moretags) = "yaml:\"stressers\""];
  // Checkers is the list of consistency checker types:
  // KV_HASH, LEASE_EXPIRE, NO_CHECK, RUNNER.
  // Leave empty to skip consistency checks.
  repeated string Checkers = 102 [(gogoproto.moretags) = "yaml:\"checkers\""];

  // StressKeySize is the size of each small key written into etcd.
  int32 StressKeySize = 201 [(gogoproto.moretags) = "yaml:\"stress-key-size\""];
  // StressKeySizeLarge is the size of each large key written into etcd.
  int32 StressKeySizeLarge = 202 [(gogoproto.moretags) = "yaml:\"stress-key-size-large\""];
  // StressKeySuffixRange is the count of key range written into etcd.
  // Stress keys are created with "fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange))".
  int32 StressKeySuffixRange = 203 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range\""];
  // StressKeySuffixRangeTxn is the count of key range written into etcd txn (max 100).
  // Stress keys are created with "fmt.Sprintf("/k%03d", i)".
  int32 StressKeySuffixRangeTxn = 204 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range-txn\""];
  // StressKeyTxnOps is the number of operations per a transaction (max 64).
  int32 StressKeyTxnOps = 205 [(gogoproto.moretags) = "yaml:\"stress-key-txn-ops\""];

  // StressClients is the number of concurrent stressing clients
  // with "one" shared TCP connection.
  int32 StressClients = 301 [(gogoproto.moretags) = "yaml:\"stress-clients\""];
  // StressQPS is the maximum number of stresser requests per second.
  int32 StressQPS = 302 [(gogoproto.moretags) = "yaml:\"stress-qps\""];
}

// StresserType enumerates the kinds of stressers.
// The numeric values are part of the schema and must not be renumbered.
enum StresserType {
  KV_WRITE_SMALL = 0;
  KV_WRITE_LARGE = 1;
  KV_READ_ONE_KEY = 2;
  KV_READ_RANGE = 3;
  KV_DELETE_ONE_KEY = 4;
  KV_DELETE_RANGE = 5;
  KV_TXN_WRITE_DELETE = 6;

  LEASE = 10;

  ELECTION_RUNNER = 20;
  WATCH_RUNNER = 31;
  LOCK_RACER_RUNNER = 41;
  LEASE_RUNNER = 51;
}

// Stresser is one entry of Tester.Stressers: a type and a weight.
// NOTE(review): Type is a plain string, presumably holding a StresserType
// name -- confirm against the tester code.
message Stresser {
  string Type = 1 [(gogoproto.moretags) = "yaml:\"type\""];
  double Weight = 2 [(gogoproto.moretags) = "yaml:\"weight\""];
}

// Checker enumerates the consistency checker types.
enum Checker {
  KV_HASH = 0;
  LEASE_EXPIRE = 1;
  RUNNER = 2;
  NO_CHECK = 3;
  SHORT_TTL_LEASE_EXPIRE = 4;
}

// Etcd defines etcd binary configuration flags.
message Etcd {
  string Name = 1 [(gogoproto.moretags) = "yaml:\"name\""];
  string DataDir = 2 [(gogoproto.moretags) = "yaml:\"data-dir\""];
  string WALDir = 3 [(gogoproto.moretags) = "yaml:\"wal-dir\""];

  // HeartbeatIntervalMs is the time (in milliseconds) of a heartbeat interval.
  // Default value is 100, which is 100ms.
  int64 HeartbeatIntervalMs = 11 [(gogoproto.moretags) = "yaml:\"heartbeat-interval\""];
  // ElectionTimeoutMs is the time (in milliseconds) for an election to timeout.
  // Default value is 1000, which is 1s.
  int64 ElectionTimeoutMs = 12 [(gogoproto.moretags) = "yaml:\"election-timeout\""];

  repeated string ListenClientURLs = 21 [(gogoproto.moretags) = "yaml:\"listen-client-urls\""];
  repeated string AdvertiseClientURLs = 22 [(gogoproto.moretags) = "yaml:\"advertise-client-urls\""];
  bool ClientAutoTLS = 23 [(gogoproto.moretags) = "yaml:\"auto-tls\""];
  bool ClientCertAuth = 24 [(gogoproto.moretags) = "yaml:\"client-cert-auth\""];
  string ClientCertFile = 25 [(gogoproto.moretags) = "yaml:\"cert-file\""];
  string ClientKeyFile = 26 [(gogoproto.moretags) = "yaml:\"key-file\""];
  string ClientTrustedCAFile = 27 [(gogoproto.moretags) = "yaml:\"trusted-ca-file\""];

  repeated string ListenPeerURLs = 31 [(gogoproto.moretags) = "yaml:\"listen-peer-urls\""];
  repeated string AdvertisePeerURLs = 32 [(gogoproto.moretags) = "yaml:\"initial-advertise-peer-urls\""];
  bool PeerAutoTLS = 33 [(gogoproto.moretags) = "yaml:\"peer-auto-tls\""];
  bool PeerClientCertAuth = 34 [(gogoproto.moretags) = "yaml:\"peer-client-cert-auth\""];
  string PeerCertFile = 35 [(gogoproto.moretags) = "yaml:\"peer-cert-file\""];
  string PeerKeyFile = 36 [(gogoproto.moretags) = "yaml:\"peer-key-file\""];
  string PeerTrustedCAFile = 37 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-file\""];

  string InitialCluster = 41 [(gogoproto.moretags) = "yaml:\"initial-cluster\""];
  string InitialClusterState = 42 [(gogoproto.moretags) = "yaml:\"initial-cluster-state\""];
  string InitialClusterToken = 43 [(gogoproto.moretags) = "yaml:\"initial-cluster-token\""];

  int64 SnapshotCount = 51 [(gogoproto.moretags) = "yaml:\"snapshot-count\""];
  int64 QuotaBackendBytes = 52 [(gogoproto.moretags) = "yaml:\"quota-backend-bytes\""];

  bool PreVote = 63 [(gogoproto.moretags) = "yaml:\"pre-vote\""];
  bool InitialCorruptCheck = 64 [(gogoproto.moretags) = "yaml:\"initial-corrupt-check\""];

  string Logger = 71 [(gogoproto.moretags) = "yaml:\"logger\""];
  // LogOutputs is the log file to store current etcd server logs.
  repeated string LogOutputs = 72 [(gogoproto.moretags) = "yaml:\"log-outputs\""];
  string LogLevel = 73 [(gogoproto.moretags) = "yaml:\"log-level\""];

  bool SocketReuseAddress = 81 [(gogoproto.moretags) = "yaml:\"socket-reuse-address\""];
  bool SocketReusePort = 82 [(gogoproto.moretags) = "yaml:\"socket-reuse-port\""];
}

// Operation enumerates the operations carried in Request.Operation.
enum Operation {
  // NOT_STARTED is the agent status before etcd first start.
  NOT_STARTED = 0;

  // INITIAL_START_ETCD is only called to start etcd, the very first time.
  INITIAL_START_ETCD = 10;
  // RESTART_ETCD is sent to restart killed etcd.
  RESTART_ETCD = 11;

  // SIGTERM_ETCD stops etcd process while keeping data directories
  // and previous etcd configurations.
  SIGTERM_ETCD = 20;
  // SIGQUIT_ETCD_AND_REMOVE_DATA kills etcd process and removes all data
  // directories to simulate destroying the whole machine.
  SIGQUIT_ETCD_AND_REMOVE_DATA = 21;

  // SAVE_SNAPSHOT is sent to trigger local member to download its snapshot
  // onto its local disk with the specified path from tester.
  SAVE_SNAPSHOT = 30;
  // RESTORE_RESTART_FROM_SNAPSHOT is sent to trigger local member to
  // restore a cluster from existing snapshot from disk, and restart
  // an etcd instance from recovered data.
  RESTORE_RESTART_FROM_SNAPSHOT = 31;
  // RESTART_FROM_SNAPSHOT is sent to trigger local member to restart
  // and join an existing cluster that has been recovered from a snapshot.
  // Local member joins this cluster with fresh data.
  RESTART_FROM_SNAPSHOT = 32;

  // SIGQUIT_ETCD_AND_ARCHIVE_DATA is sent when consistency check failed,
  // thus need to archive etcd data directories.
  SIGQUIT_ETCD_AND_ARCHIVE_DATA = 40;
  // SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT destroys etcd process,
  // etcd data, and agent server.
  SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT = 41;

  // BLACKHOLE_PEER_PORT_TX_RX drops all outgoing/incoming packets from/to
  // the peer port on target member's peer port.
  BLACKHOLE_PEER_PORT_TX_RX = 100;
  // UNBLACKHOLE_PEER_PORT_TX_RX removes outgoing/incoming packet dropping.
  UNBLACKHOLE_PEER_PORT_TX_RX = 101;

  // DELAY_PEER_PORT_TX_RX delays all outgoing/incoming packets from/to
  // the peer port on target member's peer port.
  DELAY_PEER_PORT_TX_RX = 200;
  // UNDELAY_PEER_PORT_TX_RX removes all outgoing/incoming delays.
  UNDELAY_PEER_PORT_TX_RX = 201;
}

// Case defines various system faults or test case in distributed systems,
// in order to verify correct behavior of etcd servers and clients.
enum Case {
  // SIGTERM_ONE_FOLLOWER stops a randomly chosen follower (non-leader)
  // but does not delete its data directories on disk for next restart.
  // It waits "delay-ms" before recovering this failure.
  // The expected behavior is that the follower comes back online
  // and rejoins the cluster, and then each member continues to process
  // client requests ('Put' request that requires Raft consensus).
  SIGTERM_ONE_FOLLOWER = 0;

  // SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly chosen
  // follower but does not delete its data directories on disk for next
  // restart. And waits until most up-to-date node (leader) applies the
  // snapshot count of entries since the stop operation.
  // The expected behavior is that the follower comes back online and
  // rejoins the cluster, and then active leader sends snapshot
  // to the follower to force it to follow the leader's log.
  // As always, after recovery, each member must be able to process
  // client requests.
  SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 1;

  // SIGTERM_LEADER stops the active leader node but does not delete its
  // data directories on disk for next restart. Then it waits "delay-ms"
  // before recovering this failure, in order to trigger election timeouts.
  // The expected behavior is that a new leader gets elected, and the
  // old leader comes back online and rejoins the cluster as a follower.
  // As always, after recovery, each member must be able to process
  // client requests.
  SIGTERM_LEADER = 2;

  // SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader node
  // but does not delete its data directories on disk for next restart.
  // And waits until most up-to-date node ("new" leader) applies the
  // snapshot count of entries since the stop operation.
  // The expected behavior is that cluster elects a new leader, and the
  // old leader comes back online and rejoins the cluster as a follower.
  // And it receives the snapshot from the new leader to overwrite its
  // store. As always, after recovery, each member must be able to
  // process client requests.
  SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT = 3;

  // SIGTERM_QUORUM stops majority number of nodes to make the whole cluster
  // inoperable but does not delete data directories on stopped nodes
  // for next restart. And it waits "delay-ms" before recovering failure.
  // The expected behavior is that nodes come back online, thus cluster
  // comes back operative as well. As always, after recovery, each member
  // must be able to process client requests.
  SIGTERM_QUORUM = 4;

  // SIGTERM_ALL stops the whole cluster but does not delete data directories
  // on disk for next restart. And it waits "delay-ms" before recovering
  // this failure.
  // The expected behavior is that nodes come back online, thus cluster
  // comes back operative as well. As always, after recovery, each member
  // must be able to process client requests.
  SIGTERM_ALL = 5;

  // SIGQUIT_AND_REMOVE_ONE_FOLLOWER stops a randomly chosen follower
  // (non-leader), deletes its data directories on disk, and removes
  // this member from cluster (membership reconfiguration). On recovery,
  // tester adds a new member, and this member joins the existing cluster
  // with fresh data. It waits "delay-ms" before recovering this
  // failure. This simulates destroying one follower machine, where operator
  // needs to add a new member from a fresh machine.
  // The expected behavior is that a new member joins the existing cluster,
  // and then each member continues to process client requests.
  SIGQUIT_AND_REMOVE_ONE_FOLLOWER = 10;

  // SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly
  // chosen follower, deletes its data directories on disk, and removes
  // this member from cluster (membership reconfiguration). On recovery,
  // tester adds a new member, and this member joins the existing cluster
  // with fresh data. On member remove, cluster waits until most up-to-date
  // node (leader) applies the snapshot count of entries since the stop
  // operation. This simulates destroying a follower machine, where operator
  // needs to add a new member from a fresh machine.
  // The expected behavior is that a new member joins the existing cluster,
  // and receives a snapshot from the active leader. As always, after
  // recovery, each member must be able to process client requests.
  SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 11;

  // SIGQUIT_AND_REMOVE_LEADER stops the active leader node, deletes its
  // data directories on disk, and removes this member from cluster.
  // On recovery, tester adds a new member, and this member joins the
  // existing cluster with fresh data. It waits "delay-ms" before
  // recovering this failure. This simulates destroying a leader machine,
  // where operator needs to add a new member from a fresh machine.
  // The expected behavior is that a new member joins the existing cluster,
  // and then each member continues to process client requests.
  SIGQUIT_AND_REMOVE_LEADER = 12;

  // SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader,
  // deletes its data directories on disk, and removes this member from
  // cluster (membership reconfiguration). On recovery, tester adds a new
  // member, and this member joins the existing cluster with fresh data.
  // On member remove, cluster waits until most up-to-date node (new leader)
  // applies the snapshot count of entries since the stop operation. This
  // simulates destroying a leader machine, where operator needs to add a
  // new member from a fresh machine.
  // The expected behavior is that on member remove, cluster elects a new
  // leader, and a new member joins the existing cluster and receives a
  // snapshot from the newly elected leader. As always, after recovery, each
  // member must be able to process client requests.
  SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT = 13;

  // SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH first
  // stops majority number of nodes, deletes data directories on those quorum
  // nodes, to make the whole cluster inoperable. Now that quorum and their
  // data are totally destroyed, cluster cannot even remove unavailable nodes
  // (e.g. 2 out of 3 are lost, so no leader can be elected).
  // Let's assume 3-node cluster of node A, B, and C. One day, node A and B
  // are destroyed and all their data are gone. The only viable solution is
  // to recover from C's latest snapshot.
  //
  // To simulate:
  //  1. Assume node C is the current leader with most up-to-date data.
  //  2. Download snapshot from node C, before destroying node A and B.
  //  3. Destroy node A and B, and make the whole cluster inoperable.
  //  4. Now node C cannot operate either.
  //  5. SIGTERM node C and remove its data directories.
  //  6. Restore a new seed member from node C's latest snapshot file.
  //  7. Add another member to establish 2-node cluster.
  //  8. Add another member to establish 3-node cluster.
  //  9. Add more if any.
  //
  // The expected behavior is that etcd successfully recovers from such
  // disastrous situation as only 1-node survives out of 3-node cluster,
  // new members join the existing cluster, and previous data from snapshot
  // are still preserved after recovery process. As always, after recovery,
  // each member must be able to process client requests.
  SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH = 14;

  // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER drops all outgoing/incoming
  // packets from/to the peer port on a randomly chosen follower
  // (non-leader), and waits for "delay-ms" until recovery.
  // The expected behavior is that once dropping operation is undone,
  // each member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER = 100;

  // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT drops
  // all outgoing/incoming packets from/to the peer port on a randomly
  // chosen follower (non-leader), and waits until most up-to-date node
  // (leader) applies the snapshot count of entries since the blackhole
  // operation.
  // The expected behavior is that once packet drop operation is undone,
  // the slow follower tries to catch up, possibly receiving the snapshot
  // from the active leader. As always, after recovery, each member must
  // be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 101;

  // BLACKHOLE_PEER_PORT_TX_RX_LEADER drops all outgoing/incoming packets
  // from/to the peer port on the active leader (isolated), and waits for
  // "delay-ms" until recovery, in order to trigger election timeout.
  // The expected behavior is that after election timeout, a new leader gets
  // elected, and once dropping operation is undone, the old leader comes
  // back and rejoins the cluster as a follower. As always, after recovery,
  // each member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_LEADER = 102;

  // BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT drops all
  // outgoing/incoming packets from/to the peer port on the active leader,
  // and waits until most up-to-date node (leader) applies the snapshot
  // count of entries since the blackhole operation.
  // The expected behavior is that cluster elects a new leader, and once
  // dropping operation is undone, the old leader comes back and rejoins
  // the cluster as a follower. The slow follower tries to catch up, likely
  // receiving the snapshot from the new active leader. As always, after
  // recovery, each member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 103;

  // BLACKHOLE_PEER_PORT_TX_RX_QUORUM drops all outgoing/incoming packets
  // from/to the peer ports on majority nodes of cluster, thus losing its
  // leader and cluster being inoperable. And it waits for "delay-ms"
  // until recovery.
  // The expected behavior is that once packet drop operation is undone,
  // nodes come back online, thus cluster comes back operative. As always,
  // after recovery, each member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_QUORUM = 104;

  // BLACKHOLE_PEER_PORT_TX_RX_ALL drops all outgoing/incoming packets
  // from/to the peer ports on all nodes, thus making cluster totally
  // inoperable. It waits for "delay-ms" until recovery.
  // The expected behavior is that once packet drop operation is undone,
  // nodes come back online, thus cluster comes back operative. As always,
  // after recovery, each member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_ALL = 105;

  // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming packets
  // from/to the peer port on a randomly chosen follower (non-leader).
  // It waits for "delay-ms" until recovery.
  // The expected behavior is that once packet delay operation is undone,
  // the follower comes back and tries to catch up with latest changes from
  // cluster. And as always, after recovery, each member must be able to
  // process client requests.
  DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 200;

  // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming
  // packets from/to the peer port on a randomly chosen follower
  // (non-leader) with a randomized time duration (thus isolated). It
  // waits for "delay-ms" until recovery.
  // The expected behavior is that once packet delay operation is undone,
  // each member must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 201;

  // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on a randomly chosen
  // follower (non-leader), and waits until most up-to-date node (leader)
  // applies the snapshot count of entries since the delay operation.
  // The expected behavior is that the delayed follower gets isolated
  // and behind the current active leader, and once delay operation is undone,
  // the slow follower comes back and catches up possibly receiving snapshot
  // from the active leader. As always, after recovery, each member must be
  // able to process client requests.
  DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 202;

  // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on a randomly chosen
  // follower (non-leader) with a randomized time duration, and waits until
  // most up-to-date node (leader) applies the snapshot count of entries
  // since the delay operation.
  // The expected behavior is that the delayed follower gets isolated
  // and behind the current active leader, and once delay operation is undone,
  // the slow follower comes back and catches up, possibly receiving a
  // snapshot from the active leader. As always, after recovery, each member
  // must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 203;

  // DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets from/to
  // the peer port on the active leader. And waits for "delay-ms" until
  // recovery.
  // The expected behavior is that cluster may elect a new leader, and
  // once packet delay operation is undone, the (old) leader comes back
  // and tries to catch up with latest changes from cluster. As always,
  // after recovery, each member must be able to process client requests.
  DELAY_PEER_PORT_TX_RX_LEADER = 204;

  // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets
  // from/to the peer port on the active leader with a randomized time
  // duration. And waits for "delay-ms" until recovery.
  // The expected behavior is that cluster may elect a new leader, and
  // once packet delay operation is undone, the (old) leader comes back
  // and tries to catch up with latest changes from cluster. As always,
  // after recovery, each member must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_LEADER = 205;

  // DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on the active leader,
  // and waits until most up-to-date node (current or new leader) applies the
  // snapshot count of entries since the delay operation.
  // The expected behavior is that cluster may elect a new leader, and
  // the old leader gets isolated and behind the current active leader,
  // and once delay operation is undone, the slow follower comes back
  // and catches up, likely receiving a snapshot from the active leader.
  // As always, after recovery, each member must be able to process client
  // requests.
  DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 206;

  // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on the active leader,
  // with a randomized time duration. And it waits until most up-to-date node
  // (current or new leader) applies the snapshot count of entries since the
  // delay operation.
  // The expected behavior is that cluster may elect a new leader, and
  // the old leader gets isolated and behind the current active leader,
  // and once delay operation is undone, the slow follower comes back
  // and catches up, likely receiving a snapshot from the active leader.
  // As always, after recovery, each member must be able to process client
  // requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 207;

  // DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets from/to
  // the peer ports on majority nodes of cluster. And it waits for
  // "delay-ms" until recovery, likely to trigger election timeouts.
  // The expected behavior is that cluster may elect a new leader, while
  // quorum of nodes struggle with slow networks, and once delay operation
  // is undone, nodes come back and cluster comes back operative. As always,
  // after recovery, each member must be able to process client requests.
  DELAY_PEER_PORT_TX_RX_QUORUM = 208;

  // RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets
  // from/to the peer ports on majority nodes of cluster, with randomized
  // time durations. And it waits for "delay-ms" until recovery, likely
  // to trigger election timeouts.
  // The expected behavior is that cluster may elect a new leader, while
  // quorum of nodes struggle with slow networks, and once delay operation
  // is undone, nodes come back and cluster comes back operative. As always,
  // after recovery, each member must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM = 209;

  // DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets from/to the
  // peer ports on all nodes. And it waits for "delay-ms" until recovery,
  // likely to trigger election timeouts.
  // The expected behavior is that cluster may become totally inoperable,
  // struggling with slow networks across the whole cluster. Once delay
  // operation is undone, nodes come back and cluster comes back operative.
  // As always, after recovery, each member must be able to process client
  // requests.
  DELAY_PEER_PORT_TX_RX_ALL = 210;

  // RANDOM_DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets
  // from/to the peer ports on all nodes, with randomized time durations.
  // And it waits for "delay-ms" until recovery, likely to trigger
  // election timeouts.
  // The expected behavior is that cluster may become totally inoperable,
  // struggling with slow networks across the whole cluster. Once delay
  // operation is undone, nodes come back and cluster comes back operative.
  // As always, after recovery, each member must be able to process client
  // requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_ALL = 211;

  // NO_FAIL_WITH_STRESS stops injecting failures while testing the
  // consistency and correctness under pressure loads, for the duration of
  // "delay-ms". Goal is to ensure cluster be still making progress
  // on recovery, and verify system does not deadlock following a sequence
  // of failure injections.
  // The expected behavior is that cluster remains fully operative in healthy
  // condition. As always, after recovery, each member must be able to process
  // client requests.
  NO_FAIL_WITH_STRESS = 300;

  // NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS neither injects failures nor
  // sends stressing client requests to the cluster, for the duration of
  // "delay-ms". Goal is to ensure cluster be still making progress
  // on recovery, and verify system does not deadlock following a sequence
  // of failure injections.
  // The expected behavior is that cluster remains fully operative in healthy
  // condition, and client requests during liveness period succeed without
  // errors.
  // Note: this is how Google Chubby does failure injection testing
  // https://static.googleusercontent.com/media/research.google.com/en//archive/paxos_made_live.pdf.
  NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS = 301;

  // FAILPOINTS injects failpoints to etcd server runtime, triggering panics
  // in critical code paths.
  FAILPOINTS = 400;

  // FAILPOINTS_WITH_DISK_IO_LATENCY injects high disk I/O latency failure in raftAfterSave code paths.
  FAILPOINTS_WITH_DISK_IO_LATENCY = 401;

  // EXTERNAL runs external failure injection scripts.
  EXTERNAL = 500;
}