// go.etcd.io/etcd@v3.3.27+incompatible/functional/rpcpb/rpc.proto

syntax = "proto3";
package rpcpb;

import "github.com/gogo/protobuf/gogoproto/gogo.proto";

option (gogoproto.marshaler_all) = true;
option (gogoproto.sizer_all) = true;
option (gogoproto.unmarshaler_all) = true;
option (gogoproto.goproto_getters_all) = false;

message Request {
  Operation Operation = 1;
  // Member contains the same Member object from tester configuration.
  Member Member = 2;
  // Tester contains tester configuration.
  Tester Tester = 3;
}

// SnapshotInfo contains SAVE_SNAPSHOT request results.
message SnapshotInfo {
  string MemberName = 1;
  repeated string MemberClientURLs = 2;
  string SnapshotPath = 3;
  string SnapshotFileSize = 4;
  string SnapshotTotalSize = 5;
  int64 SnapshotTotalKey = 6;
  int64 SnapshotHash = 7;
  int64 SnapshotRevision = 8;
  string Took = 9;
}

message Response {
  bool Success = 1;
  string Status = 2;

  // Member contains the same Member object from tester request.
  Member Member = 3;

  // SnapshotInfo contains SAVE_SNAPSHOT request results.
  SnapshotInfo SnapshotInfo = 4;
}

service Transport {
  rpc Transport(stream Request) returns (stream Response) {}
}

message Member {
  // EtcdExecPath is the executable etcd binary path in agent server.
  string EtcdExecPath = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec-path\""];
  // TODO: support embedded etcd

  // AgentAddr is the agent HTTP server address.
  string AgentAddr = 11 [(gogoproto.moretags) = "yaml:\"agent-addr\""];
  // FailpointHTTPAddr is the agent's failpoints HTTP server address.
  string FailpointHTTPAddr = 12 [(gogoproto.moretags) = "yaml:\"failpoint-http-addr\""];

  // BaseDir is the base directory where all logs and etcd data are stored.
  string BaseDir = 101 [(gogoproto.moretags) = "yaml:\"base-dir\""];
  // EtcdLogPath is the log file to store current etcd server logs.
  string EtcdLogPath = 102 [(gogoproto.moretags) = "yaml:\"etcd-log-path\""];

  // EtcdClientProxy is true when client traffic needs to be proxied.
  // If true, the listen client URL port must be different from the
  // advertise client URL port.
  bool EtcdClientProxy = 201 [(gogoproto.moretags) = "yaml:\"etcd-client-proxy\""];
  // EtcdPeerProxy is true when peer traffic needs to be proxied.
  // If true, the listen peer URL port must be different from the
  // advertise peer URL port.
  bool EtcdPeerProxy = 202 [(gogoproto.moretags) = "yaml:\"etcd-peer-proxy\""];

  // EtcdClientEndpoint is the etcd client endpoint.
  string EtcdClientEndpoint = 301 [(gogoproto.moretags) = "yaml:\"etcd-client-endpoint\""];
  // Etcd defines etcd binary configuration flags.
  Etcd Etcd = 302 [(gogoproto.moretags) = "yaml:\"etcd\""];
  // EtcdOnSnapshotRestore defines one-time use configuration during the
  // etcd snapshot recovery process.
  Etcd EtcdOnSnapshotRestore = 303;

  // ClientCertData contains cert file contents from this member's etcd server.
  string ClientCertData = 401 [(gogoproto.moretags) = "yaml:\"client-cert-data\""];
  string ClientCertPath = 402 [(gogoproto.moretags) = "yaml:\"client-cert-path\""];
  // ClientKeyData contains key file contents from this member's etcd server.
  string ClientKeyData = 403 [(gogoproto.moretags) = "yaml:\"client-key-data\""];
  string ClientKeyPath = 404 [(gogoproto.moretags) = "yaml:\"client-key-path\""];
  // ClientTrustedCAData contains trusted CA file contents from this member's etcd server.
  string ClientTrustedCAData = 405 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-data\""];
  string ClientTrustedCAPath = 406 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-path\""];

  // PeerCertData contains cert file contents from this member's etcd server.
  string PeerCertData = 501 [(gogoproto.moretags) = "yaml:\"peer-cert-data\""];
  string PeerCertPath = 502 [(gogoproto.moretags) = "yaml:\"peer-cert-path\""];
  // PeerKeyData contains key file contents from this member's etcd server.
  string PeerKeyData = 503 [(gogoproto.moretags) = "yaml:\"peer-key-data\""];
  string PeerKeyPath = 504 [(gogoproto.moretags) = "yaml:\"peer-key-path\""];
  // PeerTrustedCAData contains trusted CA file contents from this member's etcd server.
  string PeerTrustedCAData = 505 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-data\""];
  string PeerTrustedCAPath = 506 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-path\""];

  // SnapshotPath is the snapshot file path to store or restore from.
  string SnapshotPath = 601 [(gogoproto.moretags) = "yaml:\"snapshot-path\""];
  // SnapshotInfo contains the last SAVE_SNAPSHOT request results.
  SnapshotInfo SnapshotInfo = 602;
}

message Tester {
  string DataDir = 1 [(gogoproto.moretags) = "yaml:\"data-dir\""];
  string Network = 2 [(gogoproto.moretags) = "yaml:\"network\""];
  string Addr = 3 [(gogoproto.moretags) = "yaml:\"addr\""];

  // DelayLatencyMs is the delay latency in milliseconds
  // to inject into the simulated slow network.
  uint32 DelayLatencyMs = 11 [(gogoproto.moretags) = "yaml:\"delay-latency-ms\""];
  // DelayLatencyMsRv is the delay latency random variable in milliseconds.
  uint32 DelayLatencyMsRv = 12 [(gogoproto.moretags) = "yaml:\"delay-latency-ms-rv\""];
  // UpdatedDelayLatencyMs is the updated delay latency in milliseconds
  // to inject into the simulated slow network. It is the final latency
  // to apply, in case the latency numbers are randomly generated from
  // the given delay latency field.
  uint32 UpdatedDelayLatencyMs = 13 [(gogoproto.moretags) = "yaml:\"updated-delay-latency-ms\""];

  // RoundLimit is the limit of rounds to run the failure set (-1 to run without limits).
  int32 RoundLimit = 21 [(gogoproto.moretags) = "yaml:\"round-limit\""];
  // If ExitOnCaseFail is true, the tester exits on the first case failure.
  bool ExitOnCaseFail = 22 [(gogoproto.moretags) = "yaml:\"exit-on-failure\""];
  // EnablePprof is true to enable the profiler.
  bool EnablePprof = 23 [(gogoproto.moretags) = "yaml:\"enable-pprof\""];

  // CaseDelayMs is the delay duration after a failure is injected.
  // Useful when triggering snapshot or no-op failure cases.
  uint32 CaseDelayMs = 31 [(gogoproto.moretags) = "yaml:\"case-delay-ms\""];
  // CaseShuffle is true to randomize the failure injection order.
  bool CaseShuffle = 32 [(gogoproto.moretags) = "yaml:\"case-shuffle\""];
  // Cases is the selected test cases to schedule.
  // If empty, run all failure cases.
  repeated string Cases = 33 [(gogoproto.moretags) = "yaml:\"cases\""];
  // FailpointCommands is the list of "gofail" commands
  // (e.g. panic("etcd-tester"), 1*sleep(1000)).
  repeated string FailpointCommands = 34 [(gogoproto.moretags) = "yaml:\"failpoint-commands\""];

  // RunnerExecPath is the path of the etcd-runner binary.
  string RunnerExecPath = 41 [(gogoproto.moretags) = "yaml:\"runner-exec-path\""];
  // ExternalExecPath is the path of a script for enabling/disabling an external fault injector.
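  // This script is invoked by the EXTERNAL failure case defined below.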
  string ExternalExecPath = 42 [(gogoproto.moretags) = "yaml:\"external-exec-path\""];

  // Stressers is the list of stresser types:
  // KV, LEASE, ELECTION_RUNNER, WATCH_RUNNER, LOCK_RACER_RUNNER, LEASE_RUNNER.
  repeated string Stressers = 101 [(gogoproto.moretags) = "yaml:\"stressers\""];
  // Checkers is the list of consistency checker types:
  // KV_HASH, LEASE_EXPIRE, NO_CHECK, RUNNER.
  // Leave empty to skip consistency checks.
  repeated string Checkers = 102 [(gogoproto.moretags) = "yaml:\"checkers\""];

  // StressKeySize is the size of each small key written into etcd.
  int32 StressKeySize = 201 [(gogoproto.moretags) = "yaml:\"stress-key-size\""];
  // StressKeySizeLarge is the size of each large key written into etcd.
  int32 StressKeySizeLarge = 202 [(gogoproto.moretags) = "yaml:\"stress-key-size-large\""];
  // StressKeySuffixRange is the count of the key range written into etcd.
  // Stress keys are created with fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)).
  int32 StressKeySuffixRange = 203 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range\""];
  // StressKeySuffixRangeTxn is the count of the key range written into etcd txn (max 100).
  // Stress keys are created with fmt.Sprintf("/k%03d", i).
  int32 StressKeySuffixRangeTxn = 204 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range-txn\""];
  // StressKeyTxnOps is the number of operations per transaction (max 64).
  int32 StressKeyTxnOps = 205 [(gogoproto.moretags) = "yaml:\"stress-key-txn-ops\""];

  // StressClients is the number of concurrent stressing clients
  // with one shared TCP connection.
  int32 StressClients = 301 [(gogoproto.moretags) = "yaml:\"stress-clients\""];
  // StressQPS is the maximum number of stresser requests per second.
  int32 StressQPS = 302 [(gogoproto.moretags) = "yaml:\"stress-qps\""];
}

message Etcd {
  string Name = 1 [(gogoproto.moretags) = "yaml:\"name\""];
  string DataDir = 2 [(gogoproto.moretags) = "yaml:\"data-dir\""];
  string WALDir = 3 [(gogoproto.moretags) = "yaml:\"wal-dir\""];

  // HeartbeatIntervalMs is the time (in milliseconds) of a heartbeat interval.
  // Default value is 100, which is 100ms.
  int64 HeartbeatIntervalMs = 11 [(gogoproto.moretags) = "yaml:\"heartbeat-interval\""];
  // ElectionTimeoutMs is the time (in milliseconds) for an election to timeout.
  // Default value is 1000, which is 1s.
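  // Per etcd's tuning guidance, the election timeout should be roughly
  // 5-10x the heartbeat interval; the defaults above (100ms heartbeat,
  // 1000ms election timeout) keep that ratio.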
  int64 ElectionTimeoutMs = 12 [(gogoproto.moretags) = "yaml:\"election-timeout\""];

  repeated string ListenClientURLs = 21 [(gogoproto.moretags) = "yaml:\"listen-client-urls\""];
  repeated string AdvertiseClientURLs = 22 [(gogoproto.moretags) = "yaml:\"advertise-client-urls\""];
  bool ClientAutoTLS = 23 [(gogoproto.moretags) = "yaml:\"auto-tls\""];
  bool ClientCertAuth = 24 [(gogoproto.moretags) = "yaml:\"client-cert-auth\""];
  string ClientCertFile = 25 [(gogoproto.moretags) = "yaml:\"cert-file\""];
  string ClientKeyFile = 26 [(gogoproto.moretags) = "yaml:\"key-file\""];
  string ClientTrustedCAFile = 27 [(gogoproto.moretags) = "yaml:\"trusted-ca-file\""];

  repeated string ListenPeerURLs = 31 [(gogoproto.moretags) = "yaml:\"listen-peer-urls\""];
  repeated string AdvertisePeerURLs = 32 [(gogoproto.moretags) = "yaml:\"initial-advertise-peer-urls\""];
  bool PeerAutoTLS = 33 [(gogoproto.moretags) = "yaml:\"peer-auto-tls\""];
  bool PeerClientCertAuth = 34 [(gogoproto.moretags) = "yaml:\"peer-client-cert-auth\""];
  string PeerCertFile = 35 [(gogoproto.moretags) = "yaml:\"peer-cert-file\""];
  string PeerKeyFile = 36 [(gogoproto.moretags) = "yaml:\"peer-key-file\""];
  string PeerTrustedCAFile = 37 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-file\""];

  string InitialCluster = 41 [(gogoproto.moretags) = "yaml:\"initial-cluster\""];
  string InitialClusterState = 42 [(gogoproto.moretags) = "yaml:\"initial-cluster-state\""];
  string InitialClusterToken = 43 [(gogoproto.moretags) = "yaml:\"initial-cluster-token\""];

  int64 SnapshotCount = 51 [(gogoproto.moretags) = "yaml:\"snapshot-count\""];
  int64 QuotaBackendBytes = 52 [(gogoproto.moretags) = "yaml:\"quota-backend-bytes\""];

  bool PreVote = 63 [(gogoproto.moretags) = "yaml:\"pre-vote\""];
  bool InitialCorruptCheck = 64 [(gogoproto.moretags) = "yaml:\"initial-corrupt-check\""];
}

enum Operation {
  // NOT_STARTED is the agent status before etcd first starts.
  NOT_STARTED = 0;

  // INITIAL_START_ETCD is sent to start etcd for the very first time.
  INITIAL_START_ETCD = 10;
  // RESTART_ETCD is sent to restart a killed etcd process.
  RESTART_ETCD = 11;

  // SIGTERM_ETCD pauses the etcd process while keeping its data
  // directories and previous etcd configuration.
  SIGTERM_ETCD = 20;
  // SIGQUIT_ETCD_AND_REMOVE_DATA kills the etcd process and removes all
  // data directories to simulate destroying the whole machine.
  SIGQUIT_ETCD_AND_REMOVE_DATA = 21;

  // SAVE_SNAPSHOT is sent to trigger the local member to download its
  // snapshot onto its local disk at the path specified by the tester.
  SAVE_SNAPSHOT = 30;
  // RESTORE_RESTART_FROM_SNAPSHOT is sent to trigger the local member to
  // restore a cluster from an existing snapshot on disk, and restart
  // an etcd instance from the recovered data.
  RESTORE_RESTART_FROM_SNAPSHOT = 31;
  // RESTART_FROM_SNAPSHOT is sent to trigger the local member to restart
  // and join an existing cluster that has been recovered from a snapshot.
  // The local member joins this cluster with fresh data.
  RESTART_FROM_SNAPSHOT = 32;

  // SIGQUIT_ETCD_AND_ARCHIVE_DATA is sent when a consistency check has
  // failed, in order to archive the etcd data directories.
  SIGQUIT_ETCD_AND_ARCHIVE_DATA = 40;
  // SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT destroys the etcd
  // process, etcd data, and the agent server.
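  // Since the agent server itself is stopped, this is typically the last
  // operation delivered to a member.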
  SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT = 41;

  // BLACKHOLE_PEER_PORT_TX_RX drops all outgoing/incoming packets from/to
  // the target member's peer port.
  BLACKHOLE_PEER_PORT_TX_RX = 100;
  // UNBLACKHOLE_PEER_PORT_TX_RX removes outgoing/incoming packet dropping.
  UNBLACKHOLE_PEER_PORT_TX_RX = 101;

  // DELAY_PEER_PORT_TX_RX delays all outgoing/incoming packets from/to
  // the target member's peer port.
  DELAY_PEER_PORT_TX_RX = 200;
  // UNDELAY_PEER_PORT_TX_RX removes all outgoing/incoming delays.
  UNDELAY_PEER_PORT_TX_RX = 201;
}

// Case defines various system faults or test cases in distributed systems,
// in order to verify correct behavior of etcd servers and clients.
enum Case {
  // SIGTERM_ONE_FOLLOWER stops a randomly chosen follower (non-leader)
  // but does not delete its data directories on disk, for the next
  // restart. It waits "delay-ms" before recovering this failure.
  // The expected behavior is that the follower comes back online
  // and rejoins the cluster, and then each member continues to process
  // client requests (e.g. a 'Put' request that requires Raft consensus).
  SIGTERM_ONE_FOLLOWER = 0;

  // SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly chosen
  // follower but does not delete its data directories on disk, for the
  // next restart. It then waits until the most up-to-date node (the
  // leader) has applied the snapshot count of entries since the stop
  // operation.
  // The expected behavior is that the follower comes back online and
  // rejoins the cluster, and then the active leader sends a snapshot
  // to the follower to force it to follow the leader's log.
  // As always, after recovery, each member must be able to process
  // client requests.
  SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 1;

  // SIGTERM_LEADER stops the active leader node but does not delete its
  // data directories on disk, for the next restart. Then it waits
  // "delay-ms" before recovering this failure, in order to trigger
  // election timeouts.
  // The expected behavior is that a new leader gets elected, and the
  // old leader comes back online and rejoins the cluster as a follower.
  // As always, after recovery, each member must be able to process
  // client requests.
  SIGTERM_LEADER = 2;

  // SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader node
  // but does not delete its data directories on disk, for the next
  // restart. It then waits until the most up-to-date node (the "new"
  // leader) has applied the snapshot count of entries since the stop
  // operation.
  // The expected behavior is that the cluster elects a new leader, and
  // the old leader comes back online and rejoins the cluster as a
  // follower. It then receives the snapshot from the new leader to
  // overwrite its store. As always, after recovery, each member must be
  // able to process client requests.
  SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT = 3;

  // SIGTERM_QUORUM stops a majority of nodes to make the whole cluster
  // inoperable, but does not delete the data directories on the stopped
  // nodes, for the next restart. It waits "delay-ms" before recovering
  // the failure.
  // The expected behavior is that the nodes come back online, thus the
  // cluster comes back operative as well. As always, after recovery,
  // each member must be able to process client requests.
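  // For example, in a 3-node cluster, two nodes are stopped.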
  SIGTERM_QUORUM = 4;

  // SIGTERM_ALL stops the whole cluster but does not delete the data
  // directories on disk, for the next restart. It waits "delay-ms"
  // before recovering this failure.
  // The expected behavior is that the nodes come back online, thus the
  // cluster comes back operative as well. As always, after recovery,
  // each member must be able to process client requests.
  SIGTERM_ALL = 5;

  // SIGQUIT_AND_REMOVE_ONE_FOLLOWER stops a randomly chosen follower
  // (non-leader), deletes its data directories on disk, and removes
  // this member from the cluster (membership reconfiguration). On
  // recovery, the tester adds a new member, and this member joins the
  // existing cluster with fresh data. It waits "delay-ms" before
  // recovering this failure. This simulates destroying one follower
  // machine, where the operator needs to add a new member from a fresh
  // machine.
  // The expected behavior is that a new member joins the existing
  // cluster, and then each member continues to process client requests.
  SIGQUIT_AND_REMOVE_ONE_FOLLOWER = 10;

  // SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a
  // randomly chosen follower, deletes its data directories on disk, and
  // removes this member from the cluster (membership reconfiguration).
  // On recovery, the tester adds a new member, and this member joins the
  // existing cluster with fresh data. On member removal, the cluster
  // waits until the most up-to-date node (the leader) has applied the
  // snapshot count of entries since the stop operation.
  // This simulates destroying a follower machine, where the operator
  // needs to add a new member from a fresh machine.
  // The expected behavior is that a new member joins the existing
  // cluster, and receives a snapshot from the active leader. As always,
  // after recovery, each member must be able to process client requests.
  SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 11;

  // SIGQUIT_AND_REMOVE_LEADER stops the active leader node, deletes its
  // data directories on disk, and removes this member from the cluster.
  // On recovery, the tester adds a new member, and this member joins the
  // existing cluster with fresh data. It waits "delay-ms" before
  // recovering this failure. This simulates destroying a leader machine,
  // where the operator needs to add a new member from a fresh machine.
  // The expected behavior is that a new member joins the existing
  // cluster, and then each member continues to process client requests.
  SIGQUIT_AND_REMOVE_LEADER = 12;

  // SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active
  // leader, deletes its data directories on disk, and removes this
  // member from the cluster (membership reconfiguration). On recovery,
  // the tester adds a new member, and this member joins the existing
  // cluster with fresh data. On member removal, the cluster waits until
  // the most up-to-date node (the new leader) has applied the snapshot
  // count of entries since the stop operation. This simulates destroying
  // a leader machine, where the operator needs to add a new member from
  // a fresh machine.
  // The expected behavior is that on member removal, the cluster elects
  // a new leader, and a new member joins the existing cluster and
  // receives a snapshot from the newly elected leader. As always, after
  // recovery, each member must be able to process client requests.
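  // Unlike SIGQUIT_AND_REMOVE_LEADER, recovery here is gated on the
  // snapshot count of applied entries rather than a fixed "delay-ms" wait.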
  SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT = 13;

  // SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH
  // first stops a majority of nodes and deletes the data directories on
  // those quorum nodes, to make the whole cluster inoperable. Now that
  // the quorum nodes and their data are totally destroyed, the cluster
  // cannot even remove the unavailable nodes (e.g. 2 out of 3 are lost,
  // so no leader can be elected).
  // Let's assume a 3-node cluster of nodes A, B, and C. One day, nodes A
  // and B are destroyed and all their data are gone. The only viable
  // solution is to recover from C's latest snapshot.
  //
  // To simulate:
  //  1. Assume node C is the current leader with the most up-to-date data.
  //  2. Download a snapshot from node C, before destroying nodes A and B.
  //  3. Destroy nodes A and B, making the whole cluster inoperable.
  //  4. Now node C cannot operate either.
  //  5. SIGTERM node C and remove its data directories.
  //  6. Restore a new seed member from node C's latest snapshot file.
  //  7. Add another member to establish a 2-node cluster.
  //  8. Add another member to establish a 3-node cluster.
  //  9. Add more members, if any.
  //
  // The expected behavior is that etcd successfully recovers from such a
  // disastrous situation, where only 1 node survives out of a 3-node
  // cluster: new members join the existing cluster, and the previous
  // data from the snapshot are still preserved after the recovery
  // process. As always, after recovery, each member must be able to
  // process client requests.
  SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH = 14;

  // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER drops all outgoing/incoming
  // packets from/to the peer port on a randomly chosen follower
  // (non-leader), and waits for "delay-ms" until recovery.
  // The expected behavior is that once the dropping operation is undone,
  // each member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER = 100;

  // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT drops
  // all outgoing/incoming packets from/to the peer port on a randomly
  // chosen follower (non-leader), and waits until the most up-to-date
  // node (the leader) has applied the snapshot count of entries since
  // the blackhole operation.
  // The expected behavior is that once the packet drop operation is
  // undone, the slow follower tries to catch up, possibly receiving the
  // snapshot from the active leader. As always, after recovery, each
  // member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 101;

  // BLACKHOLE_PEER_PORT_TX_RX_LEADER drops all outgoing/incoming packets
  // from/to the peer port on the active leader (isolated), and waits for
  // "delay-ms" until recovery, in order to trigger an election timeout.
  // The expected behavior is that after the election timeout, a new
  // leader gets elected, and once the dropping operation is undone, the
  // old leader comes back and rejoins the cluster as a follower. As
  // always, after recovery, each member must be able to process client
  // requests.
  BLACKHOLE_PEER_PORT_TX_RX_LEADER = 102;

  // BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT drops all
  // outgoing/incoming packets from/to the peer port on the active
  // leader, and waits until the most up-to-date node (the leader) has
  // applied the snapshot count of entries since the blackhole operation.
  // The expected behavior is that the cluster elects a new leader, and
  // once the dropping operation is undone, the old leader comes back and
  // rejoins the cluster as a follower. The slow follower tries to catch
  // up, likely receiving the snapshot from the new active leader. As
  // always, after recovery, each member must be able to process client
  // requests.
  BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 103;

  // BLACKHOLE_PEER_PORT_TX_RX_QUORUM drops all outgoing/incoming packets
  // from/to the peer ports on a majority of nodes of the cluster, thus
  // losing its leader and making the cluster inoperable. It waits for
  // "delay-ms" until recovery.
  // The expected behavior is that once the packet drop operation is
  // undone, the nodes come back online, thus the cluster comes back
  // operative. As always, after recovery, each member must be able to
  // process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_QUORUM = 104;

  // BLACKHOLE_PEER_PORT_TX_RX_ALL drops all outgoing/incoming packets
  // from/to the peer ports on all nodes, thus making the cluster totally
  // inoperable. It waits for "delay-ms" until recovery.
  // The expected behavior is that once the packet drop operation is
  // undone, the nodes come back online, thus the cluster comes back
  // operative. As always, after recovery, each member must be able to
  // process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_ALL = 105;

  // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming packets
  // from/to the peer port on a randomly chosen follower (non-leader).
  // It waits for "delay-ms" until recovery.
  // The expected behavior is that once the packet delay operation is
  // undone, the follower comes back and tries to catch up with the
  // latest changes from the cluster. And as always, after recovery, each
  // member must be able to process client requests.
  DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 200;

  // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming
  // packets from/to the peer port on a randomly chosen follower
  // (non-leader) with a randomized time duration (thus isolated). It
  // waits for "delay-ms" until recovery.
  // The expected behavior is that once the packet delay operation is
  // undone, each member must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 201;

  // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on a randomly chosen
  // follower (non-leader), and waits until the most up-to-date node (the
  // leader) has applied the snapshot count of entries since the delay
  // operation.
  // The expected behavior is that the delayed follower gets isolated and
  // falls behind the current active leader, and once the delay operation
  // is undone, the slow follower comes back and catches up, possibly
  // receiving a snapshot from the active leader. As always, after
  // recovery, each member must be able to process client requests.
  DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 202;

  // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
  // delays outgoing/incoming packets from/to the peer port on a randomly
  // chosen follower (non-leader) with a randomized time duration, and
  // waits until the most up-to-date node (the leader) has applied the
  // snapshot count of entries since the delay operation.
  // The expected behavior is that the delayed follower gets isolated and
  // falls behind the current active leader, and once the delay operation
  // is undone, the slow follower comes back and catches up, possibly
  // receiving a snapshot from the active leader. As always, after
  // recovery, each member must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 203;

  // DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets
  // from/to the peer port on the active leader. It waits for "delay-ms"
  // until recovery.
  // The expected behavior is that the cluster may elect a new leader,
  // and once the packet delay operation is undone, the (old) leader
  // comes back and tries to catch up with the latest changes from the
  // cluster. As always, after recovery, each member must be able to
  // process client requests.
  DELAY_PEER_PORT_TX_RX_LEADER = 204;

  // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets
  // from/to the peer port on the active leader with a randomized time
  // duration. It waits for "delay-ms" until recovery.
  // The expected behavior is that the cluster may elect a new leader,
  // and once the packet delay operation is undone, the (old) leader
  // comes back and tries to catch up with the latest changes from the
  // cluster. As always, after recovery, each member must be able to
  // process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_LEADER = 205;

  // DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on the active
  // leader, and waits until the most up-to-date node (the current or new
  // leader) has applied the snapshot count of entries since the delay
  // operation.
  // The expected behavior is that the cluster may elect a new leader,
  // the old leader gets isolated and falls behind the current active
  // leader, and once the delay operation is undone, the slow follower
  // comes back and catches up, likely receiving a snapshot from the
  // active leader. As always, after recovery, each member must be able
  // to process client requests.
  DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 206;

  // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on the active
  // leader, with a randomized time duration, and waits until the most
  // up-to-date node (the current or new leader) has applied the snapshot
  // count of entries since the delay operation.
  // The expected behavior is that the cluster may elect a new leader,
  // the old leader gets isolated and falls behind the current active
  // leader, and once the delay operation is undone, the slow follower
  // comes back and catches up, likely receiving a snapshot from the
  // active leader. As always, after recovery, each member must be able
  // to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 207;

  // DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets
  // from/to the peer ports on a majority of nodes of the cluster. It
  // waits for "delay-ms" until recovery, likely triggering election
  // timeouts.
  // The expected behavior is that the cluster may elect a new leader,
  // while a quorum of nodes struggles with the slow network, and once
  // the delay operation is undone, the nodes come back and the cluster
  // comes back operative. As always, after recovery, each member must be
  // able to process client requests.
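  // Because Raft commits require a quorum, delaying a majority can stall
  // writes cluster-wide, not just on the delayed members.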
  DELAY_PEER_PORT_TX_RX_QUORUM = 208;

  // RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets
  // from/to the peer ports on a majority of nodes of the cluster, with
  // randomized time durations. It waits for "delay-ms" until recovery,
  // likely triggering election timeouts.
  // The expected behavior is that the cluster may elect a new leader,
  // while a quorum of nodes struggles with the slow network, and once
  // the delay operation is undone, the nodes come back and the cluster
  // comes back operative. As always, after recovery, each member must be
  // able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM = 209;

  // DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets from/to
  // the peer ports on all nodes. It waits for "delay-ms" until recovery,
  // likely triggering election timeouts.
  // The expected behavior is that the cluster may become totally
  // inoperable, struggling with slow networks across the whole cluster.
  // Once the delay operation is undone, the nodes come back and the
  // cluster comes back operative. As always, after recovery, each member
  // must be able to process client requests.
  DELAY_PEER_PORT_TX_RX_ALL = 210;

  // RANDOM_DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets
  // from/to the peer ports on all nodes, with randomized time durations.
  // It waits for "delay-ms" until recovery, likely triggering election
  // timeouts.
  // The expected behavior is that the cluster may become totally
  // inoperable, struggling with slow networks across the whole cluster.
  // Once the delay operation is undone, the nodes come back and the
  // cluster comes back operative. As always, after recovery, each member
  // must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_ALL = 211;

  // NO_FAIL_WITH_STRESS stops injecting failures while testing
  // consistency and correctness under pressure loads, for the duration
  // of "delay-ms". The goal is to ensure the cluster is still making
  // progress on recovery, and to verify that the system does not
  // deadlock following a sequence of failure injections.
  // The expected behavior is that the cluster remains fully operative in
  // a healthy condition. As always, after recovery, each member must be
  // able to process client requests.
  NO_FAIL_WITH_STRESS = 300;

  // NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS neither injects failures nor
  // sends stressing client requests to the cluster, for the duration of
  // "delay-ms". The goal is to ensure the cluster is still making
  // progress on recovery, and to verify that the system does not
  // deadlock following a sequence of failure injections.
  // The expected behavior is that the cluster remains fully operative in
  // a healthy condition, and client requests during the liveness period
  // succeed without errors.
  // Note: this is how Google Chubby does failure injection testing; see
  // https://static.googleusercontent.com/media/research.google.com/en//archive/paxos_made_live.pdf.
  NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS = 301;

  // FAILPOINTS injects failpoints into the etcd server runtime,
  // triggering panics in critical code paths.
  FAILPOINTS = 400;

  // EXTERNAL runs external failure injection scripts.
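  // The script path is taken from Tester.ExternalExecPath above.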
  EXTERNAL = 500;
}

enum Stresser {
  KV = 0;
  LEASE = 1;
  ELECTION_RUNNER = 2;
  WATCH_RUNNER = 3;
  LOCK_RACER_RUNNER = 4;
  LEASE_RUNNER = 5;
}

enum Checker {
  KV_HASH = 0;
  LEASE_EXPIRE = 1;
  RUNNER = 2;
  NO_CHECK = 3;
}
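
// Usage sketch (not part of the schema): a minimal, hedged Go example of
// how a tester process could drive an agent over the bidirectional
// Transport stream, assuming the standard gogo/protobuf + grpc-go
// generated bindings for this file. "agentAddr" and "restartMember" are
// illustrative names, not part of the etcd functional framework; a real
// request would also populate the Member and Tester fields.
//
//    package main
//
//    import (
//        "context"
//
//        "go.etcd.io/etcd/functional/rpcpb"
//        "google.golang.org/grpc"
//    )
//
//    // restartMember sends a single RESTART_ETCD request and waits for
//    // the agent's response over the stream.
//    func restartMember(agentAddr string) (*rpcpb.Response, error) {
//        conn, err := grpc.Dial(agentAddr, grpc.WithInsecure())
//        if err != nil {
//            return nil, err
//        }
//        defer conn.Close()
//        stream, err := rpcpb.NewTransportClient(conn).Transport(context.Background())
//        if err != nil {
//            return nil, err
//        }
//        if err := stream.Send(&rpcpb.Request{Operation: rpcpb.Operation_RESTART_ETCD}); err != nil {
//            return nil, err
//        }
//        return stream.Recv() // Response.Success reports the agent-side result
//    }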