github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/functional/rpcpb/rpc.proto

github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/functional/rpcpb/rpc.proto (about)

     1  syntax = "proto3";
     2  package rpcpb;
     3  
     4  import "gogoproto/gogo.proto"; // supplies the (gogoproto.*) options used throughout this file
     5  
     6  option (gogoproto.marshaler_all) = true; // file-wide gogoproto switch: marshaler generation on
     7  option (gogoproto.sizer_all) = true; // file-wide gogoproto switch: sizer generation on
     8  option (gogoproto.unmarshaler_all) = true; // file-wide gogoproto switch: unmarshaler generation on
     9  option (gogoproto.goproto_getters_all) = false; // file-wide gogoproto switch: getter generation off
    10  
    11  message Request { // Request is the message type the Transport RPC receives as a stream.
    12    Operation Operation = 1; // Operation is the Operation enum value carried by this request.
    13    // Member contains the same Member object from tester configuration.
    14    Member Member = 2;
    15    // Tester contains tester configuration.
    16    Tester Tester = 3;
    17  }
    18  
    19  // SnapshotInfo contains SAVE_SNAPSHOT request results. It is embedded in both Response and Member.
    20  message SnapshotInfo {
    21    string MemberName = 1;
    22    repeated string MemberClientURLs = 2;
    23    string SnapshotPath = 3;
    24    string SnapshotFileSize = 4; // NOTE(review): a string, not an integer; presumably human-readable - confirm
    25    string SnapshotTotalSize = 5; // NOTE(review): a string, like SnapshotFileSize; exact format not visible here
    26    int64 SnapshotTotalKey = 6;
    27    int64 SnapshotHash = 7;
    28    int64 SnapshotRevision = 8;
    29    string Took = 9; // NOTE(review): presumably the elapsed time of the save, rendered as text - confirm
    30    string Version = 10;
    31  }
    32  
    33  message Response { // Response is the message type the Transport RPC returns as a stream.
    34    bool Success = 1; // NOTE(review): presumably true when the requested Operation succeeded - confirm
    35    string Status = 2; // NOTE(review): free-form text; its contents are not defined in this file
    36  
    37    // Member contains the same Member object from tester request.
    38    Member Member = 3;
    39  
    40    // SnapshotInfo contains SAVE_SNAPSHOT request results.
    41    SnapshotInfo SnapshotInfo = 4;
    42  }
    43  
    44  service Transport { // Transport exchanges Request and Response messages over a single RPC.
    45    rpc Transport(stream Request) returns (stream Response) {} // bidirectional streaming: Requests in, Responses out
    46  }
    47  
    48  message Member { // Member describes one etcd member and its agent; carried in Request and Response.
    49    // EtcdExec is the executable etcd binary path in agent server.
    50    string EtcdExec = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec\""];
    51  
    52    // AgentAddr is the agent HTTP server address.
    53    string AgentAddr = 11 [(gogoproto.moretags) = "yaml:\"agent-addr\""];
    54    // FailpointHTTPAddr is the agent's failpoints HTTP server address.
    55    string FailpointHTTPAddr = 12 [(gogoproto.moretags) = "yaml:\"failpoint-http-addr\""];
    56  
    57    // BaseDir is the base directory where all logs and etcd data are stored.
    58    string BaseDir = 101 [(gogoproto.moretags) = "yaml:\"base-dir\""];
    59  
    60    // EtcdClientProxy is true when client traffic needs to be proxied.
    61    // If true, listen client URL port must be different than advertise client URL port.
    62    bool EtcdClientProxy = 201 [(gogoproto.moretags) = "yaml:\"etcd-client-proxy\""];
    63    // EtcdPeerProxy is true when peer traffic needs to be proxied.
    64    // If true, listen peer URL port must be different than advertise peer URL port.
    65    bool EtcdPeerProxy = 202 [(gogoproto.moretags) = "yaml:\"etcd-peer-proxy\""];
    66  
    67    // EtcdClientEndpoint is the etcd client endpoint.
    68    string EtcdClientEndpoint = 301 [(gogoproto.moretags) = "yaml:\"etcd-client-endpoint\""];
    69    // Etcd defines etcd binary configuration flags.
    70    Etcd Etcd = 302 [(gogoproto.moretags) = "yaml:\"etcd\""];
    71    // EtcdOnSnapshotRestore defines one-time use configuration during etcd
    72    // snapshot recovery process.
    73    Etcd EtcdOnSnapshotRestore = 303; // unlike its neighbours, this field declares no yaml tag
    74  
    75    // ClientCertData contains cert file contents from this member's etcd server.
    76    string ClientCertData = 401 [(gogoproto.moretags) = "yaml:\"client-cert-data\""];
    77    string ClientCertPath = 402 [(gogoproto.moretags) = "yaml:\"client-cert-path\""]; // NOTE(review): each *Path field presumably locates the file for the matching *Data field - confirm
    78    // ClientKeyData contains key file contents from this member's etcd server.
    79    string ClientKeyData = 403 [(gogoproto.moretags) = "yaml:\"client-key-data\""];
    80    string ClientKeyPath = 404 [(gogoproto.moretags) = "yaml:\"client-key-path\""];
    81    // ClientTrustedCAData contains trusted CA file contents from this member's etcd server.
    82    string ClientTrustedCAData = 405 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-data\""];
    83    string ClientTrustedCAPath = 406 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-path\""];
    84  
    85    // PeerCertData contains peer cert file contents from this member's etcd server.
    86    string PeerCertData = 501 [(gogoproto.moretags) = "yaml:\"peer-cert-data\""];
    87    string PeerCertPath = 502 [(gogoproto.moretags) = "yaml:\"peer-cert-path\""];
    88    // PeerKeyData contains peer key file contents from this member's etcd server.
    89    string PeerKeyData = 503 [(gogoproto.moretags) = "yaml:\"peer-key-data\""];
    90    string PeerKeyPath = 504 [(gogoproto.moretags) = "yaml:\"peer-key-path\""];
    91    // PeerTrustedCAData contains peer trusted CA file contents from this member's etcd server.
    92    string PeerTrustedCAData = 505 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-data\""];
    93    string PeerTrustedCAPath = 506 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-path\""];
    94  
    95    // SnapshotPath is the snapshot file path to store or restore from.
    96    string SnapshotPath = 601 [(gogoproto.moretags) = "yaml:\"snapshot-path\""];
    97    // SnapshotInfo contains last SAVE_SNAPSHOT request results.
    98    SnapshotInfo SnapshotInfo = 602; // this field declares no yaml tag
    99  
   100    // Failpoints is the GOFAIL_FAILPOINTS environment variable value to use when starting etcd.
   101    string Failpoints = 701  [(gogoproto.moretags) = "yaml:\"failpoints\""];
   102  }
   103  
   104  message Tester { // Tester holds the tester configuration carried in Request.Tester.
   105    string DataDir = 1 [(gogoproto.moretags) = "yaml:\"data-dir\""];
   106    string Network = 2 [(gogoproto.moretags) = "yaml:\"network\""];
   107    string Addr = 3 [(gogoproto.moretags) = "yaml:\"addr\""]; // NOTE(review): Network and Addr presumably describe the tester's own endpoint - confirm
   108  
   109    // DelayLatencyMs is the delay latency in milliseconds,
   110    // to inject to simulate a slow network.
   111    uint32 DelayLatencyMs = 11 [(gogoproto.moretags) = "yaml:\"delay-latency-ms\""];
   112    // DelayLatencyMsRv is the delay latency random variable in milliseconds.
   113    uint32 DelayLatencyMsRv = 12 [(gogoproto.moretags) = "yaml:\"delay-latency-ms-rv\""];
   114    // UpdatedDelayLatencyMs is the updated delay latency in milliseconds,
   115    // to inject to simulate a slow network. It's the final latency to apply,
   116    // in case the latency numbers are randomly generated from given delay latency field.
   117    uint32 UpdatedDelayLatencyMs = 13 [(gogoproto.moretags) = "yaml:\"updated-delay-latency-ms\""];
   118  
   119    // RoundLimit is the limit of rounds to run failure set (-1 to run without limits).
   120    int32 RoundLimit = 21 [(gogoproto.moretags) = "yaml:\"round-limit\""];
   121    // ExitOnCaseFail, if true, makes the tester exit on the first failure.
   122    bool ExitOnCaseFail = 22 [(gogoproto.moretags) = "yaml:\"exit-on-failure\""]; // note: the YAML key is "exit-on-failure", not derived from the field name
   123    // EnablePprof is true to enable profiler.
   124    bool EnablePprof = 23 [(gogoproto.moretags) = "yaml:\"enable-pprof\""];
   125  
   126    // CaseDelayMs is the delay duration, in milliseconds, after failure is injected.
   127    // Useful when triggering snapshot or no-op failure cases.
   128    uint32 CaseDelayMs = 31 [(gogoproto.moretags) = "yaml:\"case-delay-ms\""];
   129    // CaseShuffle is true to randomize failure injecting order.
   130    bool CaseShuffle = 32 [(gogoproto.moretags) = "yaml:\"case-shuffle\""];
   131    // Cases is the selected test cases to schedule.
   132    // If empty, run all failure cases.
   133    repeated string Cases = 33 [(gogoproto.moretags) = "yaml:\"cases\""];
   134    // FailpointCommands is the list of "gofail" commands
   135    // (e.g. panic("etcd-tester"),1*sleep(1000)).
   136    repeated string FailpointCommands = 34 [(gogoproto.moretags) = "yaml:\"failpoint-commands\""];
   137  
   138    // RunnerExecPath is a path of etcd-runner binary.
   139    string RunnerExecPath = 41 [(gogoproto.moretags) = "yaml:\"runner-exec-path\""];
   140    // ExternalExecPath is a path of script for enabling/disabling an external fault injector.
   141    string ExternalExecPath = 42 [(gogoproto.moretags) = "yaml:\"external-exec-path\""];
   142  
   143    // Stressers is the list of stressers to run; each entry is a Stresser message (Type and Weight).
   144    // NOTE(review): Type presumably names a StresserType value (KV_WRITE_SMALL ... LEASE_RUNNER) - confirm.
   145    repeated Stresser Stressers = 101 [(gogoproto.moretags) = "yaml:\"stressers\""];
   146    // Checkers is the list of consistency checker types, given as strings; the Checker enum declares
   147    // KV_HASH, LEASE_EXPIRE, RUNNER, NO_CHECK, SHORT_TTL_LEASE_EXPIRE.
   148    // Leave empty to skip consistency checks.
   149    repeated string Checkers = 102 [(gogoproto.moretags) = "yaml:\"checkers\""];
   150  
   151    // StressKeySize is the size of each small key written into etcd.
   152    int32 StressKeySize = 201 [(gogoproto.moretags) = "yaml:\"stress-key-size\""];
   153    // StressKeySizeLarge is the size of each large key written into etcd.
   154    int32 StressKeySizeLarge = 202 [(gogoproto.moretags) = "yaml:\"stress-key-size-large\""];
   155    // StressKeySuffixRange is the count of key range written into etcd.
   156    // Stress keys are created with "fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange))".
   157    int32 StressKeySuffixRange = 203 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range\""];
   158    // StressKeySuffixRangeTxn is the count of key range written into etcd txn (max 100).
   159    // Stress keys are created with "fmt.Sprintf("/k%03d", i)".
   160    int32 StressKeySuffixRangeTxn = 204 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range-txn\""];
   161    // StressKeyTxnOps is the number of operations per transaction (max 64).
   162    int32 StressKeyTxnOps = 205 [(gogoproto.moretags) = "yaml:\"stress-key-txn-ops\""];
   163  
   164    // StressClients is the number of concurrent stressing clients
   165    // with "one" shared TCP connection.
   166    int32 StressClients = 301 [(gogoproto.moretags) = "yaml:\"stress-clients\""];
   167    // StressQPS is the maximum number of stresser requests per second.
   168    int32 StressQPS = 302 [(gogoproto.moretags) = "yaml:\"stress-qps\""];
   169  }
   170  
   171  enum StresserType { // StresserType enumerates stresser kinds; note that Stresser.Type is declared as a string, not as this enum.
   172    KV_WRITE_SMALL = 0;
   173    KV_WRITE_LARGE = 1;
   174    KV_READ_ONE_KEY = 2;
   175    KV_READ_RANGE = 3;
   176    KV_DELETE_ONE_KEY = 4;
   177    KV_DELETE_RANGE = 5;
   178    KV_TXN_WRITE_DELETE = 6;
   179  
   180    LEASE = 10;
   181  
   182    ELECTION_RUNNER = 20; // the *_RUNNER values are numbered 20, 31, 41, 51; enum numbers are wire identifiers, so do not renumber
   183    WATCH_RUNNER = 31;
   184    LOCK_RACER_RUNNER = 41;
   185    LEASE_RUNNER = 51;
   186  }
   187  
   188  message Stresser { // Stresser is one entry of Tester.Stressers.
   189    string Type = 1 [(gogoproto.moretags) = "yaml:\"type\""]; // NOTE(review): presumably the name of a StresserType value - confirm
   190    double Weight = 2 [(gogoproto.moretags) = "yaml:\"weight\""]; // NOTE(review): presumably a relative weight among the configured stressers - confirm
   191  }
   192  
   193  enum Checker { // Checker enumerates consistency checker types; Tester.Checkers carries its entries as strings.
   194    KV_HASH = 0;
   195    LEASE_EXPIRE = 1;
   196    RUNNER = 2;
   197    NO_CHECK = 3;
   198    SHORT_TTL_LEASE_EXPIRE = 4;
   199  }
   200  
   201  message Etcd { // Etcd holds etcd configuration; used by Member.Etcd and Member.EtcdOnSnapshotRestore. Several YAML keys differ from the field names.
   202    string Name = 1 [(gogoproto.moretags) = "yaml:\"name\""];
   203    string DataDir = 2 [(gogoproto.moretags) = "yaml:\"data-dir\""];
   204    string WALDir = 3 [(gogoproto.moretags) = "yaml:\"wal-dir\""];
   205  
   206    // HeartbeatIntervalMs is the time (in milliseconds) of a heartbeat interval.
   207    // Default value is 100, which is 100ms.
   208    int64 HeartbeatIntervalMs = 11 [(gogoproto.moretags) = "yaml:\"heartbeat-interval\""]; // YAML key carries no "-ms" suffix
   209    // ElectionTimeoutMs is the time (in milliseconds) for an election to timeout.
   210    // Default value is 1000, which is 1s.
   211    int64 ElectionTimeoutMs = 12 [(gogoproto.moretags) = "yaml:\"election-timeout\""]; // YAML key carries no "-ms" suffix
   212  
   213    repeated string ListenClientURLs = 21 [(gogoproto.moretags) = "yaml:\"listen-client-urls\""];
   214    repeated string AdvertiseClientURLs = 22 [(gogoproto.moretags) = "yaml:\"advertise-client-urls\""];
   215    bool ClientAutoTLS = 23 [(gogoproto.moretags) = "yaml:\"auto-tls\""]; // client-side TLS keys below have no "client-" prefix in YAML
   216    bool ClientCertAuth = 24 [(gogoproto.moretags) = "yaml:\"client-cert-auth\""];
   217    string ClientCertFile = 25 [(gogoproto.moretags) = "yaml:\"cert-file\""];
   218    string ClientKeyFile = 26 [(gogoproto.moretags) = "yaml:\"key-file\""];
   219    string ClientTrustedCAFile = 27 [(gogoproto.moretags) = "yaml:\"trusted-ca-file\""];
   220  
   221    repeated string ListenPeerURLs = 31 [(gogoproto.moretags) = "yaml:\"listen-peer-urls\""];
   222    repeated string AdvertisePeerURLs = 32 [(gogoproto.moretags) = "yaml:\"initial-advertise-peer-urls\""]; // YAML key is "initial-advertise-peer-urls"
   223    bool PeerAutoTLS = 33 [(gogoproto.moretags) = "yaml:\"peer-auto-tls\""];
   224    bool PeerClientCertAuth = 34 [(gogoproto.moretags) = "yaml:\"peer-client-cert-auth\""];
   225    string PeerCertFile = 35 [(gogoproto.moretags) = "yaml:\"peer-cert-file\""];
   226    string PeerKeyFile = 36 [(gogoproto.moretags) = "yaml:\"peer-key-file\""];
   227    string PeerTrustedCAFile = 37 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-file\""];
   228  
   229    string InitialCluster = 41 [(gogoproto.moretags) = "yaml:\"initial-cluster\""];
   230    string InitialClusterState = 42 [(gogoproto.moretags) = "yaml:\"initial-cluster-state\""];
   231    string InitialClusterToken = 43 [(gogoproto.moretags) = "yaml:\"initial-cluster-token\""];
   232  
   233    int64 SnapshotCount = 51 [(gogoproto.moretags) = "yaml:\"snapshot-count\""];
   234    int64 QuotaBackendBytes = 52 [(gogoproto.moretags) = "yaml:\"quota-backend-bytes\""];
   235  
   236    bool PreVote = 63 [(gogoproto.moretags) = "yaml:\"pre-vote\""]; // NOTE(review): numbers 61 and 62 are unused in this message; if they belonged to removed fields they should be reserved - confirm
   237    bool InitialCorruptCheck = 64 [(gogoproto.moretags) = "yaml:\"initial-corrupt-check\""];
   238  
   239    string Logger = 71 [(gogoproto.moretags) = "yaml:\"logger\""];
   240    // LogOutputs lists the log file(s) that store current etcd server logs.
   241    repeated string LogOutputs = 72 [(gogoproto.moretags) = "yaml:\"log-outputs\""];
   242    string LogLevel = 73 [(gogoproto.moretags) = "yaml:\"log-level\""];
   243  
   244    bool SocketReuseAddress = 81 [(gogoproto.moretags) = "yaml:\"socket-reuse-address\""];
   245    bool SocketReusePort = 82 [(gogoproto.moretags) = "yaml:\"socket-reuse-port\""];
   246  }
   247  
   248  enum Operation { // Operation enumerates the values carried in Request.Operation.
   249    // NOT_STARTED is the agent status before etcd first start.
   250    NOT_STARTED = 0;
   251  
   252    // INITIAL_START_ETCD is only called to start etcd, the very first time.
   253    INITIAL_START_ETCD = 10;
   254    // RESTART_ETCD is sent to restart killed etcd.
   255    RESTART_ETCD = 11;
   256  
   257    // SIGTERM_ETCD stops etcd process while keeping data directories
   258    // and previous etcd configurations.
   259    SIGTERM_ETCD = 20;
   260    // SIGQUIT_ETCD_AND_REMOVE_DATA kills etcd process and removes all data
   261    // directories to simulate destroying the whole machine.
   262    SIGQUIT_ETCD_AND_REMOVE_DATA = 21;
   263  
   264    // SAVE_SNAPSHOT is sent to trigger local member to download its snapshot
   265    // onto its local disk with the specified path from tester.
   266    SAVE_SNAPSHOT = 30;
   267    // RESTORE_RESTART_FROM_SNAPSHOT is sent to trigger local member to
   268    // restore a cluster from existing snapshot from disk, and restart
   269    // an etcd instance from recovered data.
   270    RESTORE_RESTART_FROM_SNAPSHOT = 31;
   271    // RESTART_FROM_SNAPSHOT is sent to trigger local member to restart
   272    // and join an existing cluster that has been recovered from a snapshot.
   273    // Local member joins this cluster with fresh data.
   274    RESTART_FROM_SNAPSHOT = 32;
   275  
   276    // SIGQUIT_ETCD_AND_ARCHIVE_DATA is sent when consistency check failed,
   277    // thus need to archive etcd data directories.
   278    SIGQUIT_ETCD_AND_ARCHIVE_DATA = 40;
   279    // SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT destroys etcd process,
   280    // etcd data, and agent server.
   281    SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT = 41;
   282  
   283    // BLACKHOLE_PEER_PORT_TX_RX drops all outgoing/incoming packets from/to
   284    // the target member's peer port.
   285    BLACKHOLE_PEER_PORT_TX_RX = 100;
   286    // UNBLACKHOLE_PEER_PORT_TX_RX removes outgoing/incoming packet dropping.
   287    UNBLACKHOLE_PEER_PORT_TX_RX = 101;
   288  
   289    // DELAY_PEER_PORT_TX_RX delays all outgoing/incoming packets from/to
   290    // the target member's peer port.
   291    DELAY_PEER_PORT_TX_RX = 200;
   292    // UNDELAY_PEER_PORT_TX_RX removes all outgoing/incoming delays.
   293    UNDELAY_PEER_PORT_TX_RX = 201;
   294  }
   295  
   296  // Case defines various system faults or test cases in distributed systems,
   297  // in order to verify correct behavior of etcd servers and clients.
   298  enum Case {
   299    // SIGTERM_ONE_FOLLOWER stops a randomly chosen follower (non-leader)
   300    // but does not delete its data directories on disk for next restart.
   301    // It waits "delay-ms" before recovering this failure.
   302    // The expected behavior is that the follower comes back online
   303    // and rejoins the cluster, and then each member continues to process
   304    // client requests ('Put' request that requires Raft consensus).
   305    SIGTERM_ONE_FOLLOWER = 0;
   306  
   307    // SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly chosen
   308    // follower but does not delete its data directories on disk for next
   309    // restart. And waits until most up-to-date node (leader) applies the
   310    // snapshot count of entries since the stop operation.
   311    // The expected behavior is that the follower comes back online and
   312    // rejoins the cluster, and then active leader sends snapshot
   313    // to the follower to force it to follow the leader's log.
   314    // As always, after recovery, each member must be able to process
   315    // client requests.
   316    SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 1;
   317  
   318    // SIGTERM_LEADER stops the active leader node but does not delete its
   319    // data directories on disk for next restart. Then it waits "delay-ms"
   320    // before recovering this failure, in order to trigger election timeouts.
   321    // The expected behavior is that a new leader gets elected, and the
   322    // old leader comes back online and rejoins the cluster as a follower.
   323    // As always, after recovery, each member must be able to process
   324    // client requests.
   325    SIGTERM_LEADER = 2;
   326  
   327    // SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader node
   328    // but does not delete its data directories on disk for next restart.
   329    // And waits until most up-to-date node ("new" leader) applies the
   330    // snapshot count of entries since the stop operation.
   331    // The expected behavior is that cluster elects a new leader, and the
   332    // old leader comes back online and rejoins the cluster as a follower.
   333    // And it receives the snapshot from the new leader to overwrite its
   334    // store. As always, after recovery, each member must be able to
   335    // process client requests.
   336    SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT = 3;
   337  
   338    // SIGTERM_QUORUM stops majority number of nodes to make the whole cluster
   339    // inoperable but does not delete data directories on stopped nodes
   340    // for next restart. And it waits "delay-ms" before recovering failure.
   341    // The expected behavior is that nodes come back online, thus cluster
   342    // comes back operative as well. As always, after recovery, each member
   343    // must be able to process client requests.
   344    SIGTERM_QUORUM = 4;
   345  
   346    // SIGTERM_ALL stops the whole cluster but does not delete data directories
   347    // on disk for next restart. And it waits "delay-ms" before recovering
   348    // this failure.
   349    // The expected behavior is that nodes come back online, thus cluster
   350    // comes back operative as well. As always, after recovery, each member
   351    // must be able to process client requests.
   352    SIGTERM_ALL = 5;
   353  
   354    // SIGQUIT_AND_REMOVE_ONE_FOLLOWER stops a randomly chosen follower
   355    // (non-leader), deletes its data directories on disk, and removes
   356    // this member from cluster (membership reconfiguration). On recovery,
   357    // tester adds a new member, and this member joins the existing cluster
   358    // with fresh data. It waits "delay-ms" before recovering this
   359    // failure. This simulates destroying one follower machine, where operator
   360    // needs to add a new member from a fresh machine.
   361    // The expected behavior is that a new member joins the existing cluster,
   362    // and then each member continues to process client requests.
   363    SIGQUIT_AND_REMOVE_ONE_FOLLOWER = 10;
   364  
   365    // SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly
   366    // chosen follower, deletes its data directories on disk, and removes
   367    // this member from cluster (membership reconfiguration). On recovery,
   368    // tester adds a new member, and this member joins the existing cluster
   369    // with fresh data. On member remove, cluster waits until most up-to-date node
   370    // (leader) applies the snapshot count of entries since the stop operation.
   371    // This simulates destroying a follower machine, where operator needs to add
   372    // a new member from a fresh machine.
   373    // The expected behavior is that a new member joins the existing cluster,
   374    // and receives a snapshot from the active leader. As always, after
   375    // recovery, each member must be able to process client requests.
   376    SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 11;
   377  
   378    // SIGQUIT_AND_REMOVE_LEADER stops the active leader node, deletes its
   379    // data directories on disk, and removes this member from cluster.
   380    // On recovery, tester adds a new member, and this member joins the
   381    // existing cluster with fresh data. It waits "delay-ms" before
   382    // recovering this failure. This simulates destroying a leader machine,
   383    // where operator needs to add a new member from a fresh machine.
   384    // The expected behavior is that a new member joins the existing cluster,
   385    // and then each member continues to process client requests.
   386    SIGQUIT_AND_REMOVE_LEADER = 12;
   387  
   388    // SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader,
   389    // deletes its data directories on disk, and removes this member from
   390    // cluster (membership reconfiguration). On recovery, tester adds a new
   391    // member, and this member joins the existing cluster with fresh data. On member
   392    // remove, cluster waits until most up-to-date node (new leader) applies
   393    // the snapshot count of entries since the stop operation. This simulates
   394    // destroying a leader machine, where operator needs to add a new member
   395    // from a fresh machine.
   396    // The expected behavior is that on member remove, cluster elects a new
   397    // leader, and a new member joins the existing cluster and receives a
   398    // snapshot from the newly elected leader. As always, after recovery, each
   399    // member must be able to process client requests.
   400    SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT = 13;
   401  
   402    // SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH first
   403    // stops majority number of nodes, deletes data directories on those quorum
   404    // nodes, to make the whole cluster inoperable. Now that quorum and their
   405    // data are totally destroyed, cluster cannot even remove unavailable nodes
   406    // (e.g. 2 out of 3 are lost, so no leader can be elected).
   407    // Let's assume 3-node cluster of node A, B, and C. One day, node A and B
   408    // are destroyed and all their data are gone. The only viable solution is
   409    // to recover from C's latest snapshot.
   410    //
   411    // To simulate:
   412    //  1. Assume node C is the current leader with most up-to-date data.
   413    //  2. Download snapshot from node C, before destroying node A and B.
   414    //  3. Destroy node A and B, and make the whole cluster inoperable.
   415    //  4. Now node C cannot operate either.
   416    //  5. SIGTERM node C and remove its data directories.
   417    //  6. Restore a new seed member from node C's latest snapshot file.
   418    //  7. Add another member to establish 2-node cluster.
   419    //  8. Add another member to establish 3-node cluster.
   420    //  9. Add more if any.
   421    //
   422    // The expected behavior is that etcd successfully recovers from such
   423    // disastrous situation as only 1-node survives out of 3-node cluster,
   424    // new members join the existing cluster, and previous data from snapshot
   425    // are still preserved after recovery process. As always, after recovery,
   426    // each member must be able to process client requests.
   427    SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH = 14;
   428  
   429    // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER drops all outgoing/incoming
   430    // packets from/to the peer port on a randomly chosen follower
   431    // (non-leader), and waits for "delay-ms" until recovery.
   432    // The expected behavior is that once dropping operation is undone,
   433    // each member must be able to process client requests.
   434    BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER = 100;
   435  
   436    // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT drops
   437    // all outgoing/incoming packets from/to the peer port on a randomly
   438    // chosen follower (non-leader), and waits until most up-to-date node
   439    // (leader) applies the snapshot count of entries since the blackhole
   440    // operation.
   441    // The expected behavior is that once packet drop operation is undone,
   442    // the slow follower tries to catch up, possibly receiving the snapshot
   443    // from the active leader. As always, after recovery, each member must
   444    // be able to process client requests.
   445    BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 101;
   446  
   447    // BLACKHOLE_PEER_PORT_TX_RX_LEADER drops all outgoing/incoming packets
   448    // from/to the peer port on the active leader (isolated), and waits for
   449    // "delay-ms" until recovery, in order to trigger election timeout.
   450    // The expected behavior is that after election timeout, a new leader gets
   451    // elected, and once dropping operation is undone, the old leader comes
   452    // back and rejoins the cluster as a follower. As always, after recovery,
   453    // each member must be able to process client requests.
   454    BLACKHOLE_PEER_PORT_TX_RX_LEADER = 102;
   455  
   456    // BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT drops all
   457    // outgoing/incoming packets from/to the peer port on the active leader,
   458    // and waits until most up-to-date node (leader) applies the snapshot
   459    // count of entries since the blackhole operation.
   460    // The expected behavior is that cluster elects a new leader, and once
   461    // dropping operation is undone, the old leader comes back and rejoins
   462    // the cluster as a follower. The slow follower tries to catch up, likely
   463    // receiving the snapshot from the new active leader. As always, after
   464    // recovery, each member must be able to process client requests.
   465    BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 103;
   466  
   467    // BLACKHOLE_PEER_PORT_TX_RX_QUORUM drops all outgoing/incoming packets
   468    // from/to the peer ports on majority nodes of cluster, thus losing its
   469    // leader and cluster being inoperable. And it waits for "delay-ms"
   470    // until recovery.
   471    // The expected behavior is that once packet drop operation is undone,
   472    // nodes come back online, thus cluster comes back operative. As always,
   473    // after recovery, each member must be able to process client requests.
   474    BLACKHOLE_PEER_PORT_TX_RX_QUORUM = 104;
   475  
   476    // BLACKHOLE_PEER_PORT_TX_RX_ALL drops all outgoing/incoming packets
   477    // from/to the peer ports on all nodes, thus making cluster totally
   478    // inoperable. It waits for "delay-ms" until recovery.
   479    // The expected behavior is that once packet drop operation is undone,
   480    // nodes come back online, thus cluster comes back operative. As always,
   481    // after recovery, each member must be able to process client requests.
   482    BLACKHOLE_PEER_PORT_TX_RX_ALL = 105;
   483  
   484    // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming packets
   485    // from/to the peer port on a randomly chosen follower (non-leader).
   486    // It waits for "delay-ms" until recovery.
   487    // The expected behavior is that once packet delay operation is undone,
   488    // the follower comes back and tries to catch up with latest changes from
   489    // cluster. And as always, after recovery, each member must be able to
   490    // process client requests.
   491    DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 200;
   492  
   493    // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming
   494    // packets from/to the peer port on a randomly chosen follower
   495    // (non-leader) with a randomized time duration (thus isolated). It
   496    // waits for "delay-ms" until recovery.
   497    // The expected behavior is that once packet delay operation is undone,
   498    // each member must be able to process client requests.
   499    RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 201;
   500  
   501    // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
   502    // outgoing/incoming packets from/to the peer port on a randomly chosen
   503    // follower (non-leader), and waits until most up-to-date node (leader)
   504    // applies the snapshot count of entries since the delay operation.
   505    // The expected behavior is that the delayed follower gets isolated
   506    // and behind the current active leader, and once delay operation is undone,
   507    // the slow follower comes back and catches up possibly receiving snapshot
   508    // from the active leader. As always, after recovery, each member must be
   509    // able to process client requests.
   510    DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 202;
   511  
   512    // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
   513    // outgoing/incoming packets from/to the peer port on a randomly chosen
   514    // follower (non-leader) with a randomized time duration, and waits until
   515    // most up-to-date node (leader) applies the snapshot count of entries
   516    // since the delay operation.
   517    // The expected behavior is that the delayed follower gets isolated
   518    // and behind the current active leader, and once delay operation is undone,
   519    // the slow follower comes back and catches up, possibly receiving a
   520    // snapshot from the active leader. As always, after recovery, each member
   521    // must be able to process client requests.
   522    RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 203;
   523  
   524    // DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets from/to
   525    // the peer port on the active leader. And waits for "delay-ms" until
   526    // recovery.
   527    // The expected behavior is that cluster may elect a new leader, and
   528    // once packet delay operation is undone, the (old) leader comes back
   529    // and tries to catch up with latest changes from cluster. As always,
   530    // after recovery, each member must be able to process client requests.
   531    DELAY_PEER_PORT_TX_RX_LEADER = 204;
   532  
   533    // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets
   534    // from/to the peer port on the active leader with a randomized time
   535    // duration. And it waits for "delay-ms" until recovery.
   536    // The expected behavior is that cluster may elect a new leader, and
   537    // once packet delay operation is undone, the (old) leader comes back
   538    // and tries to catch up with latest changes from cluster. As always,
   539    // after recovery, each member must be able to process client requests.
   540    RANDOM_DELAY_PEER_PORT_TX_RX_LEADER = 205;
   541  
   542    // DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
   543    // outgoing/incoming packets from/to the peer port on the active leader,
   544    // and waits until the most up-to-date node (current or new leader) applies
   545    // the snapshot count of entries since the delay operation.
   546    // The expected behavior is that cluster may elect a new leader, and
   547    // the old leader gets isolated and behind the current active leader,
   548    // and once delay operation is undone, the slow follower comes back
   549    // and catches up, likely receiving a snapshot from the active leader.
   550    // As always, after recovery, each member must be able to process client
   551    // requests.
   552    DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 206;
   553  
   554    // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
   555    // outgoing/incoming packets from/to the peer port on the active leader,
   556    // with a randomized time duration. And it waits until the most up-to-date
   557    // node (current or new leader) applies the snapshot count of entries since
   558    // the delay operation.
   559    // The expected behavior is that cluster may elect a new leader, and
   560    // the old leader gets isolated and behind the current active leader,
   561    // and once delay operation is undone, the slow follower comes back
   562    // and catches up, likely receiving a snapshot from the active leader.
   563    // As always, after recovery, each member must be able to process client
   564    // requests.
   565    RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 207;
   566  
   567    // DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets from/to
   568    // the peer ports on majority nodes of cluster. And it waits for
   569    // "delay-ms" until recovery, likely to trigger election timeouts.
   570    // The expected behavior is that cluster may elect a new leader, while
   571    // quorum of nodes struggle with slow networks, and once delay operation
   572    // is undone, nodes come back and cluster comes back operative. As always,
   573    // after recovery, each member must be able to process client requests.
   574    DELAY_PEER_PORT_TX_RX_QUORUM = 208;
   575  
   576    // RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets
   577    // from/to the peer ports on majority nodes of cluster, with randomized
   578    // time durations. And it waits for "delay-ms" until recovery, likely
   579    // to trigger election timeouts.
   580    // The expected behavior is that cluster may elect a new leader, while
   581    // quorum of nodes struggle with slow networks, and once delay operation
   582    // is undone, nodes come back and cluster comes back operative. As always,
   583    // after recovery, each member must be able to process client requests.
   584    RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM = 209;
   585  
   586    // DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets from/to the
   587    // peer ports on all nodes. And it waits for "delay-ms" until recovery,
   588    // likely to trigger election timeouts.
   589    // The expected behavior is that cluster may become totally inoperable,
   590    // struggling with slow networks across the whole cluster. Once delay
   591    // operation is undone, nodes come back and cluster comes back operative.
   592    // As always, after recovery, each member must be able to process client
   593    // requests.
   594    DELAY_PEER_PORT_TX_RX_ALL = 210;
   595  
   596    // RANDOM_DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets
   597    // from/to the peer ports on all nodes, with randomized time durations.
   598    // And it waits for "delay-ms" until recovery, likely to trigger
   599    // election timeouts.
   600    // The expected behavior is that cluster may become totally inoperable,
   601    // struggling with slow networks across the whole cluster. Once delay
   602    // operation is undone, nodes come back and cluster comes back operative.
   603    // As always, after recovery, each member must be able to process client
   604    // requests.
   605    RANDOM_DELAY_PEER_PORT_TX_RX_ALL = 211;
   606  
   607    // NO_FAIL_WITH_STRESS stops injecting failures while testing the
   608    // consistency and correctness under pressure loads, for the duration of
   609    // "delay-ms". The goal is to ensure the cluster is still making progress
   610    // on recovery, and verify system does not deadlock following a sequence
   611    // of failure injections.
   612    // The expected behavior is that cluster remains fully operative in healthy
   613    // condition. As always, after recovery, each member must be able to process
   614    // client requests.
   615    NO_FAIL_WITH_STRESS = 300;
   616  
   617    // NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS neither injects failures nor
   618    // sends stressing client requests to the cluster, for the duration of
   619    // "delay-ms". The goal is to ensure the cluster is still making progress
   620    // on recovery, and verify system does not deadlock following a sequence
   621    // of failure injections.
   622    // The expected behavior is that cluster remains fully operative in healthy
   623    // condition, and client requests during liveness period succeed without
   624    // errors.
   625    // Note: this is how Google Chubby does failure injection testing
   626    // https://static.googleusercontent.com/media/research.google.com/en//archive/paxos_made_live.pdf.
   627    NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS = 301;
   628  
   629    // FAILPOINTS injects failpoints to etcd server runtime, triggering panics
   630    // in critical code paths.
   631    FAILPOINTS = 400;
   632  
   633    // FAILPOINTS_WITH_DISK_IO_LATENCY injects high disk I/O latency failure in raftAfterSave code paths.
   634    FAILPOINTS_WITH_DISK_IO_LATENCY = 401;
   635  
   636    // EXTERNAL runs external failure injection scripts.
   637    EXTERNAL = 500;
   638  }