github.com/hashicorp/nomad/api@v0.0.0-20240306165712-3193ac204f65/operator_autopilot.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package api
     5  
     6  import (
     7  	"encoding/json"
     8  	"strconv"
     9  	"time"
    10  )
    11  
    12  // AutopilotConfiguration is used for querying/setting the Autopilot configuration.
    13  // Autopilot helps manage operator tasks related to Nomad servers like removing
    14  // failed servers from the Raft quorum.
    15  type AutopilotConfiguration struct {
    16  	// CleanupDeadServers controls whether to remove dead servers from the Raft
    17  	// peer list when a new server joins
    18  	CleanupDeadServers bool
    19  
    20  	// LastContactThreshold is the limit on the amount of time a server can go
    21  	// without leader contact before being considered unhealthy.
    22  	LastContactThreshold time.Duration
    23  
    24  	// MaxTrailingLogs is the amount of entries in the Raft Log that a server can
    25  	// be behind before being considered unhealthy.
    26  	MaxTrailingLogs uint64
    27  
    28  	// MinQuorum sets the minimum number of servers allowed in a cluster before
    29  	// autopilot can prune dead servers.
    30  	MinQuorum uint
    31  
    32  	// ServerStabilizationTime is the minimum amount of time a server must be
    33  	// in a stable, healthy state before it can be added to the cluster. Only
    34  	// applicable with Raft protocol version 3 or higher.
    35  	ServerStabilizationTime time.Duration
    36  
    37  	// (Enterprise-only) EnableRedundancyZones specifies whether to enable redundancy zones.
    38  	EnableRedundancyZones bool
    39  
    40  	// (Enterprise-only) DisableUpgradeMigration will disable Autopilot's upgrade migration
    41  	// strategy of waiting until enough newer-versioned servers have been added to the
    42  	// cluster before promoting them to voters.
    43  	DisableUpgradeMigration bool
    44  
    45  	// (Enterprise-only) EnableCustomUpgrades specifies whether to enable using custom
    46  	// upgrade versions when performing migrations.
    47  	EnableCustomUpgrades bool
    48  
    49  	// CreateIndex holds the index corresponding the creation of this configuration.
    50  	// This is a read-only field.
    51  	CreateIndex uint64
    52  
    53  	// ModifyIndex will be set to the index of the last update when retrieving the
    54  	// Autopilot configuration. Resubmitting a configuration with
    55  	// AutopilotCASConfiguration will perform a check-and-set operation which ensures
    56  	// there hasn't been a subsequent update since the configuration was retrieved.
    57  	ModifyIndex uint64
    58  }
    59  
    60  func (u *AutopilotConfiguration) MarshalJSON() ([]byte, error) {
    61  	type Alias AutopilotConfiguration
    62  	return json.Marshal(&struct {
    63  		LastContactThreshold    string
    64  		ServerStabilizationTime string
    65  		*Alias
    66  	}{
    67  		LastContactThreshold:    u.LastContactThreshold.String(),
    68  		ServerStabilizationTime: u.ServerStabilizationTime.String(),
    69  		Alias:                   (*Alias)(u),
    70  	})
    71  }
    72  
    73  func (u *AutopilotConfiguration) UnmarshalJSON(data []byte) error {
    74  	type Alias AutopilotConfiguration
    75  	aux := &struct {
    76  		LastContactThreshold    string
    77  		ServerStabilizationTime string
    78  		*Alias
    79  	}{
    80  		Alias: (*Alias)(u),
    81  	}
    82  	if err := json.Unmarshal(data, &aux); err != nil {
    83  		return err
    84  	}
    85  	var err error
    86  	if aux.LastContactThreshold != "" {
    87  		if u.LastContactThreshold, err = time.ParseDuration(aux.LastContactThreshold); err != nil {
    88  			return err
    89  		}
    90  	}
    91  	if aux.ServerStabilizationTime != "" {
    92  		if u.ServerStabilizationTime, err = time.ParseDuration(aux.ServerStabilizationTime); err != nil {
    93  			return err
    94  		}
    95  	}
    96  	return nil
    97  }
    98  
    99  // ServerHealth is the health (from the leader's point of view) of a server.
   100  type ServerHealth struct {
   101  	// ID is the raft ID of the server.
   102  	ID string
   103  
   104  	// Name is the node name of the server.
   105  	Name string
   106  
   107  	// Address is the address of the server.
   108  	Address string
   109  
   110  	// The status of the SerfHealth check for the server.
   111  	SerfStatus string
   112  
   113  	// Version is the Nomad version of the server.
   114  	Version string
   115  
   116  	// Leader is whether this server is currently the leader.
   117  	Leader bool
   118  
   119  	// LastContact is the time since this node's last contact with the leader.
   120  	LastContact time.Duration
   121  
   122  	// LastTerm is the highest leader term this server has a record of in its Raft log.
   123  	LastTerm uint64
   124  
   125  	// LastIndex is the last log index this server has a record of in its Raft log.
   126  	LastIndex uint64
   127  
   128  	// Healthy is whether or not the server is healthy according to the current
   129  	// Autopilot config.
   130  	Healthy bool
   131  
   132  	// Voter is whether this is a voting server.
   133  	Voter bool
   134  
   135  	// StableSince is the last time this server's Healthy value changed.
   136  	StableSince time.Time
   137  }
   138  
   139  func (u *ServerHealth) MarshalJSON() ([]byte, error) {
   140  	type Alias ServerHealth
   141  	return json.Marshal(&struct {
   142  		LastContact string
   143  		*Alias
   144  	}{
   145  		LastContact: u.LastContact.String(),
   146  		Alias:       (*Alias)(u),
   147  	})
   148  }
   149  
   150  func (u *ServerHealth) UnmarshalJSON(data []byte) error {
   151  	type Alias ServerHealth
   152  	aux := &struct {
   153  		LastContact string
   154  		*Alias
   155  	}{
   156  		Alias: (*Alias)(u),
   157  	}
   158  	if err := json.Unmarshal(data, &aux); err != nil {
   159  		return err
   160  	}
   161  	var err error
   162  	if aux.LastContact != "" {
   163  		if u.LastContact, err = time.ParseDuration(aux.LastContact); err != nil {
   164  			return err
   165  		}
   166  	}
   167  	return nil
   168  }
   169  
// OperatorHealthReply is a representation of the overall health of the cluster.
// It is the value AutopilotServerHealth passes to the client query as the
// response destination.
type OperatorHealthReply struct {
	// Healthy is true if all the servers in the cluster are healthy.
	Healthy bool

	// FailureTolerance is the number of healthy servers that could be lost without
	// an outage occurring.
	FailureTolerance int

	// Servers holds the health of each server.
	Servers []ServerHealth
}
   182  
   183  // AutopilotGetConfiguration is used to query the current Autopilot configuration.
   184  func (op *Operator) AutopilotGetConfiguration(q *QueryOptions) (*AutopilotConfiguration, *QueryMeta, error) {
   185  	var resp AutopilotConfiguration
   186  	qm, err := op.c.query("/v1/operator/autopilot/configuration", &resp, q)
   187  	if err != nil {
   188  		return nil, nil, err
   189  	}
   190  	return &resp, qm, nil
   191  }
   192  
   193  // AutopilotSetConfiguration is used to set the current Autopilot configuration.
   194  func (op *Operator) AutopilotSetConfiguration(conf *AutopilotConfiguration, q *WriteOptions) (*WriteMeta, error) {
   195  	var out bool
   196  	wm, err := op.c.put("/v1/operator/autopilot/configuration", conf, &out, q)
   197  	if err != nil {
   198  		return nil, err
   199  	}
   200  	return wm, nil
   201  }
   202  
   203  // AutopilotCASConfiguration is used to perform a Check-And-Set update on the
   204  // Autopilot configuration. The ModifyIndex value will be respected. Returns
   205  // true on success or false on failures.
   206  func (op *Operator) AutopilotCASConfiguration(conf *AutopilotConfiguration, q *WriteOptions) (bool, *WriteMeta, error) {
   207  	var out bool
   208  	wm, err := op.c.put("/v1/operator/autopilot/configuration?cas="+strconv.FormatUint(conf.ModifyIndex, 10), conf, &out, q)
   209  	if err != nil {
   210  		return false, nil, err
   211  	}
   212  
   213  	return out, wm, nil
   214  }
   215  
   216  // AutopilotServerHealth is used to query Autopilot's top-level view of the health
   217  // of each Nomad server.
   218  func (op *Operator) AutopilotServerHealth(q *QueryOptions) (*OperatorHealthReply, *QueryMeta, error) {
   219  	var out OperatorHealthReply
   220  	qm, err := op.c.query("/v1/operator/autopilot/health", &out, q)
   221  	if err != nil {
   222  		return nil, nil, err
   223  	}
   224  	return &out, qm, nil
   225  }