github.com/hernad/nomad@v1.6.112/command/agent/retry_join.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package agent
     5  
     6  import (
     7  	"fmt"
     8  	golog "log"
     9  	"strings"
    10  	"time"
    11  
    12  	log "github.com/hashicorp/go-hclog"
    13  )
    14  
    15  // DiscoverInterface is an interface for the Discover type in the go-discover
    16  // library. Using an interface allows for ease of testing.
    17  type DiscoverInterface interface {
    18  	// Addrs discovers ip addresses of nodes that match the given filter
    19  	// criteria.
    20  	// The config string must have the format 'provider=xxx key=val key=val ...'
    21  	// where the keys and values are provider specific. The values are URL
    22  	// encoded.
    23  	Addrs(string, *golog.Logger) ([]string, error)
    24  
    25  	// Help describes the format of the configuration string for address
    26  	// discovery and the various provider specific options.
    27  	Help() string
    28  
    29  	// Names returns the names of the configured providers.
    30  	Names() []string
    31  }
    32  
    33  // retryJoiner is used to handle retrying a join until it succeeds or all of
    34  // its tries are exhausted.
    35  type retryJoiner struct {
    36  	// serverJoin adds the specified servers to the serf cluster
    37  	serverJoin func([]string) (int, error)
    38  
    39  	// serverEnabled indicates whether the nomad agent will run in server mode
    40  	serverEnabled bool
    41  
    42  	// clientJoin adds the specified servers to the serf cluster
    43  	clientJoin func([]string) (int, error)
    44  
    45  	// clientEnabled indicates whether the nomad agent will run in client mode
    46  	clientEnabled bool
    47  
    48  	// discover is of type Discover, where this is either the go-discover
    49  	// implementation or a mock used for testing
    50  	discover DiscoverInterface
    51  
    52  	// errCh is used to communicate with the agent when the max retry attempt
    53  	// limit has been reached
    54  	errCh chan struct{}
    55  
    56  	// logger is the retry joiners logger
    57  	logger log.Logger
    58  }
    59  
    60  // Validate ensures that the configuration passes validity checks for the
    61  // retry_join block. If the configuration is not valid, returns an error that
    62  // will be displayed to the operator, otherwise nil.
    63  func (r *retryJoiner) Validate(config *Config) error {
    64  
    65  	// If retry_join is defined for the server, ensure that deprecated
    66  	// fields and the server_join block are not both set
    67  	if config.Server != nil && config.Server.ServerJoin != nil && len(config.Server.ServerJoin.RetryJoin) != 0 {
    68  		if len(config.Server.RetryJoin) != 0 {
    69  			return fmt.Errorf("server_join and retry_join cannot both be defined; prefer setting the server_join block")
    70  		}
    71  		if len(config.Server.StartJoin) != 0 {
    72  			return fmt.Errorf("server_join and start_join cannot both be defined; prefer setting the server_join block")
    73  		}
    74  		if config.Server.RetryMaxAttempts != 0 {
    75  			return fmt.Errorf("server_join and retry_max cannot both be defined; prefer setting the server_join block")
    76  		}
    77  
    78  		if config.Server.RetryInterval != 0 {
    79  			return fmt.Errorf("server_join and retry_interval cannot both be defined; prefer setting the server_join block")
    80  		}
    81  
    82  		if len(config.Server.ServerJoin.StartJoin) != 0 {
    83  			return fmt.Errorf("retry_join and start_join cannot both be defined")
    84  		}
    85  	}
    86  
    87  	// if retry_join is defined for the client, ensure that start_join is not
    88  	// set as this configuration is only defined for servers.
    89  	if config.Client != nil && config.Client.ServerJoin != nil {
    90  		if config.Client.ServerJoin.StartJoin != nil {
    91  			return fmt.Errorf("start_join is not supported for Nomad clients")
    92  		}
    93  	}
    94  
    95  	return nil
    96  }
    97  
    98  // retryJoin is used to handle retrying a join until it succeeds or all retries
    99  // are exhausted.
   100  func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) {
   101  	if len(serverJoin.RetryJoin) == 0 {
   102  		return
   103  	}
   104  
   105  	attempt := 0
   106  
   107  	addrsToJoin := strings.Join(serverJoin.RetryJoin, " ")
   108  	r.logger.Info("starting retry join", "servers", addrsToJoin)
   109  
   110  	standardLogger := r.logger.StandardLogger(&log.StandardLoggerOptions{InferLevels: true})
   111  	for {
   112  		var addrs []string
   113  		var n int
   114  		var err error
   115  
   116  		for _, addr := range serverJoin.RetryJoin {
   117  			switch {
   118  			case strings.HasPrefix(addr, "provider="):
   119  				servers, err := r.discover.Addrs(addr, standardLogger)
   120  				if err != nil {
   121  					r.logger.Error("determining join addresses failed", "error", err)
   122  				} else {
   123  					addrs = append(addrs, servers...)
   124  				}
   125  			default:
   126  				addrs = append(addrs, addr)
   127  			}
   128  		}
   129  
   130  		if len(addrs) > 0 {
   131  			if r.serverEnabled && r.serverJoin != nil {
   132  				n, err = r.serverJoin(addrs)
   133  				if err == nil {
   134  					r.logger.Info("retry join completed", "initial_servers", n, "agent_mode", "server")
   135  					return
   136  				}
   137  			}
   138  			if r.clientEnabled && r.clientJoin != nil {
   139  				n, err = r.clientJoin(addrs)
   140  				if err == nil {
   141  					r.logger.Info("retry join completed", "initial_servers", n, "agent_mode", "client")
   142  					return
   143  				}
   144  			}
   145  		}
   146  
   147  		attempt++
   148  		if serverJoin.RetryMaxAttempts > 0 && attempt > serverJoin.RetryMaxAttempts {
   149  			r.logger.Error("max join retry exhausted, exiting")
   150  			close(r.errCh)
   151  			return
   152  		}
   153  
   154  		if err != nil {
   155  			r.logger.Warn("join failed", "error", err, "retry", serverJoin.RetryInterval)
   156  		}
   157  		time.Sleep(serverJoin.RetryInterval)
   158  	}
   159  }