github.com/cilium/cilium@v1.16.2/clustermesh-apiserver/etcdinit/root.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package etcdinit
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  	"os"
    11  	"os/exec"
    12  	"path"
    13  	"strings"
    14  	"syscall"
    15  	"time"
    16  
    17  	"github.com/sirupsen/logrus"
    18  	"github.com/spf13/cobra"
    19  	"github.com/spf13/viper"
    20  	clientv3 "go.etcd.io/etcd/client/v3"
    21  
    22  	"github.com/cilium/cilium/pkg/defaults"
    23  	kvstoreEtcdInit "github.com/cilium/cilium/pkg/kvstore/etcdinit"
    24  	"github.com/cilium/cilium/pkg/logging"
    25  	"github.com/cilium/cilium/pkg/logging/logfields"
    26  	"github.com/cilium/cilium/pkg/option"
    27  	"github.com/cilium/cilium/pkg/version"
    28  )
    29  
// EtcdBinaryLocation is hardcoded because we expect this command to be run inside a Cilium container that places the
// etcd binary in a specific location. Using an absolute path also avoids any PATH-based binary substitution.
const EtcdBinaryLocation = "/usr/bin/etcd"

var (
	// log is the package logger, tagged with the "etcdinit" subsystem so its output is distinguishable.
	log = logging.DefaultLogger.WithField(logfields.LogSubsys, "etcdinit")
	// vp is this command's private viper instance; it merges command line flags and CILIUM_* environment variables.
	vp  = viper.New()
)
    38  
    39  func NewCmd() *cobra.Command {
    40  	rootCmd := &cobra.Command{
    41  		Use:   "etcdinit",
    42  		Short: "Initialise an etcd data directory for use by the etcd sidecar of clustermesh-apiserver",
    43  		PreRun: func(cmd *cobra.Command, args []string) {
    44  			option.LogRegisteredOptions(vp, log)
    45  			log.Infof("Cilium ClusterMesh etcd init %s", version.Version)
    46  		},
    47  		Run: func(cmd *cobra.Command, args []string) {
    48  			err := InitEtcdLocal()
    49  			// The error has already been handled and logged by InitEtcdLocal. We just use it to determine the exit code
    50  			if err != nil {
    51  				os.Exit(-1)
    52  			}
    53  		},
    54  	}
    55  	rootCmd.Flags().String("etcd-data-dir", "/var/run/etcd", "Etcd data directory. Should have read/write permissions here.")
    56  	rootCmd.Flags().String("etcd-initial-cluster-token", "clustermesh-apiserver", "Etcd initial cluster token. Used to prevent accidentally joining other etcd clusters that are reachable on the same L2 network domain.")
    57  	rootCmd.Flags().String("etcd-cluster-name", "clustermesh-apiserver", "Name of the etcd cluster. Must match what etcd is later started with.")
    58  	rootCmd.Flags().String("cluster-name", defaults.ClusterName, "Name of the Cilium cluster, used to set the username of the admin user in etcd. This is distinct from the etcd cluster's name.")
    59  	rootCmd.Flags().Duration("timeout", time.Minute*2, "How long to wait for operations before exiting.")
    60  	rootCmd.Flags().Bool("debug", false, "Debug log output.")
    61  	// Use Viper for configuration so that we can parse both command line flags and environment variables
    62  	vp.BindPFlags(rootCmd.Flags())
    63  	vp.SetEnvPrefix("cilium")
    64  	vp.AutomaticEnv()
    65  	vp.SetEnvKeyReplacer(strings.NewReplacer("-", "_"))
    66  	return rootCmd
    67  }
    68  
// InitEtcdLocal performs first-time initialisation of an etcd data directory
// for the clustermesh-apiserver: it empties the configured data directory,
// launches a localhost-only etcd process, runs the Cilium clustermesh init
// logic against it via kvstoreEtcdInit.ClusterMeshEtcdInit, and then shuts
// the etcd process down again. All errors are logged here before being
// returned, so callers only need the error to pick an exit code.
func InitEtcdLocal() (returnErr error) {
	// Get configuration values
	etcdDataDir := vp.GetString("etcd-data-dir")
	etcdInitialClusterToken := vp.GetString("etcd-initial-cluster-token")
	etcdClusterName := vp.GetString("etcd-cluster-name")
	ciliumClusterName := vp.GetString("cluster-name")
	debug := vp.GetBool("debug")
	timeout := vp.GetDuration("timeout")
	// We have returnErr as a named variable, so we can set it in the deferred cleanup function if needed
	log.WithFields(logrus.Fields{
		"timeout":                 timeout,
		"etcdDataDir":             etcdDataDir,
		"etcdClusterName":         etcdClusterName,
		logfields.ClusterName:     ciliumClusterName,
		"etcdInitialClusterToken": etcdInitialClusterToken,
	}).
		Info("Starting first-time initialisation of etcd for Cilium Clustermesh")

	// One shared deadline bounds the whole initialisation, including the lifetime
	// of the etcd child process (started with exec.CommandContext below).
	ctx, cancelFn := context.WithTimeout(context.Background(), timeout)
	defer cancelFn()

	if debug {
		logging.SetLogLevelToDebug()
	}
	log.Debug("Debug logging enabled")

	// When the clustermesh-apiserver is launched we create a new etcd. We don't support persistence, so it is safe to
	// delete the contents of the data directory before we start. It should be empty as we use a Kubernetes emptyDir for
	// this purpose, but if the initialization failed Kubernetes may re-run this operation and emptyDir is tied to the
	// lifecycle of the whole pod. Therefore, it could contain files from a previously failed initialization attempt.
	log.WithField("etcdDataDir", etcdDataDir).
		Info("Deleting contents of data directory")
	// We don't use os.RemoveAll on the etcdDataDirectory because we don't want to remove the directory itself, just
	// everything inside of it. In most cases that directory will be a mount anyway.
	dir, err := os.ReadDir(etcdDataDir)
	if err != nil {
		log.WithField("etcdDataDir", etcdDataDir).
			WithError(err).
			Error("Failed to read from the etcd data directory while attempting to delete existing files")
		return err
	}
	for _, d := range dir {
		log.WithField("etcdDataDir", etcdDataDir).
			WithField("path", d.Name()).
			Debug("Removing file/directory in data dir")
		err = os.RemoveAll(path.Join(etcdDataDir, d.Name()))
		if err != nil {
			log.WithField("etcdDataDir", etcdDataDir).
				WithField("path", d.Name()).
				WithError(err).
				Error("Failed to remove pre-existing file/directory in etcd data directory")
			return err
		}
	}

	// Use "localhost" (instead of "http://127.0.0.1:2379" or "http://[::1]:2379") so it works in both the IPv4 and
	// IPv6 cases.
	loopbackEndpoint := "http://localhost:2379"
	log.WithFields(logrus.Fields{
		"etcdDataDir":             etcdDataDir,
		"etcdListenClientUrl":     loopbackEndpoint,
		"etcdClusterName":         etcdClusterName,
		"etcdInitialClusterToken": etcdInitialClusterToken,
	}).
		Info("Starting localhost-only etcd process")
	// Specify the full path to the etcd binary to avoid any PATH search binary replacement nonsense
	etcdCmd := exec.CommandContext(ctx, EtcdBinaryLocation,
		fmt.Sprintf("--data-dir=%s", etcdDataDir),
		fmt.Sprintf("--name=%s", etcdClusterName),
		fmt.Sprintf("--listen-client-urls=%s", loopbackEndpoint),
		fmt.Sprintf("--advertise-client-urls=%s", loopbackEndpoint),
		fmt.Sprintf("--initial-cluster-token=%s", etcdInitialClusterToken),
		"--initial-cluster-state=new")
	log.WithField("etcdBinary", EtcdBinaryLocation).
		WithField("etcdFlags", etcdCmd.Args).
		Debug("Executing etcd")

	// Exec the etcd binary, which ultimately calls fork(2) under the hood. We don't wait on its completion, because
	// it'll never complete of course.
	err = etcdCmd.Start()
	if err != nil {
		log.WithField("etcdBinary", EtcdBinaryLocation).
			WithField("etcdFlags", etcdCmd.Args).
			WithError(err).
			Error("Failed to launch etcd process")
		return err
	}
	// Capture the PID once so the cleanup logs below stay meaningful even after the process exits.
	etcdPid := etcdCmd.Process.Pid
	log.WithField("etcdPID", etcdPid).
		Info("Local etcd server process started")

	// Defer etcd process cleanup. This runs on every exit path (success or error) and merges any shutdown
	// failure into returnErr via errors.Join, so a clean init followed by a failed shutdown is still an error.
	defer func() {
		// Shadow the package logger with one that carries the PID for all cleanup messages.
		log := log.WithField("etcdPID", etcdPid)
		log.Debug("Cleaning up etcd process")
		// Send the process a SIGTERM. SIGTERM is the "gentle" shutdown signal, and etcd should close down its resources
		// cleanly and then exit.
		log.Info("Sending SIGTERM signal to etcd process")
		err := etcdCmd.Process.Signal(syscall.SIGTERM)
		if err != nil {
			log.WithError(err).
				Error("Failed to send SIGTERM signal to etcd process")
			// Return both this error, and the main function's return error (if there is one).
			returnErr = errors.Join(returnErr, err)
			return
		}

		// Wait for the etcd process to finish, and cleanup resources.
		log.Info("Waiting for etcd process to exit")
		err = etcdCmd.Wait()
		if err != nil {
			exitError := &exec.ExitError{}
			if errors.As(err, &exitError) {
				// ExitCode() == -1 means the process was killed by a signal rather than exiting on its own.
				if exitError.ExitCode() == -1 {
					// We SIGTERMed the etcd process, so a nonzero exit code is expected.
					// Check the context as a last sanity check
					if ctx.Err() != nil {
						// Don't log the error itself here, if the context is timed out it'll be cancelled, so the error
						// will just say "context cancelled" and not be useful — and possibly even misleading. It's
						// possible that the timeout expires at the moment between etcd exiting normally and this check,
						// which would report a false error. That's very unlikely, so we don't worry about it here.
						log.WithField("timeout", timeout).
							Error("etcd exited, but our context has expired. etcd may have been terminated due to timeout. Consider increasing the value of the timeout using the --timeout flag or CILIUM_TIMEOUT environment variable.")
						// Return both this error, and the main function's return error (if there is one). This is just
						// to make sure that the calling code correctly detects that an error occurs.
						returnErr = errors.Join(returnErr, ctx.Err())
						return
					}
					// This is the "good state", the context hasn't expired, the etcd process has exited, and we're
					// okay with a nonzero exit code because we exited it with a SIGTERM.
					log.Info("etcd process exited")
					return
				}
				// etcd exited by itself with a nonzero code — that's a real failure, not our SIGTERM.
				log.WithError(err).
					WithField("etcdExitCode", exitError.ExitCode()).
					Error("etcd process exited improperly")
				// Return both this error, and the main function's return error (if there is one).
				returnErr = errors.Join(returnErr, err)
				return
			} else {
				// Some other kind of error
				log.WithError(err).
					Error("Failed to wait on etcd process finishing")
				// Return both this error, and the main function's return error (if there is one).
				returnErr = errors.Join(returnErr, err)
				return
			}
		}
		log.Info("etcd process exited")
	}()

	// With the etcd server process launched, we need to construct an etcd client
	config := clientv3.Config{
		Context:   ctx,
		Endpoints: []string{loopbackEndpoint},
	}
	log.WithField("etcdClientConfig", fmt.Sprintf("%+v", config)).
		Debug("Constructed etcd client config")
	etcdClient, err := clientv3.New(config)
	if err != nil {
		log.WithField("etcdClientConfig", fmt.Sprintf("%+v", config)).
			WithError(err).
			Error("Failed to construct etcd client from configuration")
		return err
	}
	defer etcdClient.Close()

	// Run the init commands
	log.WithField(logfields.ClusterName, ciliumClusterName).
		Info("Starting etcd init")
	err = kvstoreEtcdInit.ClusterMeshEtcdInit(ctx, log, etcdClient, ciliumClusterName)
	if err != nil {
		log.WithError(err).
			WithField(logfields.ClusterName, ciliumClusterName).
			Error("Failed to initialise etcd")
		return err
	}
	log.Info("Etcd init completed")
	return nil
}