github.com/cilium/cilium@v1.16.2/clustermesh-apiserver/etcdinit/root.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package etcdinit 5 6 import ( 7 "context" 8 "errors" 9 "fmt" 10 "os" 11 "os/exec" 12 "path" 13 "strings" 14 "syscall" 15 "time" 16 17 "github.com/sirupsen/logrus" 18 "github.com/spf13/cobra" 19 "github.com/spf13/viper" 20 clientv3 "go.etcd.io/etcd/client/v3" 21 22 "github.com/cilium/cilium/pkg/defaults" 23 kvstoreEtcdInit "github.com/cilium/cilium/pkg/kvstore/etcdinit" 24 "github.com/cilium/cilium/pkg/logging" 25 "github.com/cilium/cilium/pkg/logging/logfields" 26 "github.com/cilium/cilium/pkg/option" 27 "github.com/cilium/cilium/pkg/version" 28 ) 29 30 // EtcdBinaryLocation is hardcoded because we expect this command to be run inside a Cilium container that places the 31 // etcd binary in a specific location. 32 const EtcdBinaryLocation = "/usr/bin/etcd" 33 34 var ( 35 log = logging.DefaultLogger.WithField(logfields.LogSubsys, "etcdinit") 36 vp = viper.New() 37 ) 38 39 func NewCmd() *cobra.Command { 40 rootCmd := &cobra.Command{ 41 Use: "etcdinit", 42 Short: "Initialise an etcd data directory for use by the etcd sidecar of clustermesh-apiserver", 43 PreRun: func(cmd *cobra.Command, args []string) { 44 option.LogRegisteredOptions(vp, log) 45 log.Infof("Cilium ClusterMesh etcd init %s", version.Version) 46 }, 47 Run: func(cmd *cobra.Command, args []string) { 48 err := InitEtcdLocal() 49 // The error has already been handled and logged by InitEtcdLocal. We just use it to determine the exit code 50 if err != nil { 51 os.Exit(-1) 52 } 53 }, 54 } 55 rootCmd.Flags().String("etcd-data-dir", "/var/run/etcd", "Etcd data directory. Should have read/write permissions here.") 56 rootCmd.Flags().String("etcd-initial-cluster-token", "clustermesh-apiserver", "Etcd initial cluster token. Used to prevent accidentally joining other etcd clusters that are reachable on the same L2 network domain.") 57 rootCmd.Flags().String("etcd-cluster-name", "clustermesh-apiserver", "Name of the etcd cluster. Must match what etcd is later started with.") 58 rootCmd.Flags().String("cluster-name", defaults.ClusterName, "Name of the Cilium cluster, used to set the username of the admin user in etcd. This is distinct from the etcd cluster's name.") 59 rootCmd.Flags().Duration("timeout", time.Minute*2, "How long to wait for operations before exiting.") 60 rootCmd.Flags().Bool("debug", false, "Debug log output.") 61 // Use Viper for configuration so that we can parse both command line flags and environment variables 62 vp.BindPFlags(rootCmd.Flags()) 63 vp.SetEnvPrefix("cilium") 64 vp.AutomaticEnv() 65 vp.SetEnvKeyReplacer(strings.NewReplacer("-", "_")) 66 return rootCmd 67 } 68 69 func InitEtcdLocal() (returnErr error) { 70 // Get configuration values 71 etcdDataDir := vp.GetString("etcd-data-dir") 72 etcdInitialClusterToken := vp.GetString("etcd-initial-cluster-token") 73 etcdClusterName := vp.GetString("etcd-cluster-name") 74 ciliumClusterName := vp.GetString("cluster-name") 75 debug := vp.GetBool("debug") 76 timeout := vp.GetDuration("timeout") 77 // We have returnErr has a named variable, so we can set it in the deferred cleanup function if needed 78 log.WithFields(logrus.Fields{ 79 "timeout": timeout, 80 "etcdDataDir": etcdDataDir, 81 "etcdClusterName": etcdClusterName, 82 logfields.ClusterName: ciliumClusterName, 83 "etcdInitialClusterToken": etcdInitialClusterToken, 84 }). 85 Info("Starting first-time initialisation of etcd for Cilium Clustermesh") 86 87 ctx, cancelFn := context.WithTimeout(context.Background(), timeout) 88 defer cancelFn() 89 90 if debug { 91 logging.SetLogLevelToDebug() 92 } 93 log.Debug("Debug logging enabled") 94 95 // When the clustermesh-apiserver is launched we create a new etcd. We don't support persistence, so it is safe to 96 // delete the contents of the data directory before we start. It should be empty as we use a Kubernetes emptyDir for 97 // this purpose, but if the initialization failed Kubernetes may re-run this operation and emptyDir is tied to the 98 // lifecycle of the whole pod. Therefore, it could contain files from a previously failed initialization attempt. 99 log.WithField("etcdDataDir", etcdDataDir). 100 Info("Deleting contents of data directory") 101 // We don't use os.RemoveAll on the etcdDataDirectory because we don't want to remove the directory itself, just 102 // everything inside of it. In most cases that directory will be a mount anyway. 103 dir, err := os.ReadDir(etcdDataDir) 104 if err != nil { 105 log.WithField("etcdDataDir", etcdDataDir). 106 WithError(err). 107 Error("Failed to read from the etcd data directory while attempting to delete existing files") 108 return err 109 } 110 for _, d := range dir { 111 log.WithField("etcdDataDir", etcdDataDir). 112 WithField("path", d.Name()). 113 Debug("Removing file/directory in data dir") 114 err = os.RemoveAll(path.Join(etcdDataDir, d.Name())) 115 if err != nil { 116 log.WithField("etcdDataDir", etcdDataDir). 117 WithField("path", d.Name()). 118 WithError(err). 119 Error("Failed to remove pre-existing file/directory in etcd data directory") 120 return err 121 } 122 } 123 124 // Use "localhost" (instead of "http://127.0.0.1:2379" or "http://[::1]:2379") so it works in both the IPv4 and 125 // IPv6 cases. 126 loopbackEndpoint := "http://localhost:2379" 127 log.WithFields(logrus.Fields{ 128 "etcdDataDir": etcdDataDir, 129 "etcdListenClientUrl": loopbackEndpoint, 130 "etcdClusterName": etcdClusterName, 131 "etcdInitialClusterToken": etcdInitialClusterToken, 132 }). 133 Info("Starting localhost-only etcd process") 134 // Specify the full path to the etcd binary to avoid any PATH search binary replacement nonsense 135 etcdCmd := exec.CommandContext(ctx, EtcdBinaryLocation, 136 fmt.Sprintf("--data-dir=%s", etcdDataDir), 137 fmt.Sprintf("--name=%s", etcdClusterName), 138 fmt.Sprintf("--listen-client-urls=%s", loopbackEndpoint), 139 fmt.Sprintf("--advertise-client-urls=%s", loopbackEndpoint), 140 fmt.Sprintf("--initial-cluster-token=%s", etcdInitialClusterToken), 141 "--initial-cluster-state=new") 142 log.WithField("etcdBinary", EtcdBinaryLocation). 143 WithField("etcdFlags", etcdCmd.Args). 144 Debug("Executing etcd") 145 146 // Exec the etcd binary, which ultimately calls fork(2) under the hood. We don't wait on its completion, because 147 // it'll never complete of course. 148 err = etcdCmd.Start() 149 if err != nil { 150 log.WithField("etcdBinary", EtcdBinaryLocation). 151 WithField("etcdFlags", etcdCmd.Args). 152 WithError(err). 153 Error("Failed to launch etcd process") 154 return err 155 } 156 etcdPid := etcdCmd.Process.Pid 157 log.WithField("etcdPID", etcdPid). 158 Info("Local etcd server process started") 159 160 // Defer etcd process cleanup 161 defer func() { 162 log := log.WithField("etcdPID", etcdPid) 163 log.Debug("Cleaning up etcd process") 164 // Send the process a SIGTERM. SIGTERM is the "gentle" shutdown signal, and etcd should close down its resources 165 // cleanly and then exit. 166 log.Info("Sending SIGTERM signal to etcd process") 167 err := etcdCmd.Process.Signal(syscall.SIGTERM) 168 if err != nil { 169 log.WithError(err). 170 Error("Failed to send SIGTERM signal to etcd process") 171 // Return both this error, and the main function's return error (if there is one). 172 returnErr = errors.Join(returnErr, err) 173 return 174 } 175 176 // Wait for the etcd process to finish, and cleanup resources. 177 log.Info("Waiting for etcd process to exit") 178 err = etcdCmd.Wait() 179 if err != nil { 180 exitError := &exec.ExitError{} 181 if errors.As(err, &exitError) { 182 if exitError.ExitCode() == -1 { 183 // We SIGTERMed the etcd process, so a nonzero exit code is expected. 184 // Check the context as a last sanity check 185 if ctx.Err() != nil { 186 // Don't log the error itself here, if the context is timed out it'll be cancelled, so the error 187 // will just say "context cancelled" and not be useful — and possibly even misleading. It's 188 // possible that the timeout expires at the moment between etcd exiting normally and this check, 189 // which would report a false error. That's very unlikely, so we don't worry about it here. 190 log.WithField("timeout", timeout). 191 Error("etcd exited, but our context has expired. etcd may have been terminated due to timeout. Consider increasing the value of the timeout using the --timeout flag or CILIUM_TIMEOUT environment variable.") 192 // Return both this error, and the main function's return error (if there is one). This is just 193 // to make sure that the calling code correctly detects that an error occurs. 194 returnErr = errors.Join(returnErr, ctx.Err()) 195 return 196 } 197 // This is the "good state", the context hasn't expired, the etcd process has exited, and we're 198 // okay with a nonzero exit code because we exited it with a SIGTERM. 199 log.Info("etcd process exited") 200 return 201 } 202 log.WithError(err). 203 WithField("etcdExitCode", exitError.ExitCode()). 204 Error("etcd process exited improperly") 205 // Return both this error, and the main function's return error (if there is one). 206 returnErr = errors.Join(returnErr, err) 207 return 208 } else { 209 // Some other kind of error 210 log.WithError(err). 211 Error("Failed to wait on etcd process finishing") 212 // Return both this error, and the main function's return error (if there is one). 213 returnErr = errors.Join(returnErr, err) 214 return 215 } 216 } 217 log.Info("etcd process exited") 218 }() 219 220 // With the etcd server process launched, we need to construct an etcd client 221 config := clientv3.Config{ 222 Context: ctx, 223 Endpoints: []string{loopbackEndpoint}, 224 } 225 log.WithField("etcdClientConfig", fmt.Sprintf("%+v", config)). 226 Debug("Constructed etcd client config") 227 etcdClient, err := clientv3.New(config) 228 if err != nil { 229 log.WithField("etcdClientConfig", fmt.Sprintf("%+v", config)). 230 WithError(err). 231 Error("Failed to construct etcd client from configuration") 232 return err 233 } 234 defer etcdClient.Close() 235 236 // Run the init commands 237 log.WithField(logfields.ClusterName, ciliumClusterName). 238 Info("Starting etcd init") 239 err = kvstoreEtcdInit.ClusterMeshEtcdInit(ctx, log, etcdClient, ciliumClusterName) 240 if err != nil { 241 log.WithError(err). 242 WithField(logfields.ClusterName, ciliumClusterName). 243 Error("Failed to initialise etcd") 244 return err 245 } 246 log.Info("Etcd init completed") 247 return nil 248 }