k8s.io/kubernetes@v1.29.3/test/e2e_node/services/server.go (about)

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package services
    18  
    19  import (
    20  	"flag"
    21  	"fmt"
    22  	"net/http"
    23  	"os"
    24  	"os/exec"
    25  	"path"
    26  	"reflect"
    27  	"strconv"
    28  	"strings"
    29  	"syscall"
    30  	"time"
    31  
    32  	"k8s.io/klog/v2"
    33  
    34  	"k8s.io/kubernetes/test/e2e/framework"
    35  )
    36  
// serverStartTimeout holds the value of the -server-start-timeout flag: the
// time to wait for each server to become healthy. It defaults to 120 seconds.
var serverStartTimeout = flag.Duration("server-start-timeout", time.Second*120, "Time to wait for each server to become healthy.")
    38  
// A server manages a separate server process started and killed with
// commands.
//
// newServer fills in every field except the two restart-loop channels; those
// are created by start() when restartOnExit is true and are nil otherwise.
type server struct {
	// name is the name of the server; it is only used for logging and in
	// error messages.
	name string
	// startCommand is the command used to start the server.
	startCommand *exec.Cmd
	// killCommand is the command used to stop the server. It is not required. If it
	// is not specified, `kill` will be used to stop the server.
	killCommand *exec.Cmd
	// restartCommand is the command used to restart the server. If provided, it will be used
	// instead of startCommand when restarting the server.
	restartCommand *exec.Cmd
	// healthCheckUrls are the URLs used to check whether the server is ready.
	healthCheckUrls []string
	// outFilename is the name of the log file. The stdout and stderr of the server
	// will be redirected to this file.
	outFilename string
	// monitorParent determines whether the server should watch its parent process and exit
	// if its parent is gone.
	monitorParent bool
	// restartOnExit determines whether a restart loop is launched with the server.
	restartOnExit bool
	// stopRestartingCh stops the restart loop when written to, if it is not nil.
	// When tearing down a server, you should check for this channel and write to it if it exists.
	stopRestartingCh chan<- bool
	// ackStopRestartingCh is read from to confirm that the restart loop has stopped.
	ackStopRestartingCh <-chan bool
	// systemdUnitName is the systemd unit name for the service if it exists.
	// If the server is not managed by systemd, the field is empty.
	systemdUnitName string
}
    70  
    71  // newServer returns a new server with the given name, commands, health check
    72  // URLs, etc.
    73  func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outputFileName string, monitorParent, restartOnExit bool, systemdUnitName string) *server {
    74  	return &server{
    75  		name:            name,
    76  		startCommand:    start,
    77  		killCommand:     kill,
    78  		restartCommand:  restart,
    79  		healthCheckUrls: urls,
    80  		outFilename:     outputFileName,
    81  		monitorParent:   monitorParent,
    82  		restartOnExit:   restartOnExit,
    83  		systemdUnitName: systemdUnitName,
    84  	}
    85  }
    86  
    87  // commandToString format command to string.
    88  func commandToString(c *exec.Cmd) string {
    89  	if c == nil {
    90  		return ""
    91  	}
    92  	return strings.Join(append([]string{c.Path}, c.Args[1:]...), " ")
    93  }
    94  
    95  func (s *server) String() string {
    96  	return fmt.Sprintf("server %q start-command: `%s`, kill-command: `%s`, restart-command: `%s`, health-check: %v, output-file: %q", s.name,
    97  		commandToString(s.startCommand), commandToString(s.killCommand), commandToString(s.restartCommand), s.healthCheckUrls, s.outFilename)
    98  }
    99  
   100  // start starts the server by running its commands, monitors it with a health
   101  // check, and ensures that it is restarted if applicable.
   102  //
   103  // Note: restartOnExit == true requires len(s.healthCheckUrls) > 0 to work properly.
   104  func (s *server) start() error {
   105  	klog.Infof("Starting server %q with command %q", s.name, commandToString(s.startCommand))
   106  	errCh := make(chan error)
   107  
   108  	// Set up restart channels if the server is configured for restart on exit.
   109  	var stopRestartingCh, ackStopRestartingCh chan bool
   110  	if s.restartOnExit {
   111  		if len(s.healthCheckUrls) == 0 {
   112  			return fmt.Errorf("tried to start %s which has s.restartOnExit == true, but no health check urls provided", s)
   113  		}
   114  
   115  		stopRestartingCh = make(chan bool)
   116  		ackStopRestartingCh = make(chan bool)
   117  
   118  		s.stopRestartingCh = stopRestartingCh
   119  		s.ackStopRestartingCh = ackStopRestartingCh
   120  	}
   121  
   122  	// This goroutine actually runs the start command for the server.
   123  	go func() {
   124  		defer close(errCh)
   125  
   126  		// Create the output filename
   127  		outPath := path.Join(framework.TestContext.ReportDir, s.outFilename)
   128  		outfile, err := os.Create(outPath)
   129  		if err != nil {
   130  			errCh <- fmt.Errorf("failed to create file %q for `%s` %v", outPath, s, err)
   131  			return
   132  		}
   133  		klog.Infof("Output file for server %q: %v", s.name, outfile.Name())
   134  		defer outfile.Close()
   135  		defer outfile.Sync()
   136  
   137  		// Set the command to write the output file
   138  		s.startCommand.Stdout = outfile
   139  		s.startCommand.Stderr = outfile
   140  
   141  		// If monitorParent is set, set Pdeathsig when starting the server.
   142  		if s.monitorParent {
   143  			// Death of this test process should kill the server as well.
   144  			attrs := &syscall.SysProcAttr{}
   145  			// Hack to set linux-only field without build tags.
   146  			deathSigField := reflect.ValueOf(attrs).Elem().FieldByName("Pdeathsig")
   147  			if deathSigField.IsValid() {
   148  				deathSigField.Set(reflect.ValueOf(syscall.SIGTERM))
   149  			} else {
   150  				errCh <- fmt.Errorf("failed to set Pdeathsig field (non-linux build)")
   151  				return
   152  			}
   153  			s.startCommand.SysProcAttr = attrs
   154  		}
   155  
   156  		// Start the command
   157  		err = s.startCommand.Start()
   158  		if err != nil {
   159  			errCh <- fmt.Errorf("failed to run %s: %w", s, err)
   160  			return
   161  		}
   162  		if !s.restartOnExit {
   163  			klog.Infof("Waiting for server %q start command to complete", s.name)
   164  			// If we aren't planning on restarting, ok to Wait() here to release resources.
   165  			// Otherwise, we Wait() in the restart loop.
   166  			err = s.startCommand.Wait()
   167  			if err != nil {
   168  				errCh <- fmt.Errorf("failed to run start command for server %q: %w", s.name, err)
   169  				return
   170  			}
   171  		} else {
   172  			usedStartCmd := true
   173  			for {
   174  				klog.Infof("Running health check for service %q", s.name)
   175  				// Wait for an initial health check to pass, so that we are sure the server started.
   176  				err := readinessCheck(s.name, s.healthCheckUrls, nil)
   177  				if err != nil {
   178  					if usedStartCmd {
   179  						klog.Infof("Waiting for server %q start command to complete after initial health check failed", s.name)
   180  						s.startCommand.Wait() // Release resources if necessary.
   181  					}
   182  					// This should not happen, immediately stop the e2eService process.
   183  					klog.Fatalf("Restart loop readinessCheck failed for %q", s.name)
   184  				} else {
   185  					klog.Infof("Initial health check passed for service %q", s.name)
   186  				}
   187  
   188  				// Initial health check passed, wait until a health check fails again.
   189  			stillAlive:
   190  				for {
   191  					select {
   192  					case <-stopRestartingCh:
   193  						ackStopRestartingCh <- true
   194  						return
   195  					case <-time.After(time.Second):
   196  						for _, url := range s.healthCheckUrls {
   197  							resp, err := http.Head(url)
   198  							if err != nil || resp.StatusCode != http.StatusOK {
   199  								break stillAlive
   200  							}
   201  						}
   202  					}
   203  				}
   204  
   205  				if usedStartCmd {
   206  					s.startCommand.Wait() // Release resources from last cmd
   207  					usedStartCmd = false
   208  				}
   209  				if s.restartCommand != nil {
   210  					// Always make a fresh copy of restartCommand before
   211  					// running, we may have to restart multiple times
   212  					s.restartCommand = &exec.Cmd{
   213  						Path:        s.restartCommand.Path,
   214  						Args:        s.restartCommand.Args,
   215  						Env:         s.restartCommand.Env,
   216  						Dir:         s.restartCommand.Dir,
   217  						Stdin:       s.restartCommand.Stdin,
   218  						Stdout:      s.restartCommand.Stdout,
   219  						Stderr:      s.restartCommand.Stderr,
   220  						ExtraFiles:  s.restartCommand.ExtraFiles,
   221  						SysProcAttr: s.restartCommand.SysProcAttr,
   222  					}
   223  					// Run and wait for exit. This command is assumed to have
   224  					// short duration, e.g. systemctl restart
   225  					klog.Infof("Restarting server %q with restart command", s.name)
   226  					err = s.restartCommand.Run()
   227  					if err != nil {
   228  						// This should not happen, immediately stop the e2eService process.
   229  						klog.Fatalf("Restarting server %s with restartCommand failed. Error: %v.", s, err)
   230  					}
   231  				} else {
   232  					s.startCommand = &exec.Cmd{
   233  						Path:        s.startCommand.Path,
   234  						Args:        s.startCommand.Args,
   235  						Env:         s.startCommand.Env,
   236  						Dir:         s.startCommand.Dir,
   237  						Stdin:       s.startCommand.Stdin,
   238  						Stdout:      s.startCommand.Stdout,
   239  						Stderr:      s.startCommand.Stderr,
   240  						ExtraFiles:  s.startCommand.ExtraFiles,
   241  						SysProcAttr: s.startCommand.SysProcAttr,
   242  					}
   243  					klog.Infof("Restarting server %q with start command", s.name)
   244  					err = s.startCommand.Start()
   245  					usedStartCmd = true
   246  					if err != nil {
   247  						// This should not happen, immediately stop the e2eService process.
   248  						klog.Fatalf("Restarting server %s with startCommand failed. Error: %v.", s, err)
   249  					}
   250  				}
   251  			}
   252  		}
   253  	}()
   254  
   255  	return readinessCheck(s.name, s.healthCheckUrls, errCh)
   256  }
   257  
   258  // kill runs the server's kill command.
   259  func (s *server) kill() error {
   260  	klog.Infof("Kill server %q", s.name)
   261  	name := s.name
   262  	cmd := s.startCommand
   263  
   264  	// If s has a restart loop, turn it off.
   265  	if s.restartOnExit {
   266  		s.stopRestartingCh <- true
   267  		<-s.ackStopRestartingCh
   268  	}
   269  
   270  	if s.killCommand != nil {
   271  		return s.killCommand.Run()
   272  	}
   273  
   274  	if cmd == nil {
   275  		return fmt.Errorf("could not kill %q because both `killCommand` and `startCommand` are nil", name)
   276  	}
   277  
   278  	if cmd.Process == nil {
   279  		klog.V(2).Infof("%q not running", name)
   280  		return nil
   281  	}
   282  	pid := cmd.Process.Pid
   283  	if pid <= 1 {
   284  		return fmt.Errorf("invalid PID %d for %q", pid, name)
   285  	}
   286  
   287  	// Attempt to shut down the process in a friendly manner before forcing it.
   288  	waitChan := make(chan error)
   289  	go func() {
   290  		_, err := cmd.Process.Wait()
   291  		waitChan <- err
   292  		close(waitChan)
   293  	}()
   294  
   295  	const timeout = 10 * time.Second
   296  	for _, signal := range []string{"-TERM", "-KILL"} {
   297  		klog.V(2).Infof("Killing process %d (%s) with %s", pid, name, signal)
   298  		cmd := exec.Command("kill", signal, strconv.Itoa(pid))
   299  		_, err := cmd.Output()
   300  		if err != nil {
   301  			klog.Errorf("Error signaling process %d (%s) with %s: %v", pid, name, signal, err)
   302  			continue
   303  		}
   304  
   305  		select {
   306  		case err := <-waitChan:
   307  			if err != nil {
   308  				return fmt.Errorf("error stopping %q: %w", name, err)
   309  			}
   310  			// Success!
   311  			return nil
   312  		case <-time.After(timeout):
   313  			// Continue.
   314  		}
   315  	}
   316  
   317  	return fmt.Errorf("unable to stop %q", name)
   318  }
   319  
   320  func (s *server) stopUnit() error {
   321  	klog.Infof("Stopping systemd unit for server %q with unit name: %q", s.name, s.systemdUnitName)
   322  	if s.systemdUnitName != "" {
   323  		err := exec.Command("sudo", "systemctl", "stop", s.systemdUnitName).Run()
   324  		if err != nil {
   325  			return fmt.Errorf("Failed to stop systemd unit name: %q: %w", s.systemdUnitName, err)
   326  		}
   327  	}
   328  	return nil
   329  }