k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/chaos/nodes.go

/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package chaos

import (
	"fmt"
	"math"
	"math/rand"
	"strings"
	"sync"
	"time"

	"k8s.io/perf-tests/clusterloader2/api"
	"k8s.io/perf-tests/clusterloader2/pkg/framework/client"
	"k8s.io/perf-tests/clusterloader2/pkg/util"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	"k8s.io/perf-tests/clusterloader2/pkg/provider"
)

const (
	monitoringNamespace = "monitoring"
	prometheusLabel     = "prometheus=k8s"
)

// NodeKiller is a utility to simulate node failures.
type NodeKiller struct {
	config api.NodeFailureConfig
	client clientset.Interface
	// killedNodes stores names of the nodes that have been killed by NodeKiller.
	killedNodes sets.String
	recorder    *eventRecorder
	ssh         util.SSHExecutor
}

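// nodeAction identifies the kind of failure action performed on a node.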
type nodeAction string

const (
	stopServices nodeAction = "stopService"
	rebootNode   nodeAction = "rebootNode"
)

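// event records a single node action and the time at which it was taken.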
type event struct {
	time     time.Time
	action   nodeAction
	nodeName string
}

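// eventRecorder collects events from concurrent node-killing goroutines.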
type eventRecorder struct {
	events []event
	mux    sync.Mutex
}

func newEventRecorder() *eventRecorder {
	return &eventRecorder{[]event{}, sync.Mutex{}}
}

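// record appends an event; the mutex makes it safe for concurrent use.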
func (r *eventRecorder) record(a nodeAction, nodeName string) {
	e := event{time.Now(), a, nodeName}
	r.mux.Lock()
	r.events = append(r.events, e)
	r.mux.Unlock()
}

// NewNodeKiller creates a new NodeKiller.
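//
// A minimal wiring sketch (variable names here are hypothetical; it assumes the
// chosen provider reports SupportNodeKiller in its feature set):
//
//	killer, err := NewNodeKiller(failureConfig, c, sets.NewString(), prov)
//	if err != nil {
//		return err
//	}
//	stopCh := make(chan struct{})
//	var wg sync.WaitGroup
//	wg.Add(1)
//	go killer.Run(stopCh, &wg)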
func NewNodeKiller(config api.NodeFailureConfig, client clientset.Interface, killedNodes sets.String, provider provider.Provider) (*NodeKiller, error) {
	// TODO(#1399): node-killing code is provider specific, move it into provider
	if !provider.Features().SupportNodeKiller {
		return nil, fmt.Errorf("provider %q is not supported by NodeKiller", provider)
	}
	sshExecutor := &util.GCloudSSHExecutor{}
	return &NodeKiller{config, client, killedNodes, newEventRecorder(), sshExecutor}, nil
}

// Run starts NodeKiller until stopCh is closed.
func (k *NodeKiller) Run(stopCh <-chan struct{}, wg *sync.WaitGroup) {
	defer wg.Done()
	// wait.JitterUntil starts work immediately, so wait first.
	sleepInterrupt(wait.Jitter(time.Duration(k.config.Interval), k.config.JitterFactor), stopCh)
	wait.JitterUntil(func() {
		nodes, err := k.pickNodes()
		if err != nil {
			klog.Errorf("%s: Unable to pick nodes to kill: %v", k, err)
			return
		}
		k.kill(nodes, stopCh)
	}, time.Duration(k.config.Interval), k.config.JitterFactor, true, stopCh)
}

func (k *NodeKiller) pickNodes() ([]v1.Node, error) {
	allNodes, err := util.GetSchedulableUntainedNodes(k.client)
	if err != nil {
		return nil, err
	}

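	// List Prometheus pods so that the nodes running them can be excluded from killing.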
	prometheusPods, err := client.ListPodsWithOptions(k.client, monitoringNamespace, metav1.ListOptions{
		LabelSelector: prometheusLabel,
	})
	if err != nil {
		return nil, err
	}
	nodesHasPrometheusPod := sets.NewString()
	for i := range prometheusPods {
		if prometheusPods[i].Spec.NodeName != "" {
			nodesHasPrometheusPod.Insert(prometheusPods[i].Spec.NodeName)
			klog.V(2).Infof("%s: Node %s excluded from killing; it runs pod %s", k, prometheusPods[i].Spec.NodeName, prometheusPods[i].Name)
		}
	}

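	// Filter in place, reusing allNodes' backing array: keep only nodes that neither
	// run a Prometheus pod nor have already been killed.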
	nodes := allNodes[:0]
	for _, node := range allNodes {
		if !nodesHasPrometheusPod.Has(node.Name) && !k.killedNodes.Has(node.Name) {
			nodes = append(nodes, node)
		}
	}
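	// Shuffle the eligible nodes and fail a FailureRate fraction of them, rounded up.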
	rand.Shuffle(len(nodes), func(i, j int) {
		nodes[i], nodes[j] = nodes[j], nodes[i]
	})
	numNodes := int(math.Ceil(k.config.FailureRate * float64(len(nodes))))
	klog.V(2).Infof("%s: %d nodes available, wants to fail %d nodes", k, len(nodes), numNodes)
	if len(nodes) > numNodes {
		nodes = nodes[:numNodes]
	}
	for _, node := range nodes {
		klog.V(2).Infof("%s: Node %q scheduled for failure", k, node.Name)
	}
	return nodes, nil
}

func (k *NodeKiller) kill(nodes []v1.Node, stopCh <-chan struct{}) {
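	// Each node is handled in its own goroutine: stop kubelet and docker, wait out
	// the simulated downtime (or an interruption on stopCh), then reboot the node.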
	wg := sync.WaitGroup{}
	wg.Add(len(nodes))
	for _, node := range nodes {
		k.killedNodes.Insert(node.Name)
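		// Capture the loop variable so each goroutine acts on its own copy of the node.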
		node := node
		go func() {
			defer wg.Done()

			klog.V(2).Infof("%s: Stopping docker and kubelet on %q to simulate failure", k, node.Name)
			k.addStopServicesEvent(node.Name)
			err := k.ssh.Exec("sudo systemctl stop docker kubelet", &node, nil)
			if err != nil {
				klog.Errorf("%s: Error while stopping services on node %q: %v", k, node.Name, err)
				return
			}

			// Listen for an interruption on stopCh or wait out the simulated downtime.
			sleepInterrupt(time.Duration(k.config.SimulatedDowntime), stopCh)

			klog.V(2).Infof("%s: Rebooting %q to repair the node", k, node.Name)
			k.addRebootEvent(node.Name)
			err = k.ssh.Exec("sudo reboot", &node, nil)
			if err != nil {
				klog.Errorf("%s: Error while rebooting node %q: %v", k, node.Name, err)
				return
			}
		}()
	}
	wg.Wait()
}

func (k *NodeKiller) addStopServicesEvent(nodeName string) {
	k.recorder.record(stopServices, nodeName)
}

func (k *NodeKiller) addRebootEvent(nodeName string) {
	k.recorder.record(rebootNode, nodeName)
}

// Summary returns a human-readable summary of the events recorded by NodeKiller.
func (k *NodeKiller) Summary() string {
	var sb strings.Builder
	sb.WriteString(fmt.Sprintf("%s: Recorded the following events\n", k))
	for _, e := range k.recorder.events {
		sb.WriteString(fmt.Sprintf("%s: At %v %v happened on node %s\n", k, e.time.Format(time.UnixDate), e.action, e.nodeName))
	}
	return sb.String()
}

func (k *NodeKiller) String() string {
	return "NodeKiller"
}

// sleepInterrupt puts the current goroutine to sleep for the given duration,
// or returns early if stopCh closes.
// Note: if stopCh is already closed, the function may not sleep at all.
func sleepInterrupt(duration time.Duration, stopCh <-chan struct{}) {
	select {
	case <-stopCh:
	case <-time.After(duration):
	}
}