k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/chaos/nodes.go

/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package chaos

import (
	"fmt"
	"math"
	"math/rand"
	"strings"
	"sync"
	"time"

	"k8s.io/perf-tests/clusterloader2/api"
	"k8s.io/perf-tests/clusterloader2/pkg/framework/client"
	"k8s.io/perf-tests/clusterloader2/pkg/util"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	"k8s.io/perf-tests/clusterloader2/pkg/provider"
)

const (
	monitoringNamespace = "monitoring"
	prometheusLabel     = "prometheus=k8s"
)

// NodeKiller is a utility to simulate node failures.
type NodeKiller struct {
	config api.NodeFailureConfig
	client clientset.Interface
	// killedNodes stores names of the nodes that have been killed by NodeKiller.
	killedNodes sets.String
	recorder    *eventRecorder
	ssh         util.SSHExecutor
}

type nodeAction string

const (
	stopServices nodeAction = "stopService"
	rebootNode   nodeAction = "rebootNode"
)

type event struct {
	time     time.Time
	action   nodeAction
	nodeName string
}

type eventRecorder struct {
	events []event
	mux    sync.Mutex
}

func newEventRecorder() *eventRecorder {
	return &eventRecorder{[]event{}, sync.Mutex{}}
}

func (r *eventRecorder) record(a nodeAction, nodeName string) {
	e := event{time.Now(), a, nodeName}
	r.mux.Lock()
	r.events = append(r.events, e)
	r.mux.Unlock()
}

// NewNodeKiller creates a new NodeKiller.
func NewNodeKiller(config api.NodeFailureConfig, client clientset.Interface, killedNodes sets.String, provider provider.Provider) (*NodeKiller, error) {
	// TODO(#1399): node-killing code is provider specific, move it into provider
	if !provider.Features().SupportNodeKiller {
		return nil, fmt.Errorf("provider %q is not supported by NodeKiller", provider)
	}
	sshExecutor := &util.GCloudSSHExecutor{}
	return &NodeKiller{config, client, killedNodes, newEventRecorder(), sshExecutor}, nil
}
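
// Timing note (illustrative values, not defaults): with Interval = 10m and
// JitterFactor = 0.5, wait.Jitter(10m, 0.5) returns a duration drawn
// uniformly from [10m, 15m), so the initial sleep in Run below, and every
// subsequent wait.JitterUntil period, lands somewhere in that window.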

// Run starts NodeKiller until stopCh is closed.
func (k *NodeKiller) Run(stopCh <-chan struct{}, wg *sync.WaitGroup) {
	defer wg.Done()
	// wait.JitterUntil starts work immediately, so wait first.
	sleepInterrupt(wait.Jitter(time.Duration(k.config.Interval), k.config.JitterFactor), stopCh)
	wait.JitterUntil(func() {
		nodes, err := k.pickNodes()
		if err != nil {
			klog.Errorf("%s: Unable to pick nodes to kill: %v", k, err)
			return
		}
		k.kill(nodes, stopCh)
	}, time.Duration(k.config.Interval), k.config.JitterFactor, true, stopCh)
}

func (k *NodeKiller) pickNodes() ([]v1.Node, error) {
	allNodes, err := util.GetSchedulableUntainedNodes(k.client)
	if err != nil {
		return nil, err
	}

	prometheusPods, err := client.ListPodsWithOptions(k.client, monitoringNamespace, metav1.ListOptions{
		LabelSelector: prometheusLabel,
	})
	if err != nil {
		return nil, err
	}
	nodesHasPrometheusPod := sets.NewString()
	for i := range prometheusPods {
		if prometheusPods[i].Spec.NodeName != "" {
			nodesHasPrometheusPod.Insert(prometheusPods[i].Spec.NodeName)
			klog.V(2).Infof("%s: Node %s excluded from killing; it runs pod %s", k, prometheusPods[i].Spec.NodeName, prometheusPods[i].Name)
		}
	}

	// Filter in place, reusing allNodes' backing array: keep only nodes that
	// neither run a Prometheus pod nor have already been killed.
	nodes := allNodes[:0]
	for _, node := range allNodes {
		if !nodesHasPrometheusPod.Has(node.Name) && !k.killedNodes.Has(node.Name) {
			nodes = append(nodes, node)
		}
	}
	rand.Shuffle(len(nodes), func(i, j int) {
		nodes[i], nodes[j] = nodes[j], nodes[i]
	})
	numNodes := int(math.Ceil(k.config.FailureRate * float64(len(nodes))))
	klog.V(2).Infof("%s: %d nodes available, failing %d nodes", k, len(nodes), numNodes)
	if len(nodes) > numNodes {
		nodes = nodes[:numNodes]
	}
	for _, node := range nodes {
		klog.V(2).Infof("%s: Node %q scheduled for failure", k, node.Name)
	}
	return nodes, nil
}

func (k *NodeKiller) kill(nodes []v1.Node, stopCh <-chan struct{}) {
	wg := sync.WaitGroup{}
	wg.Add(len(nodes))
	for _, node := range nodes {
		k.killedNodes.Insert(node.Name)
		node := node
		go func() {
			defer wg.Done()

			klog.V(2).Infof("%s: Stopping docker and kubelet on %q to simulate failure", k, node.Name)
			k.addStopServicesEvent(node.Name)
			err := k.ssh.Exec("sudo systemctl stop docker kubelet", &node, nil)
			if err != nil {
				klog.Errorf("%s: Error while stopping node %q: %v", k, node.Name, err)
				return
			}

			// Wait for the simulated downtime, or return early if stopCh closes.
			sleepInterrupt(time.Duration(k.config.SimulatedDowntime), stopCh)

			klog.V(2).Infof("%s: Rebooting %q to repair the node", k, node.Name)
			k.addRebootEvent(node.Name)
			err = k.ssh.Exec("sudo reboot", &node, nil)
			if err != nil {
				klog.Errorf("%s: Error while rebooting node %q: %v", k, node.Name, err)
				return
			}
		}()
	}
	wg.Wait()
}

func (k *NodeKiller) addStopServicesEvent(nodeName string) {
	k.recorder.record(stopServices, nodeName)
}

func (k *NodeKiller) addRebootEvent(nodeName string) {
	k.recorder.record(rebootNode, nodeName)
}

// Summary returns a human-readable summary of the node failure events
// recorded by NodeKiller.
func (k *NodeKiller) Summary() string {
	var sb strings.Builder
	sb.WriteString(fmt.Sprintf("%s: Recorded the following events\n", k))
	for _, e := range k.recorder.events {
		sb.WriteString(fmt.Sprintf("%s: At %v %v happened for node %s\n", k, e.time.Format(time.UnixDate), e.action, e.nodeName))
	}
	return sb.String()
}

func (k *NodeKiller) String() string {
	return "NodeKiller"
}
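
// Illustrative Summary output for a single failure of a hypothetical node
// "test-node-1" (timestamps are time.UnixDate-formatted):
//
//	NodeKiller: Recorded the following events
//	NodeKiller: At Mon Jan  2 15:04:05 UTC 2006 stopService happened for node test-node-1
//	NodeKiller: At Mon Jan  2 15:09:05 UTC 2006 rebootNode happened for node test-node-1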

// sleepInterrupt puts the current goroutine to sleep for the given duration,
// returning early if stopCh closes.
// Note of warning: if stopCh is already closed, the caller may not sleep at all.
func sleepInterrupt(duration time.Duration, stopCh <-chan struct{}) {
	select {
	case <-stopCh:
	case <-time.After(duration):
	}
}
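
// Usage sketch (illustrative; not part of this file). Assumes a configured
// clientset, a provider whose Features().SupportNodeKiller is true, and a
// populated api.NodeFailureConfig; the variable names are hypothetical.
//
//	killer, err := chaos.NewNodeKiller(cfg, client, sets.NewString(), prov)
//	if err != nil {
//		klog.Fatalf("NodeKiller setup failed: %v", err)
//	}
//	stopCh := make(chan struct{})
//	var wg sync.WaitGroup
//	wg.Add(1) // Run calls wg.Done when it returns.
//	go killer.Run(stopCh, &wg)
//
//	// ... execute the measured test scenario ...
//
//	close(stopCh) // Interrupts any in-progress sleep and stops the kill loop.
//	wg.Wait()
//	klog.V(2).Info(killer.Summary())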