github.com/IBM-Blockchain/fabric-operator@v1.0.4/pkg/restart/staggerrestarts/staggerrestarts.go (about) 1 /* 2 * Copyright contributors to the Hyperledger Fabric Operator project 3 * 4 * SPDX-License-Identifier: Apache-2.0 5 * 6 * Licensed under the Apache License, Version 2.0 (the "License"); 7 * you may not use this file except in compliance with the License. 8 * You may obtain a copy of the License at: 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 19 package staggerrestarts 20 21 import ( 22 "context" 23 "crypto/rand" 24 "fmt" 25 "math/big" 26 "strings" 27 "time" 28 29 current "github.com/IBM-Blockchain/fabric-operator/api/v1beta1" 30 "github.com/IBM-Blockchain/fabric-operator/pkg/action" 31 k8sclient "github.com/IBM-Blockchain/fabric-operator/pkg/k8s/controllerclient" 32 "github.com/IBM-Blockchain/fabric-operator/pkg/restart/configmap" 33 "github.com/pkg/errors" 34 35 corev1 "k8s.io/api/core/v1" 36 v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 37 "k8s.io/apimachinery/pkg/labels" 38 "k8s.io/apimachinery/pkg/util/wait" 39 "sigs.k8s.io/controller-runtime/pkg/client" 40 logf "sigs.k8s.io/controller-runtime/pkg/log" 41 ) 42 43 var log = logf.Log.WithName("stagger_restart_service") 44 45 type Instance interface { 46 v1.Object 47 GetMSPID() string 48 } 49 50 type StaggerRestartsService struct { 51 Client k8sclient.Client 52 ConfigMapManager *configmap.Manager 53 Timeout time.Duration 54 } 55 56 func New(client k8sclient.Client, timeout time.Duration) *StaggerRestartsService { 57 return &StaggerRestartsService{ 58 Client: client, 59 Timeout: timeout, 60 ConfigMapManager: configmap.NewManager(client), 61 } 62 } 63 64 // Restart is called by the restart manager. 65 // For CA/Peer/Orderer: adds component to the queue for restart. 66 // For Console: restarts the component directly as there is only one ibpconsole 67 // instance per network. We bypass the queue logic for ibpconsoles. 68 func (s *StaggerRestartsService) Restart(instance Instance, reason string) error { 69 switch instance.(type) { 70 case *current.IBPConsole: 71 if err := s.RestartImmediately("console", instance, reason); err != nil { 72 return errors.Wrapf(err, "failed to restart %s", instance.GetName()) 73 } 74 default: 75 if err := s.AddToQueue(instance, reason); err != nil { 76 return errors.Wrapf(err, "failed to add restart request to queue for %s", instance.GetName()) 77 } 78 } 79 80 return nil 81 } 82 83 // AddToQueue is called by the restart manager and handles adding the 84 // restart request to the queue associated with the instance's MSPID 85 // in the <ca/peer/orderer>-restart-config CM. 86 func (s *StaggerRestartsService) AddToQueue(instance Instance, reason string) error { 87 var componentType string 88 switch instance.(type) { 89 case *current.IBPCA: 90 componentType = "ca" 91 case *current.IBPOrderer: 92 componentType = "orderer" 93 case *current.IBPPeer: 94 componentType = "peer" 95 96 } 97 98 err := wait.Poll(time.Second, 3*time.Second, func() (bool, error) { 99 err := s.addToQueue(componentType, instance, reason) 100 if err != nil { 101 log.Error(err, "failed to add to queue") 102 return false, nil 103 } 104 return true, nil 105 }) 106 107 if err != nil { 108 return errors.Wrapf(err, "failed to add %s to queue", instance.GetName()) 109 } 110 111 return nil 112 } 113 114 func (s *StaggerRestartsService) addToQueue(componentType string, instance Instance, reason string) error { 115 component := &Component{ 116 CRName: instance.GetName(), 117 Reason: reason, 118 Status: Pending, 119 } 120 121 restartConfig, err := s.GetConfig(componentType, instance.GetNamespace()) 122 if err != nil { 123 return err 124 } 125 126 // Add component to queue 127 restartConfig.AddToQueue(instance.GetMSPID(), component) 128 129 err = s.UpdateConfig(componentType, instance.GetNamespace(), restartConfig) 130 if err != nil { 131 return err 132 } 133 134 return nil 135 } 136 137 func (s *StaggerRestartsService) RestartImmediately(componentType string, instance Instance, reason string) error { 138 log.Info(fmt.Sprintf("Restarting %s...", instance.GetName())) 139 err := s.RestartDeployment(instance.GetName(), instance.GetNamespace()) 140 if err != nil { 141 return err 142 } 143 144 component := &Component{ 145 CRName: instance.GetName(), 146 Reason: reason, 147 Status: Restarted, 148 LastCheckedTimestamp: time.Now().UTC().String(), 149 } 150 151 restartConfig, err := s.GetConfig(componentType, instance.GetNamespace()) 152 if err != nil { 153 return err 154 } 155 restartConfig.AddToLog(component) 156 157 err = s.UpdateConfig(componentType, instance.GetNamespace(), restartConfig) 158 if err != nil { 159 return err 160 } 161 162 return nil 163 } 164 165 // Reconcile is called by the ca/peer/orderer reconcile loops via the restart 166 // manager when an update to the <ca/peer/orderer>-restart-config CM is detected 167 // and handles the different states of the first component of each queue. 168 // 169 // Returns true if the controller needs to requeue the request to reconcile the restart manager. 170 func (s *StaggerRestartsService) Reconcile(componentType, namespace string) (bool, error) { 171 requeue := false 172 173 restartConfig, err := s.GetConfig(componentType, namespace) 174 if err != nil { 175 return requeue, err 176 } 177 178 updated := false 179 // Check front component of each queue 180 for mspid, queue := range restartConfig.Queues { 181 if len(queue) == 0 { 182 // queue is empty - do nothing 183 continue 184 } 185 186 component := queue[0] 187 name := component.CRName 188 189 switch component.Status { 190 case Pending: 191 log.Info(fmt.Sprintf("%s in pending status, restarting deployment", component.CRName)) 192 193 // Save pod name 194 pods, err := s.GetRunningPods(name, namespace) 195 if err != nil { 196 return requeue, errors.Wrapf(err, "failed to get running pods for %s", name) 197 } 198 199 if len(pods) > 0 { 200 component.PodName = pods[0].Name 201 } 202 203 // Restart component 204 err = s.RestartDeployment(name, namespace) 205 if err != nil { 206 return requeue, errors.Wrapf(err, "failed to restart deployment %s", name) 207 } 208 209 // Update config 210 component.Status = Waiting 211 component.LastCheckedTimestamp = time.Now().UTC().String() 212 component.CheckUntilTimestamp = time.Now().Add(s.Timeout).UTC().String() 213 214 updated = true 215 216 case Waiting: 217 pods, err := s.GetRunningPods(name, namespace) 218 if err != nil { 219 return requeue, errors.Wrapf(err, "failed to get running pods for %s", name) 220 } 221 222 // Scenario 1: the pod has restarted 223 if len(pods) == 1 { 224 if component.PodName != pods[0].Name { 225 // Pod has restarted as the old pod has disappeared 226 log.Info(fmt.Sprintf("%s in completed status, removing from %s restart queue", component.CRName, mspid)) 227 component.Status = Completed 228 229 restartConfig.AddToLog(component) 230 restartConfig.PopFromQueue(mspid) 231 232 log.Info(fmt.Sprintf("Remaining restart queue(s) to reconcile: %s", queuesToString(restartConfig.Queues))) 233 updated = true 234 235 continue 236 } 237 } 238 239 // Scenario 2: the pod has not restarted and the wait period has timed out 240 checkUntil, err := parseTime(component.CheckUntilTimestamp) 241 if err != nil { 242 return requeue, errors.Wrap(err, "failed to parse checkUntilTimestamp") 243 } 244 if time.Now().UTC().After(checkUntil) { 245 log.Info(fmt.Sprintf("%s in expired status, has not restarted within %s", component.CRName, s.Timeout.String())) 246 // Pod has not restarted within s.timeout, move to log 247 component.Status = Expired 248 249 restartConfig.AddToLog(component) 250 restartConfig.PopFromQueue(mspid) 251 252 log.Info(fmt.Sprintf("Remaining restart queue(s) to reconcile: %s", queuesToString(restartConfig.Queues))) 253 updated = true 254 255 continue 256 } 257 258 // Scenario 3: the pod has not yet restarted but there is still time remaining 259 // to wait for the pod to restart. 260 261 // To prevent the restart manager from overwritting the config map and losing 262 // data, the config map updates that trigger reconciles only occur every 10-30 263 // seconds. If the lastCheckedInterval amount of time has not yet passed since 264 // the lastCheckedTimestamp, then we return true to tell the controllers to 265 // requeue the request to reconcile the restart config map to ensure that 266 // a reconcile will occur again even when the config map is not updated. 267 268 lastCheckedInterval := time.Duration(randomInt(10, 30)) * time.Second 269 lastChecked, err := parseTime(component.LastCheckedTimestamp) 270 if err != nil { 271 return requeue, errors.Wrap(err, "failed to parse lastCheckedTimestamp") 272 } 273 274 if lastChecked.Add(lastCheckedInterval).Before(time.Now()) { 275 component.LastCheckedTimestamp = time.Now().UTC().String() 276 updated = true 277 } else { 278 requeue = true 279 } 280 281 default: 282 // Expired or Completed status - should not reach this case as Waiting case handles moving components to log 283 log.Info(fmt.Sprintf("%s restart status is %s, removing from %s restart queue", component.CRName, component.Status, mspid)) 284 285 restartConfig.AddToLog(component) 286 restartConfig.PopFromQueue(mspid) 287 288 updated = true 289 } 290 } 291 292 if updated { 293 err = s.UpdateConfig(componentType, namespace, restartConfig) 294 if err != nil { 295 return requeue, err 296 } 297 } 298 299 return requeue, nil 300 } 301 302 func (s *StaggerRestartsService) GetConfig(componentType, namespace string) (*RestartConfig, error) { 303 cmName := fmt.Sprintf("%s-restart-config", componentType) 304 305 cfg := &RestartConfig{ 306 Queues: map[string][]*Component{}, 307 } 308 err := s.ConfigMapManager.GetRestartConfigFrom(cmName, namespace, cfg) 309 if err != nil { 310 return nil, err 311 } 312 313 return cfg, nil 314 } 315 316 func (s *StaggerRestartsService) UpdateConfig(componentType, namespace string, cfg *RestartConfig) error { 317 cmName := fmt.Sprintf("%s-restart-config", componentType) 318 return s.ConfigMapManager.UpdateConfig(cmName, namespace, cfg) 319 } 320 321 func (s *StaggerRestartsService) RestartDeployment(name, namespace string) error { 322 log.Info(fmt.Sprintf("Restarting deployment %s", name)) 323 324 err := action.Restart(s.Client, name, namespace) 325 if err != nil { 326 return err 327 } 328 329 return nil 330 } 331 332 func (s *StaggerRestartsService) GetRunningPods(name, namespace string) ([]corev1.Pod, error) { 333 pods := []corev1.Pod{} 334 335 labelSelector, err := labels.Parse(fmt.Sprintf("app=%s", name)) 336 if err != nil { 337 return pods, errors.Wrap(err, "failed to parse label selector for app name") 338 } 339 340 listOptions := &client.ListOptions{ 341 LabelSelector: labelSelector, 342 Namespace: namespace, 343 } 344 345 podList := &corev1.PodList{} 346 err = s.Client.List(context.TODO(), podList, listOptions) 347 if err != nil { 348 log.Error(err, "failed to get pod list for %s", name) 349 // return empty pods list 350 // NOTE: decided not to return error here since this funtion will be called multiple 351 // times throughout the process of old pods terminating and new pods starting up. 352 // We don't want to error out prematurely if this client call isn't able to retrieve 353 // a list of pods during the restart process. 354 return pods, nil 355 } 356 357 for _, pod := range podList.Items { 358 switch pod.Status.Phase { 359 case corev1.PodRunning: 360 containerStatuses := pod.Status.ContainerStatuses 361 362 readyContainers := 0 363 numContainers := len(containerStatuses) 364 365 for _, status := range containerStatuses { 366 // TODO: is it required to check status.Ready? 367 if status.Ready && status.State.Running != nil { 368 readyContainers++ 369 } 370 } 371 if readyContainers == numContainers { 372 pods = append(pods, pod) 373 } 374 } 375 } 376 377 return pods, nil 378 } 379 380 func queuesToString(queues map[string][]*Component) string { 381 lst := []string{} 382 for org, queue := range queues { 383 str := org + ": [ " 384 if org == "" { 385 // This is a ca queue 386 str = "[ " 387 } 388 for _, comp := range queue { 389 str += comp.CRName + " " 390 } 391 str += " ]" 392 393 lst = append(lst, str) 394 } 395 396 return strings.Join(lst, ",") 397 } 398 399 func parseTime(t string) (time.Time, error) { 400 format := "2006-01-02 15:04:05.999999999 -0700 MST" 401 return time.Parse(format, t) 402 } 403 404 // Returns a random integer between min and max. 405 func randomInt(min, max int) int { 406 randomNum, _ := rand.Int(rand.Reader, big.NewInt(int64(max-min))) 407 return int(randomNum.Int64()) + min 408 }