github.com/IBM-Blockchain/fabric-operator@v1.0.4/pkg/restart/staggerrestarts/staggerrestarts.go

     1  /*
     2   * Copyright contributors to the Hyperledger Fabric Operator project
     3   *
     4   * SPDX-License-Identifier: Apache-2.0
     5   *
     6   * Licensed under the Apache License, Version 2.0 (the "License");
     7   * you may not use this file except in compliance with the License.
     8   * You may obtain a copy of the License at:
     9   *
    10   * 	  http://www.apache.org/licenses/LICENSE-2.0
    11   *
    12   * Unless required by applicable law or agreed to in writing, software
    13   * distributed under the License is distributed on an "AS IS" BASIS,
    14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15   * See the License for the specific language governing permissions and
    16   * limitations under the License.
    17   */
    18  
    19  package staggerrestarts
    20  
    21  import (
    22  	"context"
    23  	"crypto/rand"
    24  	"fmt"
    25  	"math/big"
    26  	"strings"
    27  	"time"
    28  
    29  	current "github.com/IBM-Blockchain/fabric-operator/api/v1beta1"
    30  	"github.com/IBM-Blockchain/fabric-operator/pkg/action"
    31  	k8sclient "github.com/IBM-Blockchain/fabric-operator/pkg/k8s/controllerclient"
    32  	"github.com/IBM-Blockchain/fabric-operator/pkg/restart/configmap"
    33  	"github.com/pkg/errors"
    34  
    35  	corev1 "k8s.io/api/core/v1"
    36  	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    37  	"k8s.io/apimachinery/pkg/labels"
    38  	"k8s.io/apimachinery/pkg/util/wait"
    39  	"sigs.k8s.io/controller-runtime/pkg/client"
    40  	logf "sigs.k8s.io/controller-runtime/pkg/log"
    41  )
    42  
    43  var log = logf.Log.WithName("stagger_restart_service")
    44  
    45  type Instance interface {
    46  	v1.Object
    47  	GetMSPID() string
    48  }
    49  
    50  type StaggerRestartsService struct {
    51  	Client           k8sclient.Client
    52  	ConfigMapManager *configmap.Manager
    53  	Timeout          time.Duration
    54  }
    55  
    56  func New(client k8sclient.Client, timeout time.Duration) *StaggerRestartsService {
    57  	return &StaggerRestartsService{
    58  		Client:           client,
    59  		Timeout:          timeout,
    60  		ConfigMapManager: configmap.NewManager(client),
    61  	}
    62  }
    63  
     64  // Restart is called by the restart manager.
     65  // For CA/Peer/Orderer: adds the component to the queue for restart.
     66  // For Console: restarts the component directly, as there is only one ibpconsole
     67  // instance per network; the queue logic is bypassed for ibpconsoles.
    68  func (s *StaggerRestartsService) Restart(instance Instance, reason string) error {
    69  	switch instance.(type) {
    70  	case *current.IBPConsole:
    71  		if err := s.RestartImmediately("console", instance, reason); err != nil {
    72  			return errors.Wrapf(err, "failed to restart %s", instance.GetName())
    73  		}
    74  	default:
    75  		if err := s.AddToQueue(instance, reason); err != nil {
    76  			return errors.Wrapf(err, "failed to add restart request to queue for %s", instance.GetName())
    77  		}
    78  	}
    79  
    80  	return nil
    81  }
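         // The snippet below is an illustrative sketch only, not part of this package:
         // it shows how a caller holding a concrete k8sclient.Client (named "k8sClient"
         // here) might construct the service and request a staggered restart for a peer
         // instance. The 5 minute timeout, the "peerInstance" variable, and the reason
         // string are hypothetical values.
         //
         //	restartService := staggerrestarts.New(k8sClient, 5*time.Minute)
         //	if err := restartService.Restart(peerInstance, "tls cert renewed"); err != nil {
         //		// handle error
         //	}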
    82  
    83  // AddToQueue is called by the restart manager and handles adding the
    84  // restart request to the queue associated with the instance's MSPID
    85  // in the <ca/peer/orderer>-restart-config CM.
    86  func (s *StaggerRestartsService) AddToQueue(instance Instance, reason string) error {
    87  	var componentType string
    88  	switch instance.(type) {
    89  	case *current.IBPCA:
    90  		componentType = "ca"
    91  	case *current.IBPOrderer:
    92  		componentType = "orderer"
    93  	case *current.IBPPeer:
    94  		componentType = "peer"
    95  
    96  	}
    97  
    98  	err := wait.Poll(time.Second, 3*time.Second, func() (bool, error) {
    99  		err := s.addToQueue(componentType, instance, reason)
   100  		if err != nil {
   101  			log.Error(err, "failed to add to queue")
   102  			return false, nil
   103  		}
   104  		return true, nil
   105  	})
   106  
   107  	if err != nil {
   108  		return errors.Wrapf(err, "failed to add %s to queue", instance.GetName())
   109  	}
   110  
   111  	return nil
   112  }
   113  
   114  func (s *StaggerRestartsService) addToQueue(componentType string, instance Instance, reason string) error {
   115  	component := &Component{
   116  		CRName: instance.GetName(),
   117  		Reason: reason,
   118  		Status: Pending,
   119  	}
   120  
   121  	restartConfig, err := s.GetConfig(componentType, instance.GetNamespace())
   122  	if err != nil {
   123  		return err
   124  	}
   125  
   126  	// Add component to queue
   127  	restartConfig.AddToQueue(instance.GetMSPID(), component)
   128  
   129  	err = s.UpdateConfig(componentType, instance.GetNamespace(), restartConfig)
   130  	if err != nil {
   131  		return err
   132  	}
   133  
   134  	return nil
   135  }
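         // Illustrative sketch of the resulting state, with hypothetical names: after
         // addToQueue("peer", instance, "config updated") runs for a peer CR named
         // "org1peer1" whose MSPID is "Org1MSP", the RestartConfig persisted to the
         // "peer-restart-config" ConfigMap is expected to contain roughly:
         //
         //	Queues["Org1MSP"] = []*Component{
         //		{CRName: "org1peer1", Reason: "config updated", Status: Pending},
         //	}
         //
         // Reconcile later processes the front component of each such queue.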
   136  
   137  func (s *StaggerRestartsService) RestartImmediately(componentType string, instance Instance, reason string) error {
   138  	log.Info(fmt.Sprintf("Restarting %s...", instance.GetName()))
   139  	err := s.RestartDeployment(instance.GetName(), instance.GetNamespace())
   140  	if err != nil {
   141  		return err
   142  	}
   143  
   144  	component := &Component{
   145  		CRName:               instance.GetName(),
   146  		Reason:               reason,
   147  		Status:               Restarted,
   148  		LastCheckedTimestamp: time.Now().UTC().String(),
   149  	}
   150  
   151  	restartConfig, err := s.GetConfig(componentType, instance.GetNamespace())
   152  	if err != nil {
   153  		return err
   154  	}
   155  	restartConfig.AddToLog(component)
   156  
   157  	err = s.UpdateConfig(componentType, instance.GetNamespace(), restartConfig)
   158  	if err != nil {
   159  		return err
   160  	}
   161  
   162  	return nil
   163  }
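         // Illustrative sketch, with a hypothetical console name: for an IBPConsole
         // named "ibpconsole" restarted with reason "config updated",
         // RestartImmediately bypasses the queues and records a log entry roughly like
         //
         //	{CRName: "ibpconsole", Reason: "config updated", Status: Restarted,
         //	 LastCheckedTimestamp: "<time.Now().UTC().String()>"}
         //
         // in the "console-restart-config" ConfigMap.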
   164  
   165  // Reconcile is called by the ca/peer/orderer reconcile loops via the restart
    166  // manager when an update to the <ca/peer/orderer>-restart-config CM is detected,
   167  // and handles the different states of the first component of each queue.
   168  //
   169  // Returns true if the controller needs to requeue the request to reconcile the restart manager.
   170  func (s *StaggerRestartsService) Reconcile(componentType, namespace string) (bool, error) {
   171  	requeue := false
   172  
   173  	restartConfig, err := s.GetConfig(componentType, namespace)
   174  	if err != nil {
   175  		return requeue, err
   176  	}
   177  
   178  	updated := false
   179  	// Check front component of each queue
   180  	for mspid, queue := range restartConfig.Queues {
   181  		if len(queue) == 0 {
   182  			// queue is empty - do nothing
   183  			continue
   184  		}
   185  
   186  		component := queue[0]
   187  		name := component.CRName
   188  
   189  		switch component.Status {
   190  		case Pending:
   191  			log.Info(fmt.Sprintf("%s in pending status, restarting deployment", component.CRName))
   192  
    193  			// Save the current pod name so the Waiting case can detect when it has been replaced
   194  			pods, err := s.GetRunningPods(name, namespace)
   195  			if err != nil {
   196  				return requeue, errors.Wrapf(err, "failed to get running pods for %s", name)
   197  			}
   198  
   199  			if len(pods) > 0 {
   200  				component.PodName = pods[0].Name
   201  			}
   202  
   203  			// Restart component
   204  			err = s.RestartDeployment(name, namespace)
   205  			if err != nil {
   206  				return requeue, errors.Wrapf(err, "failed to restart deployment %s", name)
   207  			}
   208  
   209  			// Update config
   210  			component.Status = Waiting
   211  			component.LastCheckedTimestamp = time.Now().UTC().String()
   212  			component.CheckUntilTimestamp = time.Now().Add(s.Timeout).UTC().String()
   213  
   214  			updated = true
   215  
   216  		case Waiting:
   217  			pods, err := s.GetRunningPods(name, namespace)
   218  			if err != nil {
   219  				return requeue, errors.Wrapf(err, "failed to get running pods for %s", name)
   220  			}
   221  
   222  			// Scenario 1: the pod has restarted
   223  			if len(pods) == 1 {
   224  				if component.PodName != pods[0].Name {
   225  					// Pod has restarted as the old pod has disappeared
   226  					log.Info(fmt.Sprintf("%s in completed status, removing from %s restart queue", component.CRName, mspid))
   227  					component.Status = Completed
   228  
   229  					restartConfig.AddToLog(component)
   230  					restartConfig.PopFromQueue(mspid)
   231  
   232  					log.Info(fmt.Sprintf("Remaining restart queue(s) to reconcile: %s", queuesToString(restartConfig.Queues)))
   233  					updated = true
   234  
   235  					continue
   236  				}
   237  			}
   238  
   239  			// Scenario 2: the pod has not restarted and the wait period has timed out
   240  			checkUntil, err := parseTime(component.CheckUntilTimestamp)
   241  			if err != nil {
   242  				return requeue, errors.Wrap(err, "failed to parse checkUntilTimestamp")
   243  			}
   244  			if time.Now().UTC().After(checkUntil) {
   245  				log.Info(fmt.Sprintf("%s in expired status, has not restarted within %s", component.CRName, s.Timeout.String()))
   246  				// Pod has not restarted within s.timeout, move to log
   247  				component.Status = Expired
   248  
   249  				restartConfig.AddToLog(component)
   250  				restartConfig.PopFromQueue(mspid)
   251  
   252  				log.Info(fmt.Sprintf("Remaining restart queue(s) to reconcile: %s", queuesToString(restartConfig.Queues)))
   253  				updated = true
   254  
   255  				continue
   256  			}
   257  
   258  			// Scenario 3: the pod has not yet restarted but there is still time remaining
   259  			// to wait for the pod to restart.
   260  
    261  			// To prevent the restart manager from overwriting the config map and losing
   262  			// data, the config map updates that trigger reconciles only occur every 10-30
   263  			// seconds. If the lastCheckedInterval amount of time has not yet passed since
   264  			// the lastCheckedTimestamp, then we return true to tell the controllers to
   265  			// requeue the request to reconcile the restart config map to ensure that
   266  			// a reconcile will occur again even when the config map is not updated.
   267  
   268  			lastCheckedInterval := time.Duration(randomInt(10, 30)) * time.Second
   269  			lastChecked, err := parseTime(component.LastCheckedTimestamp)
   270  			if err != nil {
   271  				return requeue, errors.Wrap(err, "failed to parse lastCheckedTimestamp")
   272  			}
   273  
   274  			if lastChecked.Add(lastCheckedInterval).Before(time.Now()) {
   275  				component.LastCheckedTimestamp = time.Now().UTC().String()
   276  				updated = true
   277  			} else {
   278  				requeue = true
   279  			}
   280  
   281  		default:
    282  			// Expired or Completed status - should not be reached, as the Waiting case handles moving components to the log
   283  			log.Info(fmt.Sprintf("%s restart status is %s, removing from %s restart queue", component.CRName, component.Status, mspid))
   284  
   285  			restartConfig.AddToLog(component)
   286  			restartConfig.PopFromQueue(mspid)
   287  
   288  			updated = true
   289  		}
   290  	}
   291  
   292  	if updated {
   293  		err = s.UpdateConfig(componentType, namespace, restartConfig)
   294  		if err != nil {
   295  			return requeue, err
   296  		}
   297  	}
   298  
   299  	return requeue, nil
   300  }
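         // Illustrative sketch only, not part of this package: a controller-runtime
         // reconciler is expected to consume Reconcile's boolean return value roughly
         // as follows, where "r.RestartService" and the "peer" component type are
         // hypothetical wiring.
         //
         //	requeue, err := r.RestartService.Reconcile("peer", request.Namespace)
         //	if err != nil {
         //		return reconcile.Result{}, err
         //	}
         //	if requeue {
         //		// Re-drive reconciliation even though the restart ConfigMap was not
         //		// updated, so Waiting components keep being checked until they
         //		// restart or expire.
         //		return reconcile.Result{Requeue: true}, nil
         //	}
         //	return reconcile.Result{}, nil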
   301  
   302  func (s *StaggerRestartsService) GetConfig(componentType, namespace string) (*RestartConfig, error) {
   303  	cmName := fmt.Sprintf("%s-restart-config", componentType)
   304  
   305  	cfg := &RestartConfig{
   306  		Queues: map[string][]*Component{},
   307  	}
   308  	err := s.ConfigMapManager.GetRestartConfigFrom(cmName, namespace, cfg)
   309  	if err != nil {
   310  		return nil, err
   311  	}
   312  
   313  	return cfg, nil
   314  }
   315  
   316  func (s *StaggerRestartsService) UpdateConfig(componentType, namespace string, cfg *RestartConfig) error {
   317  	cmName := fmt.Sprintf("%s-restart-config", componentType)
   318  	return s.ConfigMapManager.UpdateConfig(cmName, namespace, cfg)
   319  }
   320  
   321  func (s *StaggerRestartsService) RestartDeployment(name, namespace string) error {
   322  	log.Info(fmt.Sprintf("Restarting deployment %s", name))
   323  
   324  	err := action.Restart(s.Client, name, namespace)
   325  	if err != nil {
   326  		return err
   327  	}
   328  
   329  	return nil
   330  }
   331  
   332  func (s *StaggerRestartsService) GetRunningPods(name, namespace string) ([]corev1.Pod, error) {
   333  	pods := []corev1.Pod{}
   334  
   335  	labelSelector, err := labels.Parse(fmt.Sprintf("app=%s", name))
   336  	if err != nil {
   337  		return pods, errors.Wrap(err, "failed to parse label selector for app name")
   338  	}
   339  
   340  	listOptions := &client.ListOptions{
   341  		LabelSelector: labelSelector,
   342  		Namespace:     namespace,
   343  	}
   344  
   345  	podList := &corev1.PodList{}
   346  	err = s.Client.List(context.TODO(), podList, listOptions)
   347  	if err != nil {
    348  		log.Error(err, fmt.Sprintf("failed to get pod list for %s", name))
   349  		// return empty pods list
    350  		// NOTE: we decided not to return an error here since this function will be called multiple
   351  		// times throughout the process of old pods terminating and new pods starting up.
   352  		// We don't want to error out prematurely if this client call isn't able to retrieve
   353  		// a list of pods during the restart process.
   354  		return pods, nil
   355  	}
   356  
   357  	for _, pod := range podList.Items {
   358  		switch pod.Status.Phase {
   359  		case corev1.PodRunning:
   360  			containerStatuses := pod.Status.ContainerStatuses
   361  
   362  			readyContainers := 0
   363  			numContainers := len(containerStatuses)
   364  
   365  			for _, status := range containerStatuses {
   366  				// TODO: is it required to check status.Ready?
   367  				if status.Ready && status.State.Running != nil {
   368  					readyContainers++
   369  				}
   370  			}
   371  			if readyContainers == numContainers {
   372  				pods = append(pods, pod)
   373  			}
   374  		}
   375  	}
   376  
   377  	return pods, nil
   378  }
   379  
   380  func queuesToString(queues map[string][]*Component) string {
   381  	lst := []string{}
   382  	for org, queue := range queues {
   383  		str := org + ": [ "
   384  		if org == "" {
   385  			// This is a ca queue
   386  			str = "[ "
   387  		}
   388  		for _, comp := range queue {
   389  			str += comp.CRName + " "
   390  		}
   391  		str += " ]"
   392  
   393  		lst = append(lst, str)
   394  	}
   395  
   396  	return strings.Join(lst, ",")
   397  }
   398  
         // parseTime parses the timestamps stored by this service, which are written
         // with time.Time.String(); the layout below is that method's default format.
    399  func parseTime(t string) (time.Time, error) {
    400  	format := "2006-01-02 15:04:05.999999999 -0700 MST"
   401  	return time.Parse(format, t)
   402  }
   403  
    404  // randomInt returns a random integer in the half-open interval [min, max).
   405  func randomInt(min, max int) int {
   406  	randomNum, _ := rand.Int(rand.Reader, big.NewInt(int64(max-min)))
   407  	return int(randomNum.Int64()) + min
   408  }