github.com/percona/percona-xtradb-cluster-operator@v1.14.0/pkg/controller/pxc/full_crash_recovery.go (about)

     1  package pxc
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"fmt"
     7  	"strconv"
     8  	"strings"
     9  	"time"
    10  
    11  	v1 "github.com/percona/percona-xtradb-cluster-operator/pkg/apis/pxc/v1"
    12  	"github.com/pkg/errors"
    13  	corev1 "k8s.io/api/core/v1"
    14  	k8serrors "k8s.io/apimachinery/pkg/api/errors"
    15  	"k8s.io/apimachinery/pkg/types"
    16  	logf "sigs.k8s.io/controller-runtime/pkg/log"
    17  )
    18  
    19  var (
    20  	ErrNotAllPXCPodsRunning = errors.New("not all pxc pods are running")
    21  	logLinesRequired        = int64(8)
    22  )
    23  
    24  const logPrefix = `#####################################################LAST_LINE`
    25  
    26  func (r *ReconcilePerconaXtraDBCluster) recoverFullClusterCrashIfNeeded(ctx context.Context, cr *v1.PerconaXtraDBCluster) error {
    27  	if cr.Spec.PXC.Size <= 0 {
    28  		return nil
    29  	}
    30  
    31  	err := r.checkIfPodsRunning(cr)
    32  	if err != nil {
    33  		if err == ErrNotAllPXCPodsRunning {
    34  			return nil
    35  		}
    36  		return err
    37  	}
    38  
    39  	isWaiting, _, err := r.isPodWaitingForRecovery(cr.Namespace, cr.Name+"-pxc-0")
    40  	if err != nil {
    41  		return errors.Wrap(err, "failed to check if pxc pod 0 is waiting for recovery")
    42  	}
    43  
    44  	if isWaiting {
    45  		return r.doFullCrashRecovery(ctx, cr.Name, cr.Namespace, int(cr.Spec.PXC.Size))
    46  	}
    47  
    48  	return nil
    49  }
    50  
    51  func (r *ReconcilePerconaXtraDBCluster) isPodWaitingForRecovery(namespace, podName string) (bool, int64, error) {
    52  	logOpts := &corev1.PodLogOptions{
    53  		Container: "pxc",
    54  		TailLines: &logLinesRequired,
    55  	}
    56  	logLines, err := r.clientcmd.PodLogs(namespace, podName, logOpts)
    57  	if err != nil {
    58  		return false, -1, errors.Wrapf(err, "get logs from %s pod", podName)
    59  	}
    60  
    61  	for i := len(logLines) - 1; i >= 0; i-- {
    62  		if strings.HasPrefix(logLines[i], logPrefix) {
    63  			seq, err := parseSequence(logLines[i])
    64  			return true, seq, err
    65  		}
    66  	}
    67  
    68  	return false, -1, nil
    69  }
    70  
    71  func parseSequence(log string) (int64, error) {
    72  	logsSplitted := strings.Split(log, ":")
    73  	if len(logsSplitted) != 4 {
    74  		return -1, errors.New("invalid log format. Log: " + log)
    75  	}
    76  
    77  	seq, err := strconv.ParseInt(logsSplitted[2], 10, 64)
    78  	if err != nil {
    79  		return -1, errors.Wrapf(err, "parse sequence %s", logsSplitted[2])
    80  	}
    81  
    82  	return seq, nil
    83  }
    84  
    85  func (r *ReconcilePerconaXtraDBCluster) doFullCrashRecovery(ctx context.Context, crName, namespace string, pxcSize int) error {
    86  	maxSeq := int64(-100)
    87  	maxSeqPod := ""
    88  
    89  	for i := 0; i < pxcSize; i++ {
    90  		podName := fmt.Sprintf("%s-pxc-%d", crName, i)
    91  		isPodWaitingForRecovery, seq, err := r.isPodWaitingForRecovery(namespace, podName)
    92  		if err != nil {
    93  			return errors.Wrapf(err, "parse %s pod logs", podName)
    94  		}
    95  
    96  		if !isPodWaitingForRecovery {
    97  			return nil
    98  		}
    99  
   100  		if seq > maxSeq {
   101  			maxSeq = seq
   102  			maxSeqPod = podName
   103  		}
   104  	}
   105  	log := logf.FromContext(ctx)
   106  	log.Info("We are in full cluster crash, starting recovery")
   107  	log.Info("Results of scanning sequences", "pod", maxSeqPod, "maxSeq", maxSeq)
   108  
   109  	pod := &corev1.Pod{}
   110  	err := r.client.Get(context.TODO(), types.NamespacedName{
   111  		Namespace: namespace,
   112  		Name:      maxSeqPod,
   113  	}, pod)
   114  	if err != nil {
   115  		return errors.Wrap(err, "get pods defenition")
   116  	}
   117  
   118  	stderrBuf := &bytes.Buffer{}
   119  	err = r.clientcmd.Exec(pod, "pxc", []string{"/bin/sh", "-c", "kill -s USR1 1"}, nil, nil, stderrBuf, false)
   120  	if err != nil {
   121  		return errors.Wrap(err, "exec command in pod")
   122  	}
   123  
   124  	if stderrBuf.Len() != 0 {
   125  		return errors.New("invalid exec command return: " + stderrBuf.String())
   126  	}
   127  
   128  	// sleep there a little to start script and do not send
   129  	// a lot of signals to the same pod
   130  	time.Sleep(30 * time.Second)
   131  
   132  	return nil
   133  }
   134  
   135  func (r *ReconcilePerconaXtraDBCluster) checkIfPodsRunning(cr *v1.PerconaXtraDBCluster) error {
   136  	for i := 0; i < int(cr.Spec.PXC.Size); i++ {
   137  		podName := fmt.Sprintf("%s-pxc-%d", cr.Name, i)
   138  		ok, err := r.clientcmd.IsPodRunning(cr.Namespace, podName)
   139  		if err != nil {
   140  			if k8serrors.IsNotFound(err) {
   141  				return ErrNotAllPXCPodsRunning
   142  			}
   143  			return errors.Wrapf(err, "can't check pod %s state", podName)
   144  		}
   145  		if !ok {
   146  			return ErrNotAllPXCPodsRunning
   147  		}
   148  	}
   149  	return nil
   150  }