github.com/percona/percona-xtradb-cluster-operator@v1.14.0/pkg/controller/pxc/full_crash_recovery.go (about) 1 package pxc 2 3 import ( 4 "bytes" 5 "context" 6 "fmt" 7 "strconv" 8 "strings" 9 "time" 10 11 v1 "github.com/percona/percona-xtradb-cluster-operator/pkg/apis/pxc/v1" 12 "github.com/pkg/errors" 13 corev1 "k8s.io/api/core/v1" 14 k8serrors "k8s.io/apimachinery/pkg/api/errors" 15 "k8s.io/apimachinery/pkg/types" 16 logf "sigs.k8s.io/controller-runtime/pkg/log" 17 ) 18 19 var ( 20 ErrNotAllPXCPodsRunning = errors.New("not all pxc pods are running") 21 logLinesRequired = int64(8) 22 ) 23 24 const logPrefix = `#####################################################LAST_LINE` 25 26 func (r *ReconcilePerconaXtraDBCluster) recoverFullClusterCrashIfNeeded(ctx context.Context, cr *v1.PerconaXtraDBCluster) error { 27 if cr.Spec.PXC.Size <= 0 { 28 return nil 29 } 30 31 err := r.checkIfPodsRunning(cr) 32 if err != nil { 33 if err == ErrNotAllPXCPodsRunning { 34 return nil 35 } 36 return err 37 } 38 39 isWaiting, _, err := r.isPodWaitingForRecovery(cr.Namespace, cr.Name+"-pxc-0") 40 if err != nil { 41 return errors.Wrap(err, "failed to check if pxc pod 0 is waiting for recovery") 42 } 43 44 if isWaiting { 45 return r.doFullCrashRecovery(ctx, cr.Name, cr.Namespace, int(cr.Spec.PXC.Size)) 46 } 47 48 return nil 49 } 50 51 func (r *ReconcilePerconaXtraDBCluster) isPodWaitingForRecovery(namespace, podName string) (bool, int64, error) { 52 logOpts := &corev1.PodLogOptions{ 53 Container: "pxc", 54 TailLines: &logLinesRequired, 55 } 56 logLines, err := r.clientcmd.PodLogs(namespace, podName, logOpts) 57 if err != nil { 58 return false, -1, errors.Wrapf(err, "get logs from %s pod", podName) 59 } 60 61 for i := len(logLines) - 1; i >= 0; i-- { 62 if strings.HasPrefix(logLines[i], logPrefix) { 63 seq, err := parseSequence(logLines[i]) 64 return true, seq, err 65 } 66 } 67 68 return false, -1, nil 69 } 70 71 func parseSequence(log string) (int64, error) { 72 logsSplitted := strings.Split(log, ":") 73 if len(logsSplitted) != 4 { 74 return -1, errors.New("invalid log format. Log: " + log) 75 } 76 77 seq, err := strconv.ParseInt(logsSplitted[2], 10, 64) 78 if err != nil { 79 return -1, errors.Wrapf(err, "parse sequence %s", logsSplitted[2]) 80 } 81 82 return seq, nil 83 } 84 85 func (r *ReconcilePerconaXtraDBCluster) doFullCrashRecovery(ctx context.Context, crName, namespace string, pxcSize int) error { 86 maxSeq := int64(-100) 87 maxSeqPod := "" 88 89 for i := 0; i < pxcSize; i++ { 90 podName := fmt.Sprintf("%s-pxc-%d", crName, i) 91 isPodWaitingForRecovery, seq, err := r.isPodWaitingForRecovery(namespace, podName) 92 if err != nil { 93 return errors.Wrapf(err, "parse %s pod logs", podName) 94 } 95 96 if !isPodWaitingForRecovery { 97 return nil 98 } 99 100 if seq > maxSeq { 101 maxSeq = seq 102 maxSeqPod = podName 103 } 104 } 105 log := logf.FromContext(ctx) 106 log.Info("We are in full cluster crash, starting recovery") 107 log.Info("Results of scanning sequences", "pod", maxSeqPod, "maxSeq", maxSeq) 108 109 pod := &corev1.Pod{} 110 err := r.client.Get(context.TODO(), types.NamespacedName{ 111 Namespace: namespace, 112 Name: maxSeqPod, 113 }, pod) 114 if err != nil { 115 return errors.Wrap(err, "get pods defenition") 116 } 117 118 stderrBuf := &bytes.Buffer{} 119 err = r.clientcmd.Exec(pod, "pxc", []string{"/bin/sh", "-c", "kill -s USR1 1"}, nil, nil, stderrBuf, false) 120 if err != nil { 121 return errors.Wrap(err, "exec command in pod") 122 } 123 124 if stderrBuf.Len() != 0 { 125 return errors.New("invalid exec command return: " + stderrBuf.String()) 126 } 127 128 // sleep there a little to start script and do not send 129 // a lot of signals to the same pod 130 time.Sleep(30 * time.Second) 131 132 return nil 133 } 134 135 func (r *ReconcilePerconaXtraDBCluster) checkIfPodsRunning(cr *v1.PerconaXtraDBCluster) error { 136 for i := 0; i < int(cr.Spec.PXC.Size); i++ { 137 podName := fmt.Sprintf("%s-pxc-%d", cr.Name, i) 138 ok, err := r.clientcmd.IsPodRunning(cr.Namespace, podName) 139 if err != nil { 140 if k8serrors.IsNotFound(err) { 141 return ErrNotAllPXCPodsRunning 142 } 143 return errors.Wrapf(err, "can't check pod %s state", podName) 144 } 145 if !ok { 146 return ErrNotAllPXCPodsRunning 147 } 148 } 149 return nil 150 }