github.com/percona/percona-xtradb-cluster-operator@v1.14.0/pkg/controller/pxcrestore/controller.go

package pxcrestore

import (
	"context"
	"fmt"
	"strings"
	"time"

	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	k8serrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	k8sretry "k8s.io/client-go/util/retry"
	"sigs.k8s.io/controller-runtime/pkg/builder"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	logf "sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	"github.com/percona/percona-xtradb-cluster-operator/clientcmd"
	api "github.com/percona/percona-xtradb-cluster-operator/pkg/apis/pxc/v1"
	"github.com/percona/percona-xtradb-cluster-operator/pkg/pxc/app"
	"github.com/percona/percona-xtradb-cluster-operator/pkg/pxc/app/statefulset"
	"github.com/percona/percona-xtradb-cluster-operator/pkg/pxc/backup"
	"github.com/percona/percona-xtradb-cluster-operator/pkg/pxc/backup/storage"
	"github.com/percona/percona-xtradb-cluster-operator/version"
)

// Add creates a new PerconaXtraDBClusterRestore Controller and adds it to the Manager. The Manager will set fields on the Controller
// and Start it when the Manager is Started.
func Add(mgr manager.Manager) error {
	r, err := newReconciler(mgr)
	if err != nil {
		return err
	}
	return add(mgr, r)
}

// newReconciler returns a new reconcile.Reconciler
func newReconciler(mgr manager.Manager) (reconcile.Reconciler, error) {
	sv, err := version.Server()
	if err != nil {
		return nil, fmt.Errorf("get version: %v", err)
	}

	cli, err := clientcmd.NewClient()
	if err != nil {
		return nil, errors.Wrap(err, "create clientcmd")
	}

	return &ReconcilePerconaXtraDBClusterRestore{
		client:               mgr.GetClient(),
		clientcmd:            cli,
		scheme:               mgr.GetScheme(),
		serverVersion:        sv,
		newStorageClientFunc: storage.NewClient,
	}, nil
}

// add adds a new Controller to mgr with r as the reconcile.Reconciler
func add(mgr manager.Manager, r reconcile.Reconciler) error {
	return builder.ControllerManagedBy(mgr).
		Named("pxcrestore-controller").
		Watches(&api.PerconaXtraDBClusterRestore{}, &handler.EnqueueRequestForObject{}).
		Complete(r)
}
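
// Usage sketch (hypothetical, not part of this package): registering the
// controller in an operator's main. The manager construction, config, and
// signal-handler wiring below are illustrative assumptions, not code from
// this repository:
//
//	mgr, err := manager.New(config.GetConfigOrDie(), manager.Options{})
//	if err != nil {
//		panic(err)
//	}
//	if err := pxcrestore.Add(mgr); err != nil {
//		panic(err)
//	}
//	if err := mgr.Start(signals.SetupSignalHandler()); err != nil {
//		panic(err)
//	}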

var _ reconcile.Reconciler = &ReconcilePerconaXtraDBClusterRestore{}

// ReconcilePerconaXtraDBClusterRestore reconciles a PerconaXtraDBClusterRestore object
type ReconcilePerconaXtraDBClusterRestore struct {
	// This client, initialized using mgr.GetClient() above, is a split client
	// that reads objects from the cache and writes to the apiserver
	client    client.Client
	clientcmd *clientcmd.Client
	scheme    *runtime.Scheme

	serverVersion *version.ServerVersion

	newStorageClientFunc storage.NewClientFunc
}

// Reconcile reads the state of the cluster for a PerconaXtraDBClusterRestore object and makes changes based on the state read
// and what is in the PerconaXtraDBClusterRestore.Spec
// Note:
// The Controller will requeue the Request to be processed again if the returned error is non-nil or
// Result.Requeue is true, otherwise upon completion it will remove the work from the queue.
func (r *ReconcilePerconaXtraDBClusterRestore) Reconcile(ctx context.Context, request reconcile.Request) (reconcile.Result, error) {
	log := logf.FromContext(ctx)

	rr := reconcile.Result{}

	cr := &api.PerconaXtraDBClusterRestore{}
	err := r.client.Get(ctx, request.NamespacedName, cr)
	if err != nil {
		if k8serrors.IsNotFound(err) {
			// Request object not found, could have been deleted after reconcile request.
			return rr, nil
		}
		// Error reading the object - requeue the request.
		return rr, err
	}
	if cr.Status.State != api.RestoreNew {
		return rr, nil
	}

	log.Info("backup restore request")

	err = r.setStatus(cr, api.RestoreStarting, "")
	if err != nil {
		return rr, errors.Wrap(err, "set status")
	}
	rJobsList := &api.PerconaXtraDBClusterRestoreList{}
	err = r.client.List(
		ctx,
		rJobsList,
		&client.ListOptions{
			Namespace: cr.Namespace,
		},
	)
	if err != nil {
		return rr, errors.Wrap(err, "get restore jobs list")
	}

	returnMsg := fmt.Sprintf(backupRestoredMsg, cr.Name, cr.Spec.PXCCluster, cr.Name)

	defer func() {
		status := api.BcpRestoreStates(api.RestoreSucceeded)
		if err != nil {
			status = api.RestoreFailed
			returnMsg = err.Error()
		}
		// Best effort: the restore itself is already finished (or failed),
		// so a status update error is logged rather than propagated.
		if err := r.setStatus(cr, status, returnMsg); err != nil {
			log.Error(err, "failed to set restore status")
		}
	}()

	for _, j := range rJobsList.Items {
		if j.Spec.PXCCluster == cr.Spec.PXCCluster &&
			j.Name != cr.Name && j.Status.State != api.RestoreFailed &&
			j.Status.State != api.RestoreSucceeded {
			err = errors.Errorf("unable to continue: concurrent restore job %s is running", j.Name)
			return rr, err
		}
	}

	err = cr.CheckNsetDefaults()
	if err != nil {
		return rr, err
	}

	cluster := new(api.PerconaXtraDBCluster)
	err = r.client.Get(ctx, types.NamespacedName{Name: cr.Spec.PXCCluster, Namespace: cr.Namespace}, cluster)
	if err != nil {
		err = errors.Wrapf(err, "get cluster %s", cr.Spec.PXCCluster)
		return rr, err
	}
	clusterOrig := cluster.DeepCopy()

	err = cluster.CheckNSetDefaults(r.serverVersion, log)
	if err != nil {
		return reconcile.Result{}, fmt.Errorf("wrong PXC options: %v", err)
	}

	err = backup.CheckPITRErrors(ctx, r.client, r.clientcmd, cluster)
	if err != nil {
		return reconcile.Result{}, err
	}

	bcp, err := r.getBackup(ctx, cr)
	if err != nil {
		return rr, errors.Wrap(err, "get backup")
	}

	annotations := cr.GetAnnotations()
	_, unsafePITR := annotations[api.AnnotationUnsafePITR]
	cond := meta.FindStatusCondition(bcp.Status.Conditions, api.BackupConditionPITRReady)
	if cond != nil && cond.Status == metav1.ConditionFalse && !unsafePITR {
		msg := fmt.Sprintf("Backup doesn't guarantee consistent recovery with PITR. Annotate PerconaXtraDBClusterRestore with %s to force it.", api.AnnotationUnsafePITR)
		// Setting err lets the deferred status update mark the restore as
		// failed with this message, while returning nil avoids requeueing
		// the request.
		err = errors.New(msg)
		return reconcile.Result{}, nil
	}

	err = r.validate(ctx, cr, bcp, cluster)
	if err != nil {
		err = errors.Wrap(err, "failed to validate restore job")
		return rr, err
	}

	log.Info("stopping cluster", "cluster", cr.Spec.PXCCluster)
	err = r.setStatus(cr, api.RestoreStopCluster, "")
	if err != nil {
		err = errors.Wrap(err, "set status")
		return rr, err
	}
	err = r.stopCluster(cluster.DeepCopy())
	if err != nil {
		err = errors.Wrapf(err, "stop cluster %s", cluster.Name)
		return rr, err
	}

	log.Info("starting restore", "cluster", cr.Spec.PXCCluster, "backup", cr.Spec.BackupName)
	err = r.setStatus(cr, api.RestoreRestore, "")
	if err != nil {
		err = errors.Wrap(err, "set status")
		return rr, err
	}

	err = r.restore(ctx, cr, bcp, cluster)
	if err != nil {
		err = errors.Wrap(err, "run restore")
		return rr, err
	}

	log.Info("starting cluster", "cluster", cr.Spec.PXCCluster)
	err = r.setStatus(cr, api.RestoreStartCluster, "")
	if err != nil {
		err = errors.Wrap(err, "set status")
		return rr, err
	}

	if cr.Spec.PITR != nil {
		oldSize := cluster.Spec.PXC.Size
		oldUnsafe := cluster.Spec.AllowUnsafeConfig
		cluster.Spec.PXC.Size = 1
		cluster.Spec.AllowUnsafeConfig = true

		if err := r.startCluster(cluster); err != nil {
			return rr, errors.Wrap(err, "restart cluster for pitr")
		}

		log.Info("point-in-time recovering", "cluster", cr.Spec.PXCCluster)
		err = r.setStatus(cr, api.RestorePITR, "")
		if err != nil {
			return rr, errors.Wrap(err, "set status")
		}

		err = r.pitr(ctx, cr, bcp, cluster)
		if err != nil {
			return rr, errors.Wrap(err, "run pitr")
		}

		cluster.Spec.PXC.Size = oldSize
		cluster.Spec.AllowUnsafeConfig = oldUnsafe

		log.Info("starting cluster", "cluster", cr.Spec.PXCCluster)
		err = r.setStatus(cr, api.RestoreStartCluster, "")
		if err != nil {
			err = errors.Wrap(err, "set status")
			return rr, err
		}
	}

	err = r.startCluster(clusterOrig)
	if err != nil {
		err = errors.Wrap(err, "restart cluster")
		return rr, err
	}

	log.Info(returnMsg)

	return rr, err
}
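
// A minimal restore request that drives Reconcile through its states
// (RestoreNew -> RestoreStarting -> RestoreStopCluster -> RestoreRestore ->
// RestoreStartCluster -> RestoreSucceeded, with RestorePITR inserted before
// the final start when Spec.PITR is set). A sketch with illustrative names;
// c is assumed to be a client.Client for the target namespace:
//
//	restore := &api.PerconaXtraDBClusterRestore{
//		ObjectMeta: metav1.ObjectMeta{Name: "restore1", Namespace: "pxc"},
//		Spec: api.PerconaXtraDBClusterRestoreSpec{
//			PXCCluster: "cluster1",
//			BackupName: "backup1",
//		},
//	}
//	err := c.Create(ctx, restore)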

func (r *ReconcilePerconaXtraDBClusterRestore) getBackup(ctx context.Context, cr *api.PerconaXtraDBClusterRestore) (*api.PerconaXtraDBClusterBackup, error) {
	if cr.Spec.BackupSource != nil {
		status := cr.Spec.BackupSource.DeepCopy()
		status.State = api.BackupSucceeded
		status.CompletedAt = nil
		status.LastScheduled = nil
		return &api.PerconaXtraDBClusterBackup{
			ObjectMeta: metav1.ObjectMeta{
				Name:      cr.Name,
				Namespace: cr.Namespace,
			},
			Spec: api.PXCBackupSpec{
				PXCCluster:  cr.Spec.PXCCluster,
				StorageName: cr.Spec.BackupSource.StorageName,
			},
			Status: *status,
		}, nil
	}

	bcp := &api.PerconaXtraDBClusterBackup{}
	err := r.client.Get(ctx, types.NamespacedName{Name: cr.Spec.BackupName, Namespace: cr.Namespace}, bcp)
	if err != nil {
		err = errors.Wrapf(err, "get backup %s", cr.Spec.BackupName)
		return bcp, err
	}
	if bcp.Status.State != api.BackupSucceeded {
		err = errors.Errorf("backup %s hasn't finished yet, current state: %s", bcp.Name, bcp.Status.State)
		return bcp, err
	}

	return bcp, nil
}

const backupRestoredMsg = `You can view the xtrabackup log:
$ kubectl logs job/restore-job-%s-%s
If everything is fine, you can clean up the job:
$ kubectl delete pxc-restore/%s
`
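
// For example (illustrative names), with a restore named "restore1" for
// cluster "cluster1", fmt.Sprintf(backupRestoredMsg, cr.Name, cr.Spec.PXCCluster, cr.Name)
// renders as:
//
//	You can view the xtrabackup log:
//	$ kubectl logs job/restore-job-restore1-cluster1
//	If everything is fine, you can clean up the job:
//	$ kubectl delete pxc-restore/restore1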

func (r *ReconcilePerconaXtraDBClusterRestore) stopCluster(c *api.PerconaXtraDBCluster) error {
	var gracePeriodSec int64

	if c.Spec.PXC != nil && c.Spec.PXC.TerminationGracePeriodSeconds != nil {
		gracePeriodSec = int64(c.Spec.PXC.Size) * *c.Spec.PXC.TerminationGracePeriodSeconds
	}

	patch := client.MergeFrom(c.DeepCopy())
	c.Spec.Pause = true
	err := r.client.Patch(context.TODO(), c, patch)
	if err != nil {
		return errors.Wrap(err, "shutdown pods")
	}

	ls := statefulset.NewNode(c).Labels()
	err = r.waitForPodsShutdown(ls, c.Namespace, gracePeriodSec)
	if err != nil {
		return errors.Wrap(err, "shutdown pods")
	}

	pvcs := corev1.PersistentVolumeClaimList{}
	err = r.client.List(
		context.TODO(),
		&pvcs,
		&client.ListOptions{
			Namespace:     c.Namespace,
			LabelSelector: labels.SelectorFromSet(ls),
		},
	)
	if err != nil {
		return errors.Wrap(err, "get pvc list")
	}

	pxcNode := statefulset.NewNode(c)
	pvcNameTemplate := app.DataVolumeName + "-" + pxcNode.StatefulSet().Name
	for _, pvc := range pvcs.Items {
		// check the prefix just in case, to be sure we don't delete the wrong PVC
		if pvc.Name == pvcNameTemplate+"-0" || !strings.HasPrefix(pvc.Name, pvcNameTemplate) {
			continue
		}

		err = r.client.Delete(context.TODO(), &pvc)
		if err != nil {
			return errors.Wrap(err, "delete pvc")
		}
	}

	err = r.waitForPVCShutdown(ls, c.Namespace)
	if err != nil {
		return errors.Wrap(err, "shutdown pvc")
	}

	return nil
}
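
// For illustration (hypothetical names): if the cluster is named "cluster1",
// the PXC StatefulSet is "cluster1-pxc" and, assuming the data volume is
// named "datadir", pvcNameTemplate above is "datadir-cluster1-pxc". The PVC
// "datadir-cluster1-pxc-0" is kept as the restore target, while
// "datadir-cluster1-pxc-1", "datadir-cluster1-pxc-2", ... are deleted so the
// remaining nodes re-sync from the restored node when the cluster starts again.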

func (r *ReconcilePerconaXtraDBClusterRestore) startCluster(cr *api.PerconaXtraDBCluster) (err error) {
	// retry several times just to avoid possible conflicts with the main controller
	err = k8sretry.RetryOnConflict(k8sretry.DefaultRetry, func() error {
		// need to get the object with the latest version of its metadata for the update
		current := &api.PerconaXtraDBCluster{}
		rerr := r.client.Get(context.TODO(), types.NamespacedName{Name: cr.Name, Namespace: cr.Namespace}, current)
		if rerr != nil {
			return errors.Wrap(rerr, "get cluster")
		}
		current.Spec = cr.Spec
		return r.client.Update(context.TODO(), current)
	})
	if err != nil {
		return errors.Wrap(err, "update cluster")
	}

	// give the operator time to process the new state
	time.Sleep(10 * time.Second)

	var waitLimit int32 = 2 * 60 * 60 // 2 hours
	if cr.Spec.PXC.LivenessInitialDelaySeconds != nil {
		waitLimit = *cr.Spec.PXC.LivenessInitialDelaySeconds * cr.Spec.PXC.Size
	}

	for i := int32(0); i < waitLimit; i++ {
		current := &api.PerconaXtraDBCluster{}
		err = r.client.Get(context.TODO(), types.NamespacedName{Name: cr.Name, Namespace: cr.Namespace}, current)
		if err != nil {
			return errors.Wrap(err, "get cluster")
		}
		if current.Status.ObservedGeneration == current.Generation && current.Status.PXC.Status == api.AppStateReady {
			return nil
		}
		time.Sleep(time.Second * 1)
	}

	return errors.Errorf("exceeded wait limit")
}
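
// The wait limit works out to one 1-second poll per unit. A sketch of the
// arithmetic with illustrative values: by default the loop runs
// 2*60*60 = 7200 polls, i.e. about two hours; with
// LivenessInitialDelaySeconds = 300 and Spec.PXC.Size = 3 it waits up to
// 300*3 = 900 polls, roughly 15 minutes.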

const waitLimitSec int64 = 300

func (r *ReconcilePerconaXtraDBClusterRestore) waitForPodsShutdown(ls map[string]string, namespace string, gracePeriodSec int64) error {
	for i := int64(0); i < waitLimitSec+gracePeriodSec; i++ {
		pods := corev1.PodList{}

		err := r.client.List(
			context.TODO(),
			&pods,
			&client.ListOptions{
				Namespace:     namespace,
				LabelSelector: labels.SelectorFromSet(ls),
			},
		)
		if err != nil {
			return errors.Wrap(err, "get pods list")
		}

		if len(pods.Items) == 0 {
			return nil
		}

		time.Sleep(time.Second * 1)
	}

	return errors.Errorf("exceeded wait limit")
}

func (r *ReconcilePerconaXtraDBClusterRestore) waitForPVCShutdown(ls map[string]string, namespace string) error {
	for i := int64(0); i < waitLimitSec; i++ {
		pvcs := corev1.PersistentVolumeClaimList{}

		err := r.client.List(
			context.TODO(),
			&pvcs,
			&client.ListOptions{
				Namespace:     namespace,
				LabelSelector: labels.SelectorFromSet(ls),
			},
		)
		if err != nil {
			return errors.Wrap(err, "get pvc list")
		}

		// stopCluster keeps only the node-0 data PVC, so deletion is
		// complete once exactly one PVC remains
		if len(pvcs.Items) == 1 {
			return nil
		}

		time.Sleep(time.Second * 1)
	}

	return errors.Errorf("exceeded wait limit")
}

func (r *ReconcilePerconaXtraDBClusterRestore) setStatus(cr *api.PerconaXtraDBClusterRestore, state api.BcpRestoreStates, comments string) error {
	cr.Status.State = state
	switch state {
	case api.RestoreSucceeded:
		tm := metav1.NewTime(time.Now())
		cr.Status.CompletedAt = &tm
	}

	cr.Status.Comments = comments

	err := r.client.Status().Update(context.TODO(), cr)
	if err != nil {
		return errors.Wrap(err, "send update")
	}

	return nil
}