github.com/percona/percona-xtradb-cluster-operator@v1.14.0/pkg/controller/pxcrestore/controller.go (about) 1 package pxcrestore 2 3 import ( 4 "context" 5 "fmt" 6 "strings" 7 "time" 8 9 "github.com/pkg/errors" 10 corev1 "k8s.io/api/core/v1" 11 k8serrors "k8s.io/apimachinery/pkg/api/errors" 12 "k8s.io/apimachinery/pkg/api/meta" 13 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 14 "k8s.io/apimachinery/pkg/labels" 15 "k8s.io/apimachinery/pkg/runtime" 16 "k8s.io/apimachinery/pkg/types" 17 k8sretry "k8s.io/client-go/util/retry" 18 "sigs.k8s.io/controller-runtime/pkg/builder" 19 "sigs.k8s.io/controller-runtime/pkg/client" 20 "sigs.k8s.io/controller-runtime/pkg/handler" 21 logf "sigs.k8s.io/controller-runtime/pkg/log" 22 "sigs.k8s.io/controller-runtime/pkg/manager" 23 "sigs.k8s.io/controller-runtime/pkg/reconcile" 24 25 "github.com/percona/percona-xtradb-cluster-operator/clientcmd" 26 api "github.com/percona/percona-xtradb-cluster-operator/pkg/apis/pxc/v1" 27 "github.com/percona/percona-xtradb-cluster-operator/pkg/pxc/app" 28 "github.com/percona/percona-xtradb-cluster-operator/pkg/pxc/app/statefulset" 29 "github.com/percona/percona-xtradb-cluster-operator/pkg/pxc/backup" 30 "github.com/percona/percona-xtradb-cluster-operator/pkg/pxc/backup/storage" 31 "github.com/percona/percona-xtradb-cluster-operator/version" 32 ) 33 34 // Add creates a new PerconaXtraDBClusterRestore Controller and adds it to the Manager. The Manager will set fields on the Controller 35 // and Start it when the Manager is Started. 36 func Add(mgr manager.Manager) error { 37 r, err := newReconciler(mgr) 38 if err != nil { 39 return err 40 } 41 return add(mgr, r) 42 } 43 44 // newReconciler returns a new reconcile.Reconciler 45 func newReconciler(mgr manager.Manager) (reconcile.Reconciler, error) { 46 sv, err := version.Server() 47 if err != nil { 48 return nil, fmt.Errorf("get version: %v", err) 49 } 50 51 cli, err := clientcmd.NewClient() 52 if err != nil { 53 return nil, errors.Wrap(err, "create clientcmd") 54 } 55 56 return &ReconcilePerconaXtraDBClusterRestore{ 57 client: mgr.GetClient(), 58 clientcmd: cli, 59 scheme: mgr.GetScheme(), 60 serverVersion: sv, 61 newStorageClientFunc: storage.NewClient, 62 }, nil 63 } 64 65 // add adds a new Controller to mgr with r as the reconcile.Reconciler 66 func add(mgr manager.Manager, r reconcile.Reconciler) error { 67 return builder.ControllerManagedBy(mgr). 68 Named("pxcrestore-controller"). 69 Watches(&api.PerconaXtraDBClusterRestore{}, &handler.EnqueueRequestForObject{}). 70 Complete(r) 71 } 72 73 var _ reconcile.Reconciler = &ReconcilePerconaXtraDBClusterRestore{} 74 75 // ReconcilePerconaXtraDBClusterRestore reconciles a PerconaXtraDBClusterRestore object 76 type ReconcilePerconaXtraDBClusterRestore struct { 77 // This client, initialized using mgr.Client() above, is a split client 78 // that reads objects from the cache and writes to the apiserver 79 client client.Client 80 clientcmd *clientcmd.Client 81 scheme *runtime.Scheme 82 83 serverVersion *version.ServerVersion 84 85 newStorageClientFunc storage.NewClientFunc 86 } 87 88 // Reconcile reads that state of the cluster for a PerconaXtraDBClusterRestore object and makes changes based on the state read 89 // and what is in the PerconaXtraDBClusterRestore.Spec 90 // Note: 91 // The Controller will requeue the Request to be processed again if the returned error is non-nil or 92 // Result.Requeue is true, otherwise upon completion it will remove the work from the queue. 93 func (r *ReconcilePerconaXtraDBClusterRestore) Reconcile(ctx context.Context, request reconcile.Request) (reconcile.Result, error) { 94 log := logf.FromContext(ctx) 95 96 rr := reconcile.Result{} 97 98 cr := &api.PerconaXtraDBClusterRestore{} 99 err := r.client.Get(context.TODO(), request.NamespacedName, cr) 100 if err != nil { 101 if k8serrors.IsNotFound(err) { 102 // Request object not found, could have been deleted after reconcile request. 103 return rr, nil 104 } 105 // Error reading the object - requeue the request. 106 return rr, err 107 } 108 if cr.Status.State != api.RestoreNew { 109 return rr, nil 110 } 111 112 log.Info("backup restore request") 113 114 err = r.setStatus(cr, api.RestoreStarting, "") 115 if err != nil { 116 return rr, errors.Wrap(err, "set status") 117 } 118 rJobsList := &api.PerconaXtraDBClusterRestoreList{} 119 err = r.client.List( 120 context.TODO(), 121 rJobsList, 122 &client.ListOptions{ 123 Namespace: cr.Namespace, 124 }, 125 ) 126 if err != nil { 127 return rr, errors.Wrap(err, "get restore jobs list") 128 } 129 130 returnMsg := fmt.Sprintf(backupRestoredMsg, cr.Name, cr.Spec.PXCCluster, cr.Name) 131 132 defer func() { 133 status := api.BcpRestoreStates(api.RestoreSucceeded) 134 if err != nil { 135 status = api.RestoreFailed 136 returnMsg = err.Error() 137 } 138 err := r.setStatus(cr, status, returnMsg) 139 if err != nil { 140 return 141 } 142 }() 143 144 for _, j := range rJobsList.Items { 145 if j.Spec.PXCCluster == cr.Spec.PXCCluster && 146 j.Name != cr.Name && j.Status.State != api.RestoreFailed && 147 j.Status.State != api.RestoreSucceeded { 148 err = errors.Errorf("unable to continue, concurent restore job %s running now.", j.Name) 149 return rr, err 150 } 151 } 152 153 err = cr.CheckNsetDefaults() 154 if err != nil { 155 return rr, err 156 } 157 158 cluster := new(api.PerconaXtraDBCluster) 159 err = r.client.Get(context.TODO(), types.NamespacedName{Name: cr.Spec.PXCCluster, Namespace: cr.Namespace}, cluster) 160 if err != nil { 161 err = errors.Wrapf(err, "get cluster %s", cr.Spec.PXCCluster) 162 return rr, err 163 } 164 clusterOrig := cluster.DeepCopy() 165 166 err = cluster.CheckNSetDefaults(r.serverVersion, log) 167 if err != nil { 168 return reconcile.Result{}, fmt.Errorf("wrong PXC options: %v", err) 169 } 170 171 err = backup.CheckPITRErrors(ctx, r.client, r.clientcmd, cluster) 172 if err != nil { 173 return reconcile.Result{}, err 174 } 175 176 bcp, err := r.getBackup(ctx, cr) 177 if err != nil { 178 return rr, errors.Wrap(err, "get backup") 179 } 180 181 annotations := cr.GetAnnotations() 182 _, unsafePITR := annotations[api.AnnotationUnsafePITR] 183 cond := meta.FindStatusCondition(bcp.Status.Conditions, api.BackupConditionPITRReady) 184 if cond != nil && cond.Status == metav1.ConditionFalse && !unsafePITR { 185 msg := fmt.Sprintf("Backup doesn't guarantee consistent recovery with PITR. Annotate PerconaXtraDBClusterRestore with %s to force it.", api.AnnotationUnsafePITR) 186 err = errors.New(msg) 187 return reconcile.Result{}, nil 188 } 189 190 err = r.validate(ctx, cr, bcp, cluster) 191 if err != nil { 192 err = errors.Wrap(err, "failed to validate restore job") 193 return rr, err 194 } 195 196 log.Info("stopping cluster", "cluster", cr.Spec.PXCCluster) 197 err = r.setStatus(cr, api.RestoreStopCluster, "") 198 if err != nil { 199 err = errors.Wrap(err, "set status") 200 return rr, err 201 } 202 err = r.stopCluster(cluster.DeepCopy()) 203 if err != nil { 204 err = errors.Wrapf(err, "stop cluster %s", cluster.Name) 205 return rr, err 206 } 207 208 log.Info("starting restore", "cluster", cr.Spec.PXCCluster, "backup", cr.Spec.BackupName) 209 err = r.setStatus(cr, api.RestoreRestore, "") 210 if err != nil { 211 err = errors.Wrap(err, "set status") 212 return rr, err 213 } 214 215 err = r.restore(ctx, cr, bcp, cluster) 216 if err != nil { 217 err = errors.Wrap(err, "run restore") 218 return rr, err 219 } 220 221 log.Info("starting cluster", "cluster", cr.Spec.PXCCluster) 222 err = r.setStatus(cr, api.RestoreStartCluster, "") 223 if err != nil { 224 err = errors.Wrap(err, "set status") 225 return rr, err 226 } 227 228 if cr.Spec.PITR != nil { 229 oldSize := cluster.Spec.PXC.Size 230 oldUnsafe := cluster.Spec.AllowUnsafeConfig 231 cluster.Spec.PXC.Size = 1 232 cluster.Spec.AllowUnsafeConfig = true 233 234 if err := r.startCluster(cluster); err != nil { 235 return rr, errors.Wrap(err, "restart cluster for pitr") 236 } 237 238 log.Info("point-in-time recovering", "cluster", cr.Spec.PXCCluster) 239 err = r.setStatus(cr, api.RestorePITR, "") 240 if err != nil { 241 return rr, errors.Wrap(err, "set status") 242 } 243 244 err = r.pitr(ctx, cr, bcp, cluster) 245 if err != nil { 246 return rr, errors.Wrap(err, "run pitr") 247 } 248 249 cluster.Spec.PXC.Size = oldSize 250 cluster.Spec.AllowUnsafeConfig = oldUnsafe 251 252 log.Info("starting cluster", "cluster", cr.Spec.PXCCluster) 253 err = r.setStatus(cr, api.RestoreStartCluster, "") 254 if err != nil { 255 err = errors.Wrap(err, "set status") 256 return rr, err 257 } 258 } 259 260 err = r.startCluster(clusterOrig) 261 if err != nil { 262 err = errors.Wrap(err, "restart cluster") 263 return rr, err 264 } 265 266 log.Info(returnMsg) 267 268 return rr, err 269 } 270 271 func (r *ReconcilePerconaXtraDBClusterRestore) getBackup(ctx context.Context, cr *api.PerconaXtraDBClusterRestore) (*api.PerconaXtraDBClusterBackup, error) { 272 if cr.Spec.BackupSource != nil { 273 status := cr.Spec.BackupSource.DeepCopy() 274 status.State = api.BackupSucceeded 275 status.CompletedAt = nil 276 status.LastScheduled = nil 277 return &api.PerconaXtraDBClusterBackup{ 278 ObjectMeta: metav1.ObjectMeta{ 279 Name: cr.Name, 280 Namespace: cr.Namespace, 281 }, 282 Spec: api.PXCBackupSpec{ 283 PXCCluster: cr.Spec.PXCCluster, 284 StorageName: cr.Spec.BackupSource.StorageName, 285 }, 286 Status: *status, 287 }, nil 288 } 289 290 bcp := &api.PerconaXtraDBClusterBackup{} 291 err := r.client.Get(ctx, types.NamespacedName{Name: cr.Spec.BackupName, Namespace: cr.Namespace}, bcp) 292 if err != nil { 293 err = errors.Wrapf(err, "get backup %s", cr.Spec.BackupName) 294 return bcp, err 295 } 296 if bcp.Status.State != api.BackupSucceeded { 297 err = errors.Errorf("backup %s didn't finished yet, current state: %s", bcp.Name, bcp.Status.State) 298 return bcp, err 299 } 300 301 return bcp, nil 302 } 303 304 const backupRestoredMsg = `You can view xtrabackup log: 305 $ kubectl logs job/restore-job-%s-%s 306 If everything is fine, you can cleanup the job: 307 $ kubectl delete pxc-restore/%s 308 ` 309 310 func (r *ReconcilePerconaXtraDBClusterRestore) stopCluster(c *api.PerconaXtraDBCluster) error { 311 var gracePeriodSec int64 312 313 if c.Spec.PXC != nil && c.Spec.PXC.TerminationGracePeriodSeconds != nil { 314 gracePeriodSec = int64(c.Spec.PXC.Size) * *c.Spec.PXC.TerminationGracePeriodSeconds 315 } 316 317 patch := client.MergeFrom(c.DeepCopy()) 318 c.Spec.Pause = true 319 err := r.client.Patch(context.TODO(), c, patch) 320 if err != nil { 321 return errors.Wrap(err, "shutdown pods") 322 } 323 324 ls := statefulset.NewNode(c).Labels() 325 err = r.waitForPodsShutdown(ls, c.Namespace, gracePeriodSec) 326 if err != nil { 327 return errors.Wrap(err, "shutdown pods") 328 } 329 330 pvcs := corev1.PersistentVolumeClaimList{} 331 err = r.client.List( 332 context.TODO(), 333 &pvcs, 334 &client.ListOptions{ 335 Namespace: c.Namespace, 336 LabelSelector: labels.SelectorFromSet(ls), 337 }, 338 ) 339 if err != nil { 340 return errors.Wrap(err, "get pvc list") 341 } 342 343 pxcNode := statefulset.NewNode(c) 344 pvcNameTemplate := app.DataVolumeName + "-" + pxcNode.StatefulSet().Name 345 for _, pvc := range pvcs.Items { 346 // check prefix just in case, to be sure we're not going to delete a wrong pvc 347 if pvc.Name == pvcNameTemplate+"-0" || !strings.HasPrefix(pvc.Name, pvcNameTemplate) { 348 continue 349 } 350 351 err = r.client.Delete(context.TODO(), &pvc) 352 if err != nil { 353 return errors.Wrap(err, "delete pvc") 354 } 355 } 356 357 err = r.waitForPVCShutdown(ls, c.Namespace) 358 if err != nil { 359 return errors.Wrap(err, "shutdown pvc") 360 } 361 362 return nil 363 } 364 365 func (r *ReconcilePerconaXtraDBClusterRestore) startCluster(cr *api.PerconaXtraDBCluster) (err error) { 366 // tryin several times just to avoid possible conflicts with the main controller 367 err = k8sretry.RetryOnConflict(k8sretry.DefaultRetry, func() error { 368 // need to get the object with latest version of meta-data for update 369 current := &api.PerconaXtraDBCluster{} 370 rerr := r.client.Get(context.TODO(), types.NamespacedName{Name: cr.Name, Namespace: cr.Namespace}, current) 371 if rerr != nil { 372 return errors.Wrap(err, "get cluster") 373 } 374 current.Spec = cr.Spec 375 return r.client.Update(context.TODO(), current) 376 }) 377 if err != nil { 378 return errors.Wrap(err, "update cluster") 379 } 380 381 // give time for process new state 382 time.Sleep(10 * time.Second) 383 384 var waitLimit int32 = 2 * 60 * 60 // 2 hours 385 if cr.Spec.PXC.LivenessInitialDelaySeconds != nil { 386 waitLimit = *cr.Spec.PXC.LivenessInitialDelaySeconds * cr.Spec.PXC.Size 387 } 388 389 for i := int32(0); i < waitLimit; i++ { 390 current := &api.PerconaXtraDBCluster{} 391 err = r.client.Get(context.TODO(), types.NamespacedName{Name: cr.Name, Namespace: cr.Namespace}, current) 392 if err != nil { 393 return errors.Wrap(err, "get cluster") 394 } 395 if current.Status.ObservedGeneration == current.Generation && current.Status.PXC.Status == api.AppStateReady { 396 return nil 397 } 398 time.Sleep(time.Second * 1) 399 } 400 401 return errors.Errorf("exceeded wait limit") 402 } 403 404 const waitLimitSec int64 = 300 405 406 func (r *ReconcilePerconaXtraDBClusterRestore) waitForPodsShutdown(ls map[string]string, namespace string, gracePeriodSec int64) error { 407 for i := int64(0); i < waitLimitSec+gracePeriodSec; i++ { 408 pods := corev1.PodList{} 409 410 err := r.client.List( 411 context.TODO(), 412 &pods, 413 &client.ListOptions{ 414 Namespace: namespace, 415 LabelSelector: labels.SelectorFromSet(ls), 416 }, 417 ) 418 if err != nil { 419 return errors.Wrap(err, "get pods list") 420 } 421 422 if len(pods.Items) == 0 { 423 return nil 424 } 425 426 time.Sleep(time.Second * 1) 427 } 428 429 return errors.Errorf("exceeded wait limit") 430 } 431 432 func (r *ReconcilePerconaXtraDBClusterRestore) waitForPVCShutdown(ls map[string]string, namespace string) error { 433 for i := int64(0); i < waitLimitSec; i++ { 434 pvcs := corev1.PersistentVolumeClaimList{} 435 436 err := r.client.List( 437 context.TODO(), 438 &pvcs, 439 &client.ListOptions{ 440 Namespace: namespace, 441 LabelSelector: labels.SelectorFromSet(ls), 442 }, 443 ) 444 if err != nil { 445 return errors.Wrap(err, "get pvc list") 446 } 447 448 if len(pvcs.Items) == 1 { 449 return nil 450 } 451 452 time.Sleep(time.Second * 1) 453 } 454 455 return errors.Errorf("exceeded wait limit") 456 } 457 458 func (r *ReconcilePerconaXtraDBClusterRestore) setStatus(cr *api.PerconaXtraDBClusterRestore, state api.BcpRestoreStates, comments string) error { 459 cr.Status.State = state 460 switch state { 461 case api.RestoreSucceeded: 462 tm := metav1.NewTime(time.Now()) 463 cr.Status.CompletedAt = &tm 464 } 465 466 cr.Status.Comments = comments 467 468 err := r.client.Status().Update(context.TODO(), cr) 469 if err != nil { 470 return errors.Wrap(err, "send update") 471 } 472 473 return nil 474 }