github.com/IBM-Blockchain/fabric-operator@v1.0.4/pkg/action/upgradedbs.go

/*
 * Copyright contributors to the Hyperledger Fabric Operator project
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * 	  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package action

import (
	"context"
	"fmt"
	"path/filepath"
	"time"

	"github.com/pkg/errors"

	current "github.com/IBM-Blockchain/fabric-operator/api/v1beta1"
	oconfig "github.com/IBM-Blockchain/fabric-operator/operatorconfig"
	"github.com/IBM-Blockchain/fabric-operator/pkg/initializer/common/config"
	"github.com/IBM-Blockchain/fabric-operator/pkg/k8s/controllerclient"
	controller "github.com/IBM-Blockchain/fabric-operator/pkg/k8s/controllerclient"
	"github.com/IBM-Blockchain/fabric-operator/pkg/manager/resources/container"
	"github.com/IBM-Blockchain/fabric-operator/pkg/manager/resources/deployment"
	jobv1 "github.com/IBM-Blockchain/fabric-operator/pkg/manager/resources/job"
	"github.com/IBM-Blockchain/fabric-operator/pkg/util"
	"github.com/IBM-Blockchain/fabric-operator/pkg/util/image"

	appsv1 "k8s.io/api/apps/v1"
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/util/wait"

	k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
)

//go:generate counterfeiter -o mocks/deploymentreset.go -fake-name DeploymentReset . DeploymentReset

// DeploymentReset defines the contract to manage the deployment resource
type DeploymentReset interface {
	Get(v1.Object) (k8sclient.Object, error)
	DeploymentStatus(v1.Object) (appsv1.DeploymentStatus, error)
	GetScheme() *runtime.Scheme
}

//go:generate counterfeiter -o mocks/upgradeinstance.go -fake-name UpgradeInstance . UpgradeInstance

// UpgradeInstance defines the contract to update the instance database
type UpgradeInstance interface {
	runtime.Object
	v1.Object
	UsingCouchDB() bool
	UsingHSMProxy() bool
	IsHSMEnabled() bool
}

// UpgradeDBs updates the database and performs all necessary cleanup and restart logic.
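// It scales the peer deployment down to zero, waits for the peer pod to be deleted,
// optionally starts a standalone CouchDB pod, runs a migration job that executes
// 'peer node upgrade-dbs', and then restores the original replica count.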
func UpgradeDBs(deploymentManager DeploymentReset, client controller.Client, instance UpgradeInstance, timeouts oconfig.DBMigrationTimeouts) error {
	obj, err := deploymentManager.Get(instance)
	if err != nil {
		return errors.Wrap(err, "failed to get deployment")
	}

	dep := deployment.New(obj.(*appsv1.Deployment))
	originalReplicas := dep.Spec.Replicas

	// Need to set replicas to 0, otherwise the migration job won't be able to start due to
	// the volume being attached to another node.
	//
	// Wait for the deployment to be marked as unavailable after replicas are updated to 0
	if err := setReplicaCountAndWait(client, deploymentManager, instance, int32(0), timeouts.ReplicaChange.Get()); err != nil {
		return errors.Wrapf(err, "failed to update deployment for '%s'", instance.GetName())
	}

	if err := waitForPodToDelete(client, instance, timeouts.PodDeletion.Get()); err != nil {
		return err
	}

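	// If the peer uses CouchDB, bring up a standalone CouchDB pod (reusing the
	// deployment's couchdb container and volumes) so the migration job can reach
	// the state database at the pod's IP while the deployment is scaled down.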
	var ip string
	if instance.UsingCouchDB() {
		couchDBPod := getCouchDBPod(dep)
		if err := startCouchDBPod(client, couchDBPod); err != nil {
			return err
		}

		ip, err = waitForPodToBeRunning(client, couchDBPod, timeouts.PodStart.Get())
		if err != nil {
			return errors.Wrap(err, "couchdb pod failed to start")
		}
	}

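	// When HSM is enabled without an HSM proxy, read the HSM configuration so the
	// migration job can be wired up with the PKCS#11 library, init container, and
	// (optionally) the HSM daemon sidecar further below.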
	var hsmConfig *config.HSMConfig
	if !instance.UsingHSMProxy() && instance.IsHSMEnabled() {
		hsmConfig, err = config.ReadHSMConfig(client, instance)
		if err != nil {
			return err
		}
	}

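	// Build the migration job from a copy of the peer container and create it with
	// the instance set as owner via the create option.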
	job := peerDBMigrationJob(dep, instance.(*current.IBPPeer), hsmConfig, ip, timeouts)
	createOpt := controllerclient.CreateOption{
		Owner:  instance,
		Scheme: deploymentManager.GetScheme(),
	}
	if err := StartJob(client, job.Job, createOpt); err != nil {
		if instance.UsingCouchDB() {
			log.Info("failed to start db migration job, deleting couchdb pod")
			couchDBPod := &corev1.Pod{
				ObjectMeta: v1.ObjectMeta{
					Name:      fmt.Sprintf("%s-couchdb", instance.GetName()),
					Namespace: instance.GetNamespace(),
				},
			}

			if err := client.Delete(context.TODO(), couchDBPod); err != nil {
				return errors.Wrap(err, "failed to delete couchdb pod")
			}
		}
		return errors.Wrap(err, "failed to start db migration job")
	}
	log.Info(fmt.Sprintf("Job '%s' created", job.GetName()))

	// Wait for job to start and pod to go into running state before reverting
	// back to original replica value
	if err := job.WaitUntilActive(client); err != nil {
		return err
	}
	log.Info(fmt.Sprintf("Job '%s' active", job.GetName()))

	if err := job.WaitUntilContainerFinished(client, "dbmigration"); err != nil {
		return err
	}
	log.Info(fmt.Sprintf("Job '%s' finished", job.GetName()))

	// Wait for deployment to get marked as available after replica update
	if err := setReplicaCountAndWait(client, deploymentManager, instance, *originalReplicas, timeouts.ReplicaChange.Get()); err != nil {
		return errors.Wrapf(err, "failed to update deployment for '%s'", instance.GetName())
	}

	return nil
}

// StartJob uses the client to create a job on the Kubernetes cluster
func StartJob(client controller.Client, job *batchv1.Job, opt controller.CreateOption) error {
	log.Info(fmt.Sprintf("Starting job '%s'", job.GetName()))

	if err := client.Create(context.TODO(), job, opt); err != nil {
		return errors.Wrap(err, "failed to create migration job")
	}

	return nil
}

func startCouchDBPod(client controller.Client, pod *corev1.Pod) error {
	log.Info(fmt.Sprintf("Starting couchdb pod '%s'", pod.GetName()))

	if err := client.Create(context.TODO(), pod); err != nil {
		return errors.Wrap(err, "failed to create couchdb pod")
	}

	return nil
}

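// getCouchDBPod builds a standalone pod that reuses the deployment's couchdb
// container and volumes, minus the peer's ledger volume (fabric-peer-0), so the
// state database can run on its own while the peer deployment is scaled down.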
func getCouchDBPod(dep *deployment.Deployment) *corev1.Pod {
	couchdb := dep.MustGetContainer("couchdb")

	localSpecCopy := dep.Spec.Template.Spec.DeepCopy()
	volumes := localSpecCopy.Volumes
	// Remove ledgerdb volume from couchdb pod
	for i, volume := range volumes {
		if volume.Name == "fabric-peer-0" {
			// Remove the ledgerdb data from the couchdb container
			volumes[i] = volumes[len(volumes)-1]
			volumes = volumes[:len(volumes)-1]
			break
		}
	}
	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("%s-couchdb", dep.GetName()),
			Namespace: dep.GetNamespace(),
			Labels: map[string]string{
				"app": dep.Name,
			},
		},
		Spec: corev1.PodSpec{
			ImagePullSecrets: dep.Spec.Template.Spec.ImagePullSecrets,
			RestartPolicy:    corev1.RestartPolicyNever,
			Containers: []corev1.Container{
				*couchdb.Container,
			},
			Volumes: volumes,
		},
	}
}

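// waitForPodToDelete polls every 2 seconds until no pods labeled app=<instance name>
// remain, or the timeout is reached.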
func waitForPodToDelete(client controller.Client, instance metav1.Object, timeout time.Duration) error {
	err := wait.Poll(2*time.Second, timeout, func() (bool, error) {
		log.Info(fmt.Sprintf("Waiting for pod for deployment '%s' to be deleted", instance.GetName()))

		labelSelector, err := labels.Parse(fmt.Sprintf("app=%s", instance.GetName()))
		if err != nil {
			return false, nil
		}

		opts := &k8sclient.ListOptions{
			LabelSelector: labelSelector,
		}

		pods := &corev1.PodList{}
		if err := client.List(context.TODO(), pods, opts); err != nil {
			return false, nil
		}

		if len(pods.Items) == 0 {
			return true, nil
		}

		return false, nil
	})
	if err != nil {
		return errors.Wrapf(err, "failed to delete pod associated with '%s'", instance.GetName())
	}
	return nil
}

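// waitForPodToBeRunning polls until exactly one pod matching the given pod's
// app label is running and ready, then returns that pod's IP.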
func waitForPodToBeRunning(client controller.Client, pod *corev1.Pod, timeout time.Duration) (string, error) {
	var podIP string
	p := &corev1.Pod{}

	err := wait.Poll(2*time.Second, timeout, func() (bool, error) {
		log.Info(fmt.Sprintf("Waiting for couchdb pod '%s' to be running", pod.GetName()))

		label := fmt.Sprintf("app=%s", pod.Labels["app"])
		labelSelector, err := labels.Parse(label)
		if err != nil {
			return false, err
		}

		opts := &k8sclient.ListOptions{
			LabelSelector: labelSelector,
		}

		pods := &corev1.PodList{}
		if err := client.List(context.TODO(), pods, opts); err != nil {
			return false, err
		}

		if len(pods.Items) != 1 {
			return false, nil
		}

		p = &pods.Items[0]
		if len(p.Status.ContainerStatuses) > 0 && p.Status.ContainerStatuses[0].State.Running != nil {
			if p.Status.ContainerStatuses[0].Ready {
				return true, nil
			}
		}

		return false, nil
	})
	if err != nil {
		return podIP, errors.Wrapf(err, "pod '%s' not running", pod.GetName())
	}

	if p != nil {
		podIP = p.Status.PodIP
	}

	return podIP, nil
}

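// setReplicaCountAndWait patches the deployment's replica count and then polls
// the deployment status until the observed number of replicas matches the
// requested count, or the timeout is reached.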
func setReplicaCountAndWait(client controller.Client, deploymentManager DeploymentReset, instance metav1.Object, count int32, timeout time.Duration) error {
	obj, err := deploymentManager.Get(instance)
	if err != nil {
		return errors.Wrap(err, "failed to get deployment")
	}
	dep := deployment.New(obj.DeepCopyObject().(*appsv1.Deployment))

	if err := setReplicaCountOnDeployment(client, obj, dep, count); err != nil {
		return err
	}

	err = wait.Poll(2*time.Second, timeout, func() (bool, error) {
		log.Info(fmt.Sprintf("Waiting for deployment '%s' replicas to go to %d", dep.GetName(), count))
		status, err := deploymentManager.DeploymentStatus(instance)
		if err == nil {
			if status.Replicas == count {
				return true, nil
			}
		}
		return false, nil
	})
	if err != nil {
		return errors.Wrap(err, "failed to determine if deployment is available")
	}

	return nil
}

func setReplicaCountOnDeployment(client controller.Client, obj k8sclient.Object, dep *deployment.Deployment, count int32) error {
	dep.Deployment.Spec.Replicas = &count
	if err := client.Patch(context.TODO(), dep.Deployment, k8sclient.MergeFrom(obj)); err != nil {
		return errors.Wrapf(err, "failed to update replica to %d", count)
	}
	return nil
}

// peerDBMigrationJob builds the db migration job from a copy of the peer container,
// with the command replaced to run the database upgrade
func peerDBMigrationJob(dep *deployment.Deployment, instance *current.IBPPeer, hsmConfig *config.HSMConfig, couchdbIP string, timeouts oconfig.DBMigrationTimeouts) *jobv1.Job {
	cont := dep.MustGetContainer("peer")
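	// Carry over the peer's MSP, TLS, filesystem, and state database settings so
	// the migration container runs against the same configuration as the peer.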
	envs := []string{
		"LICENSE",
		"FABRIC_CFG_PATH",
		"CORE_PEER_MSPCONFIGPATH",
		"CORE_PEER_FILESYSTEMPATH",
		"CORE_PEER_TLS_ENABLED",
		"CORE_PEER_TLS_CERT_FILE",
		"CORE_PEER_TLS_KEY_FILE",
		"CORE_PEER_TLS_ROOTCERT_FILE",
		"CORE_PEER_LOCALMSPID",
		"CORE_LEDGER_STATE_COUCHDBCONFIG_USERNAME",
		"CORE_LEDGER_STATE_COUCHDBCONFIG_PASSWORD",
		"CORE_LEDGER_STATE_STATEDATABASE",
	}

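	// A backoff limit of 0 means a failed migration job is not retried automatically.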
	backoffLimit := int32(0)
	envVars := cont.GetEnvs(envs)
	envVars = append(envVars,
		corev1.EnvVar{
			Name:  "FABRIC_LOGGING_SPEC",
			Value: "debug",
		},
	)

	if couchdbIP != "" {
		envVars = append(envVars,
			corev1.EnvVar{
				Name:  "CORE_LEDGER_STATE_COUCHDBCONFIG_COUCHDBADDRESS",
				Value: fmt.Sprintf("%s:5984", couchdbIP),
			},
		)
	}

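	// The job runs 'peer node upgrade-dbs' to upgrade the peer's databases and then
	// touches a timestamped status file so the migration can be detected as complete.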
	command := `echo "Migrating peer's database" && peer node upgrade-dbs && mkdir -p /data/status && ts=$(date +%Y%m%d-%H%M%S) && touch /data/status/migrated_to_v2-$ts`

	if instance.UsingHSMProxy() {
		envVars = append(envVars,
			corev1.EnvVar{
				Name:  "PKCS11_PROXY_SOCKET",
				Value: instance.Spec.HSM.PKCS11Endpoint,
			},
		)
	}

	localSpecCopy := dep.Spec.Template.Spec.DeepCopy()
	volumes := localSpecCopy.Volumes

	if instance.UsingCouchDB() {
		// Remove statedb volume from migration pod
		for i, volume := range volumes {
			if volume.Name == "db-data" {
				// Remove the statedb data volume from the migration pod
				volumes[i] = volumes[len(volumes)-1]
				volumes = volumes[:len(volumes)-1]
				break
			}
		}
	}

	k8sJob := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("%s-dbmigration", instance.GetName()),
			Namespace: dep.GetNamespace(),
			Labels: map[string]string{
				"job-name": fmt.Sprintf("%s-dbmigration", instance.GetName()),
				"owner":    instance.GetName(),
			},
		},
		Spec: batchv1.JobSpec{
			BackoffLimit: &backoffLimit,
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					ImagePullSecrets: dep.Spec.Template.Spec.ImagePullSecrets,
					RestartPolicy:    corev1.RestartPolicyNever,
					Containers: []corev1.Container{
						{
							Name:            "dbmigration",
							Image:           image.Format(instance.Spec.Images.PeerImage, instance.Spec.Images.PeerTag),
							ImagePullPolicy: cont.ImagePullPolicy,
							Command: []string{
								"sh",
								"-c",
								command,
							},
							Env:             envVars,
							Resources:       cont.Resources,
							SecurityContext: cont.SecurityContext,
							VolumeMounts:    cont.VolumeMounts,
						},
					},
					Volumes: volumes,
				},
			},
		},
	}

	job := jobv1.New(k8sJob, &jobv1.Timeouts{
		WaitUntilActive:   timeouts.JobStart.Get(),
		WaitUntilFinished: timeouts.JobCompletion.Get(),
	})

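	// When an HSM is configured without a proxy, add the HSM environment variables,
	// a shared in-memory volume, an init container that copies the PKCS#11 library,
	// and, if required, the HSM daemon sidecar.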
	if hsmConfig != nil {
		migrationCont := job.MustGetContainer("dbmigration")
		migrationCont.Env = append(migrationCont.Env, hsmConfig.Envs...)

		volume := corev1.Volume{
			Name: "shared",
			VolumeSource: corev1.VolumeSource{
				EmptyDir: &corev1.EmptyDirVolumeSource{
					Medium: corev1.StorageMediumMemory,
				},
			},
		}
		job.Spec.Template.Spec.Volumes = util.AppendVolumeIfMissing(job.Spec.Template.Spec.Volumes, volume)

		initCont := HSMInitContainer(instance, hsmConfig)
		job.Spec.Template.Spec.InitContainers = append(job.Spec.Template.Spec.InitContainers, *initCont.Container)

		if hsmConfig.Daemon != nil {
			// Unable to launch daemon if not running in privileged mode
			t := true
			migrationCont.SecurityContext.Privileged = &t
			migrationCont.SecurityContext.AllowPrivilegeEscalation = &t

			// This is the shared volume where the file 'pkcsslotd-launched' is touched to let
			// other containers know that the daemon has successfully launched.
			migrationCont.AppendVolumeMountIfMissing("shared", "/shared")

			// Update the command to ensure that the daemon is running before starting the migration
			migrationCont.Command = []string{
				"sh",
				"-c",
				fmt.Sprintf("%s && %s", config.DAEMON_CHECK_CMD, command),
			}

			var pvcMount *corev1.VolumeMount
			for _, vm := range hsmConfig.MountPaths {
				if vm.UsePVC {
					pvcMount = &corev1.VolumeMount{
						Name:      "fabric-peer-0",
						MountPath: vm.MountPath,
					}
				}
			}

			// Add daemon container to the job
			config.AddDaemonContainer(hsmConfig, job, instance.GetResource(current.HSMDAEMON), pvcMount)

			// If a pvc mount has been configured in HSM config, set the volume mount on the migration container
			if pvcMount != nil {
				migrationCont.AppendVolumeMountIfMissing(pvcMount.Name, pvcMount.MountPath)
			}
		}
	}

	return job
}

// HSMInitContainer creates a container that copies the HSM library to a shared volume
func HSMInitContainer(instance *current.IBPPeer, hsmConfig *config.HSMConfig) *container.Container {
	hsmLibraryPath := hsmConfig.Library.FilePath
	hsmLibraryName := filepath.Base(hsmLibraryPath)

	f := false
	user := int64(0)
	mountPath := "/shared"
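	// The init container copies the PKCS#11 library from the HSM client image into
	// the shared emptyDir volume so the migration container can load it at runtime.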
	initCont := &container.Container{
		Container: &corev1.Container{
			Name:            "hsm-client",
			Image:           image.Format(instance.Spec.Images.HSMImage, instance.Spec.Images.HSMTag),
			ImagePullPolicy: corev1.PullAlways,
			Command: []string{
				"sh",
				"-c",
				fmt.Sprintf("mkdir -p %s/hsm && dst=\"%s/hsm/%s\" && echo \"Copying %s to ${dst}\" && mkdir -p $(dirname $dst) && cp -r %s $dst", mountPath, mountPath, hsmLibraryName, hsmLibraryPath, hsmLibraryPath),
			},
			SecurityContext: &corev1.SecurityContext{
				RunAsUser:    &user,
				RunAsNonRoot: &f,
			},
			VolumeMounts: []corev1.VolumeMount{
				{
					Name:      "shared",
					MountPath: mountPath,
				},
			},
			Resources: corev1.ResourceRequirements{
				Requests: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("0.1"),
					corev1.ResourceMemory: resource.MustParse("100Mi"),
				},
				Limits: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("1"),
					corev1.ResourceMemory: resource.MustParse("500Mi"),
				},
			},
		},
	}

	return initCont
}