github.com/jenkins-x/jx/v2@v2.1.155/pkg/kube/build_lock.go

package kube

import (
	"fmt"
	"os"
	"strconv"
	"time"

	"github.com/jenkins-x/jx-logging/pkg/log"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/kubernetes"
)

// Labels required for a configmap to be considered a build lock. Configmaps without them are ignored.
var buildLockLabels = map[string]string{
	"jenkins-x.io/kind": "build-lock",
}
var buildLockExpires = time.Hour
var buildLockPhaseRunning = map[v1.PodPhase]bool{
	v1.PodPending: true,
	v1.PodRunning: true,
	v1.PodUnknown: true,
}

// DisableBuildLockEnvKey is the environment variable used to disable the build lock in jx step helm apply
const DisableBuildLockEnvKey = "JX_DISABLE_BUILD_LOCK"
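
// A caller can skip locking entirely when this variable is set; a minimal
// sketch (the exact value that disables the lock is decided by the caller,
// e.g. jx step helm apply, so treat this as illustrative only):
//
//	if os.Getenv(kube.DisableBuildLockEnvKey) == "true" {
//		return nil // build lock disabled, proceed without coordination
//	}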

// AcquireBuildLock acquires a build lock, to prevent other builds from
// editing the same namespace while a deployment is already running. Other
// builds can negotiate which one should run next by editing the lock's data.
// Returns a function to release the lock (to be called in a defer)
// Returns an error if a newer build is already running, or if an error happened
func AcquireBuildLock(kubeClient kubernetes.Interface, devNamespace, namespace string) (func() error, error) {
	// Only lock if running in Tekton
	if ok, err := IsTektonEnabled(kubeClient, devNamespace); err != nil {
		log.Logger().Warnf("error while looking for Tekton: %s\n", err.Error())
		return nil, err
	} else if !ok {
		log.Logger().Debugf("lock skipped because not running in Tekton")
		return func() error { return nil }, nil
	}
	// Create the lock object
	lock, err := makeBuildLock(kubeClient, devNamespace, namespace)
	if err != nil {
		return nil, err
	}
	// this loop continuously tries to create the lock
Create:
	for {
		// no pod to follow, set an expiration date
		if len(lock.OwnerReferences) == 0 {
			expires := time.Now().UTC().Add(buildLockExpires).Format(time.RFC3339)
			lock.Annotations["expires"] = expires
			lock.Data["expires"] = expires
		}
		log.Logger().Infof("creating the lock configmap %s", lock.Name)
		// create the lock
		new, err := kubeClient.CoreV1().ConfigMaps(devNamespace).Create(lock)
		if err != nil {
			status, ok := err.(*errors.StatusError)
			// an error while creating the lock
			if !ok || status.Status().Reason != metav1.StatusReasonAlreadyExists {
				log.Logger().Warnf("failed to create the lock configmap %s: %s\n", lock.Name, err.Error())
				return nil, err
			}
			// there is already a similar lock
			log.Logger().Infof("lock configmap %s already exists", lock.Name)
		} else {
			// the lock is created, the caller can now perform the updates
			log.Logger().Infof("lock configmap %s created", lock.Name)
			// return a function that releases the lock
			return func() error {
				log.Logger().Infof("cleaning the lock configmap %s", lock.Name)
				err := kubeClient.CoreV1().ConfigMaps(devNamespace).Delete(lock.Name,
					&metav1.DeleteOptions{
						Preconditions: &metav1.Preconditions{
							UID: &new.UID,
						},
					})
				if err != nil {
					log.Logger().Warnf("failed to cleanup the lock configmap %s: %s\n", lock.Name, err.Error())
				}
				return err
			}, nil
		}
		// declare these variables outside the inner loop so their values carry over between iterations
		var old *v1.ConfigMap
		var pod *v1.Pod
	Read:
		for {
			// get the current lock if not already provided
			if old == nil {
				old, err = kubeClient.CoreV1().ConfigMaps(devNamespace).Get(lock.Name, metav1.GetOptions{})
				if err != nil {
					status, ok := err.(*errors.StatusError)
					// the lock does not exist anymore, try to create it
					if ok && status.Status().Reason == metav1.StatusReasonNotFound {
						log.Logger().Infof("lock configmap %s deleted", lock.Name)
						continue Create
					}
					// an error while getting the lock
					log.Logger().Warnf("failed to get the lock configmap %s: %s\n", lock.Name, err.Error())
					return nil, err
				}
			}
			// get the locking pod
			var remove bool
			remove, pod, err = getLockingPod(kubeClient, namespace, old, pod)
			if err != nil {
				return nil, err
				// the lock should simply be removed
			} else if remove {
				log.Logger().Infof("cleaning the old lock configmap %s", lock.Name)
				err := kubeClient.CoreV1().ConfigMaps(devNamespace).Delete(lock.Name,
					&metav1.DeleteOptions{
						Preconditions: &metav1.Preconditions{
							UID: &old.UID,
						},
					})
				// removed, now try to create it
				if err == nil {
					continue Create
				}
				status, ok := err.(*errors.StatusError)
				// already deleted, try to create it
				if ok && status.Status().Reason == metav1.StatusReasonNotFound {
					continue Create
					// the lock changed, read it again
				} else if ok && status.Status().Reason == metav1.StatusReasonConflict {
					log.Logger().Infof("lock configmap %s changed", lock.Name)
					old = nil
					continue Read
					// an error while removing the lock
				} else {
					log.Logger().Warnf("failed to cleanup the old lock configmap %s: %s\n", lock.Name, err.Error())
					return nil, err
				}
			}
			// compare the builds
			if data, err := compareBuildLocks(old.Data, lock.Data); err != nil {
				return nil, err
				// should update the lock in order to wait
			} else if data != nil {
				old.Data = data
				old, err = kubeClient.CoreV1().ConfigMaps(devNamespace).Update(old)
				if err != nil {
					status, ok := err.(*errors.StatusError)
					// the lock does not exist anymore, try to create it
					if ok && status.Status().Reason == metav1.StatusReasonNotFound {
						log.Logger().Infof("lock configmap %s deleted", lock.Name)
						continue Create
						// the lock has changed, read it again
					} else if ok && status.Status().Reason == metav1.StatusReasonConflict {
						log.Logger().Infof("lock configmap %s changed", lock.Name)
						old = nil
						continue Read
					}
					// an error updating the lock
					log.Logger().Warnf("failed to update the lock configmap %s: %s\n", lock.Name, err.Error())
					return nil, err
				}
			}
			// watch the lock for updates
			if old, err = watchBuildLock(kubeClient, old, pod, lock.Data); err != nil {
				return nil, err
				// lock configmap was updated, read it again
			} else if old != nil {
				continue Read
				// lock configmap was (probably) deleted, try to create it again
			} else {
				continue Create
			}
		}
	}
}
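
// Typical usage of AcquireBuildLock, as a sketch only (error handling is
// condensed and the surrounding variable names are assumptions, not taken
// from this file):
//
//	release, err := kube.AcquireBuildLock(kubeClient, devNamespace, namespace)
//	if err != nil {
//		return err // a newer build already holds or awaits the lock
//	}
//	defer func() {
//		if err := release(); err != nil {
//			log.Logger().Warnf("failed to release the build lock: %s", err.Error())
//		}
//	}()
//	// ... edit the namespace, e.g. run the helm apply ...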

// makeBuildLock makes the lock configmap for the current build
func makeBuildLock(kubeClient kubernetes.Interface, devNamespace, namespace string) (*v1.ConfigMap, error) {
	// Get the build information from the environment variables
	now := time.Now().UTC().Format(time.RFC3339)
	owner := os.Getenv("REPO_OWNER")
	if owner == "" {
		log.Logger().Warnf("no REPO_OWNER provided")
		return nil, fmt.Errorf("no REPO_OWNER provided")
	}
	repository := os.Getenv("REPO_NAME")
	if repository == "" {
		log.Logger().Warnf("no REPO_NAME provided")
		return nil, fmt.Errorf("no REPO_NAME provided")
	}
	branch := os.Getenv("BRANCH_NAME")
	if branch == "" {
		log.Logger().Warnf("no BRANCH_NAME provided")
		return nil, fmt.Errorf("no BRANCH_NAME provided")
	}
	build := os.Getenv("BUILD_NUMBER")
	if _, err := strconv.Atoi(build); err != nil {
		log.Logger().Warnf("no BUILD_NUMBER provided: %s\n", err.Error())
		return nil, err
	}
	interpret := os.Getenv("JX_INTERPRET_PIPELINE") == "true"
	// Create the lock object
	lock := &v1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("jx-lock-%s", namespace),
			Namespace: devNamespace,
			Labels: map[string]string{
				"namespace":  namespace,
				"owner":      owner,
				"repository": repository,
				"branch":     branch,
				"build":      build,
			},
			Annotations: map[string]string{
				"jenkins-x.io/created-by": "Jenkins X",
				"warning":                 "DO NOT REMOVE",
				"purpose": fmt.Sprintf("This is a deployment lock for the "+
					"namespace \"%s\". It prevents several deployments from "+
					"editing the same namespace at the same time. It will "+
					"automatically be removed once the deployment is "+
					"finished, or replaced by the next deployment to run.",
					namespace),
			},
		},
		Data: map[string]string{
			"namespace":  namespace,
			"owner":      owner,
			"repository": repository,
			"branch":     branch,
			"build":      build,
			"timestamp":  now,
		},
	}
	for k, v := range buildLockLabels {
		lock.Labels[k] = v
	}
	// Find our pod
	if !interpret {
		podList, err := kubeClient.CoreV1().Pods(devNamespace).List(metav1.ListOptions{
			LabelSelector: fmt.Sprintf("owner=%s,repository=%s,branch=%s,build=%s,jenkins.io/pipelineType=build", owner, repository, branch, build),
		})
		if err != nil {
			return nil, err
		} else if len(podList.Items) != 1 {
			return nil, fmt.Errorf("%d pods found for this job (owner=%s,repository=%s,branch=%s,build=%s,jenkins.io/pipelineType=build)",
				len(podList.Items), owner, repository, branch, build)
		}
		pod := &podList.Items[0]
		// the kubernetes client may leave APIVersion and Kind empty,
		// so fill them in if they are missing
		if pod.APIVersion == "" {
			pod.APIVersion = "v1"
		}
		if pod.Kind == "" {
			pod.Kind = "Pod"
		}
		lock.OwnerReferences = []metav1.OwnerReference{{
			APIVersion: pod.APIVersion,
			Kind:       pod.Kind,
			Name:       pod.Name,
			UID:        pod.UID,
		}}
		lock.Data["pod"] = pod.Name
	}
	return lock, nil
}
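
// The resulting lock can be inspected on the cluster; as an illustration
// (assuming "jx" is the dev namespace holding the locks):
//
//	kubectl get configmap -n jx -l jenkins-x.io/kind=build-lock -o yaml
//
// Its data records the owner, repository, branch, build and timestamp of the
// deployment holding (or waiting for) the lock, plus either the name of the
// build pod or, for local interpreted builds, an expiration date.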

// getLockingPod checks the lock and returns its locking pod.
// It receives the previously known pod, to avoid fetching it again if not needed.
// Returns true if the lock should be removed (because the lock is invalid,
// or its pod is missing or finished)
// Returns the pod if one is running, or nil if running locally
func getLockingPod(kubeClient kubernetes.Interface, namespace string, lock *v1.ConfigMap, pod *v1.Pod) (bool, *v1.Pod, error) {
	// check the lock
	for k, v := range buildLockLabels {
		if lock.Labels[k] != v {
			log.Logger().Warnf("the lock %s should have label \"%s: %s\"", lock.Name, k, v)
			return true, nil, nil
		}
	}
	if lock.Labels["namespace"] != namespace {
		log.Logger().Warnf("the lock %s should have label \"namespace: %s\"", lock.Name, namespace)
		return true, nil, nil
	}
	// the lock has no owner pod, check its expiration
	if len(lock.OwnerReferences) == 0 {
		expires, err := time.Parse(time.RFC3339, lock.Annotations["expires"])
		if err != nil {
			log.Logger().Warnf("cannot parse the lock's annotation \"expires: %s\": %s\n", lock.Annotations["expires"], err.Error())
			return false, nil, err
		} else if !expires.After(time.Now()) {
			log.Logger().Infof("the lock %s has expired", lock.Name)
			return true, nil, nil
		}
		return false, nil, nil
	}

	var owner *metav1.OwnerReference
	if len(lock.OwnerReferences) != 1 {
		err := fmt.Errorf("the lock %s has %d OwnerReferences", lock.Name, len(lock.OwnerReferences))
		log.Logger().Warn(err.Error())
		return false, nil, err
	} else if owner = &lock.OwnerReferences[0]; owner.Kind != "Pod" || owner.Name == "" {
		err := fmt.Errorf("the lock %s has invalid OwnerReference %v", lock.Name, owner)
		log.Logger().Warn(err.Error())
		return false, nil, err
	}
	// get the current locking pod if not already provided
	if pod == nil || pod.Name != owner.Name {
		var err error
		pod, err = kubeClient.CoreV1().Pods(lock.Namespace).Get(owner.Name, metav1.GetOptions{})
		if err != nil {
			status, ok := err.(*errors.StatusError)
			// the pod does not exist anymore, the lock should be removed
			if ok && status.Status().Reason == metav1.StatusReasonNotFound {
				log.Logger().Infof("locking pod %s finished", owner.Name)
				return true, nil, nil
				// an error while getting the pod
			} else {
				log.Logger().Warnf("failed to get the locking pod %s: %s\n", owner.Name, err.Error())
				return false, nil, err
			}
		}
	}
	// check the pod's phase
	log.Logger().Infof("locking pod %s is in phase %s", pod.Name, pod.Status.Phase)
	if !buildLockPhaseRunning[pod.Status.Phase] {
		return true, nil, nil
	}
	return false, pod, nil
}
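
// For example, a lock owned by a pod whose phase is Succeeded or Failed is
// treated as stale here (those phases are not in buildLockPhaseRunning), so
// getLockingPod returns remove=true and AcquireBuildLock deletes the
// configmap before retrying to create its own lock.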

// watchBuildLock watches a lock configmap and its locking pod to detect any change
// Returns nil if the lock was deleted, or is expected to be deleted
// Returns the new lock configmap if another build is waiting
func watchBuildLock(kubeClient kubernetes.Interface, lock *v1.ConfigMap, pod *v1.Pod, build map[string]string) (*v1.ConfigMap, error) {
	log.Logger().Infof("waiting for updates on the lock configmap %s", lock.Name)
	// watch a timer for expiration
	var expChan <-chan time.Time
	if pod == nil {
		expires, err := time.Parse(time.RFC3339, lock.Annotations["expires"])
		if err != nil {
			log.Logger().Warnf("cannot parse the lock's annotation \"expires: %s\": %s\n", lock.Annotations["expires"], err.Error())
			return nil, err
		}
		remaining := time.Until(expires)
		// the lock has already expired, no need to wait for anything
		if remaining <= time.Duration(0) {
			return lock, nil
		}
		log.Logger().Infof("waiting for the lock configmap %s for %s. "+
			"If you are sure that the local build %s/%s #%s has finished, "+
			"you can clean the lock with\n\t`kubectl delete configmap -n %s %s`",
			lock.Name, remaining.Round(time.Second), lock.Labels["repository"],
			lock.Labels["branch"], lock.Labels["build"], lock.Namespace, lock.Name)
		timer := time.NewTimer(remaining)
		defer timer.Stop()
		expChan = timer.C
	} else {
		expChan = make(chan time.Time)
	}
	// watch the lock for updates
	lockWatch, err := kubeClient.CoreV1().ConfigMaps(lock.Namespace).Watch(metav1.SingleObject(lock.ObjectMeta))
	if err != nil {
		log.Logger().Warnf("cannot watch the lock configmap %s: %s\n", lock.Name, err.Error())
		return nil, err
	}
	defer lockWatch.Stop()
	lockChan := lockWatch.ResultChan()
	// watch the pod for updates
	var podChan <-chan watch.Event
	if pod != nil {
		podWatch, err := kubeClient.CoreV1().Pods(pod.Namespace).Watch(metav1.SingleObject(pod.ObjectMeta))
		if err != nil {
			log.Logger().Warnf("cannot watch the locking pod %s: %s\n", pod.Name, err.Error())
			return nil, err
		}
		defer podWatch.Stop()
		podChan = podWatch.ResultChan()
	} else {
		podChan = make(chan watch.Event)
	}
	for {
		select {
		// an event about the lock
		case event := <-lockChan:
			switch event.Type {
			// the lock has changed
			case watch.Added, watch.Modified:
				lock = event.Object.(*v1.ConfigMap)
				// if the waiting build has changed, read again
				if next, err := compareBuildLocks(lock.Data, build); err != nil {
					return nil, err
				} else if next != nil {
					return lock, nil
				}
			// the lock is deleted, try to create it
			case watch.Deleted:
				return nil, nil
			// an error
			case watch.Error:
				err := errors.FromObject(event.Object)
				log.Logger().Warnf("cannot watch the lock configmap %s: %s\n", lock.Name, err.Error())
				return nil, err
			}
		// an event about the locking pod
		case event := <-podChan:
			switch event.Type {
			// the pod has changed; if it is no longer running,
			// assume that the lock configmap has been (or will be) deleted
			case watch.Added, watch.Modified:
				pod = event.Object.(*v1.Pod)
				if !buildLockPhaseRunning[pod.Status.Phase] {
					return nil, nil
				}
			// the pod was deleted, assume the lock configmap was too
			case watch.Deleted:
				return nil, nil
			// an error
			case watch.Error:
				err := errors.FromObject(event.Object)
				log.Logger().Warnf("cannot watch the locking pod %s: %s\n", pod.Name, err.Error())
				return nil, err
			}
		// the lock has expired
		case <-expChan:
			return lock, nil
		}
	}
}
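
// Note on the channels above: when the lock is owned by a pod there is no
// expiration timer, and when it is not there is no pod to watch; in each case
// the unused channel is a fresh channel that nothing ever sends on, so its
// select case simply never fires. AcquireBuildLock treats a nil result as
// "the lock is gone, try to create it again" and a non-nil result as "the
// lock changed, read it again".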

// compareBuildLocks compares two builds
// If the returned map is nil, the current build is already registered as waiting
// If it is not nil, the current build should wait after updating the lock with this data
func compareBuildLocks(old, new map[string]string) (map[string]string, error) {
	sameRepo := true
	for _, k := range [3]string{"owner", "repository", "branch"} {
		if old[k] != new[k] {
			sameRepo = false
		}
	}
	// both are deploying the same repository and branch, compare build numbers
	if sameRepo {
		// same build and pod, we're already waiting
		if old["build"] == new["build"] && old["pod"] == new["pod"] && old["expires"] == new["expires"] {
			return nil, nil
		}
		// parse the build numbers
		if oldBuild, err := strconv.Atoi(old["build"]); err != nil {
			log.Logger().Warnf("cannot parse the lock's build number %s: %s\n", old["build"], err.Error())
			return nil, err
		} else if newBuild, err := strconv.Atoi(new["build"]); err != nil {
			log.Logger().Warnf("cannot parse the lock's build number %s: %s\n", new["build"], err.Error())
			return nil, err
			// the current build is not newer, give up
		} else if oldBuild >= newBuild {
			log.Logger().Warnf("newer build %d is waiting already", oldBuild)
			return nil, fmt.Errorf("newer build %d is waiting already", oldBuild)
		}
		// parse the timestamps in order to keep the newest one
		if oldTime, err := time.Parse(time.RFC3339, old["timestamp"]); err != nil {
			log.Logger().Warnf("cannot parse the lock's timestamp %s: %s\n", old["timestamp"], err.Error())
			return nil, err
		} else if newTime, err := time.Parse(time.RFC3339, new["timestamp"]); err != nil {
			log.Logger().Warnf("cannot parse the lock's timestamp %s: %s\n", new["timestamp"], err.Error())
			return nil, err
			// keep increasing the timestamp, for consistency reasons
		} else if oldTime.After(newTime) {
			next := map[string]string{}
			for k, v := range new {
				next[k] = v
			}
			next["timestamp"] = old["timestamp"]
			return next, nil
			// the timestamp is already the newest one
		} else {
			return new, nil
		}
		// the builds are deploying different repositories, keep the newest one;
		// this is a corner case handled for consistency,
		// but it should not happen on a standard cluster
	} else {
		// parse the timestamps
		if oldTime, err := time.Parse(time.RFC3339, old["timestamp"]); err != nil {
			log.Logger().Warnf("cannot parse the lock's timestamp %s: %s\n", old["timestamp"], err.Error())
			return nil, err
		} else if newTime, err := time.Parse(time.RFC3339, new["timestamp"]); err != nil {
			log.Logger().Warnf("cannot parse the lock's timestamp %s: %s\n", new["timestamp"], err.Error())
			return nil, err
			// newer deployment, wait
		} else if newTime.After(oldTime) {
			return new, nil
			// older deployment, give up
		} else {
			return nil, fmt.Errorf("newer deployment %s is waiting already", oldTime)
		}
	}
}
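
// Worked example (values are illustrative): if the existing lock records
// build "12" of the same repository and branch and the current build is "14",
// compareBuildLocks returns the current build's data, so the lock gets
// updated and this build waits its turn; if the current build were "11", it
// would return an error and the build would give up. For locks from different
// repositories only the timestamps are compared, and the most recent
// deployment wins.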