agones.dev/agones@v1.53.0/pkg/gameservers/health.go (about)

     1  // Copyright 2018 Google LLC All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package gameservers
    16  
    17  import (
    18  	"context"
    19  	"strings"
    20  
    21  	"agones.dev/agones/pkg/apis/agones"
    22  	agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
    23  	"agones.dev/agones/pkg/client/clientset/versioned"
    24  	getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1"
    25  	"agones.dev/agones/pkg/client/informers/externalversions"
    26  	listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
    27  	"agones.dev/agones/pkg/util/logfields"
    28  	"agones.dev/agones/pkg/util/runtime"
    29  	"agones.dev/agones/pkg/util/workerqueue"
    30  	"github.com/heptiolabs/healthcheck"
    31  	"github.com/pkg/errors"
    32  	"github.com/sirupsen/logrus"
    33  	corev1 "k8s.io/api/core/v1"
    34  	k8serrors "k8s.io/apimachinery/pkg/api/errors"
    35  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    36  	"k8s.io/client-go/informers"
    37  	"k8s.io/client-go/kubernetes"
    38  	"k8s.io/client-go/kubernetes/scheme"
    39  	typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
    40  	corelisterv1 "k8s.io/client-go/listers/core/v1"
    41  	"k8s.io/client-go/tools/cache"
    42  	"k8s.io/client-go/tools/record"
    43  )
    44  
// HealthController watches Pods, and applies
// an Unhealthy state if certain pods crash, or can't be assigned a port, and other
// similar type conditions.
type HealthController struct {
	baseLogger       *logrus.Entry            // base structured logger for this controller
	podSynced        cache.InformerSynced     // reports whether the Pod informer cache has synced
	podLister        corelisterv1.PodLister   // read-only access to cached Pods
	gameServerSynced cache.InformerSynced     // reports whether the GameServer informer cache has synced
	gameServerGetter getterv1.GameServersGetter // write access to GameServers (used to update Status)
	gameServerLister listerv1.GameServerLister  // read-only access to cached GameServers
	workerqueue      *workerqueue.WorkerQueue // rate-limited queue driving syncGameServer
	recorder         record.EventRecorder     // emits Kubernetes Events against GameServers
	waitOnFreePorts  bool                     // if true, don't treat "no free ports" unschedulability as unhealthy
}
    59  
    60  // NewHealthController returns a HealthController
    61  func NewHealthController(
    62  	health healthcheck.Handler,
    63  	kubeClient kubernetes.Interface,
    64  	agonesClient versioned.Interface,
    65  	kubeInformerFactory informers.SharedInformerFactory,
    66  	agonesInformerFactory externalversions.SharedInformerFactory,
    67  	waitOnFreePorts bool) *HealthController {
    68  
    69  	podInformer := kubeInformerFactory.Core().V1().Pods().Informer()
    70  	gameserverInformer := agonesInformerFactory.Agones().V1().GameServers()
    71  	hc := &HealthController{
    72  		podSynced:        podInformer.HasSynced,
    73  		podLister:        kubeInformerFactory.Core().V1().Pods().Lister(),
    74  		gameServerSynced: gameserverInformer.Informer().HasSynced,
    75  		gameServerGetter: agonesClient.AgonesV1(),
    76  		gameServerLister: gameserverInformer.Lister(),
    77  		waitOnFreePorts:  waitOnFreePorts,
    78  	}
    79  
    80  	hc.baseLogger = runtime.NewLoggerWithType(hc)
    81  	hc.workerqueue = workerqueue.NewWorkerQueue(hc.syncGameServer, hc.baseLogger, logfields.GameServerKey, agones.GroupName+".HealthController")
    82  	health.AddLivenessCheck("gameserver-health-workerqueue", healthcheck.Check(hc.workerqueue.Healthy))
    83  
    84  	eventBroadcaster := record.NewBroadcaster()
    85  	eventBroadcaster.StartLogging(hc.baseLogger.Debugf)
    86  	eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
    87  	hc.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "health-controller"})
    88  
    89  	_, _ = podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
    90  		UpdateFunc: func(_, newObj interface{}) {
    91  			pod := newObj.(*corev1.Pod)
    92  			if pod.ObjectMeta.DeletionTimestamp.IsZero() && isGameServerPod(pod) && hc.isUnhealthy(pod) {
    93  				hc.workerqueue.Enqueue(pod)
    94  			}
    95  		},
    96  		DeleteFunc: func(obj interface{}) {
    97  			// Could be a DeletedFinalStateUnknown, in which case, just ignore it
    98  			pod, ok := obj.(*corev1.Pod)
    99  			if ok && isGameServerPod(pod) {
   100  				hc.workerqueue.Enqueue(pod)
   101  			}
   102  		},
   103  	})
   104  	return hc
   105  }
   106  
   107  // isUnhealthy returns if the Pod event is going
   108  // to cause the GameServer to become Unhealthy
   109  func (hc *HealthController) isUnhealthy(pod *corev1.Pod) bool {
   110  	return hc.evictedPod(pod) || hc.unschedulableWithNoFreePorts(pod) || hc.failedContainer(pod) || hc.failedPod(pod)
   111  }
   112  
   113  // unschedulableWithNoFreePorts checks if the reason the Pod couldn't be scheduled
   114  // was because there weren't any free ports in the range specified
   115  func (hc *HealthController) unschedulableWithNoFreePorts(pod *corev1.Pod) bool {
   116  	// return false, when sidecars are enabled, since we are just going to rely mostly on Pod Status.
   117  	if runtime.FeatureEnabled(runtime.FeatureSidecarContainers) {
   118  		return false
   119  	}
   120  
   121  	// On some cloud products (GKE Autopilot), wait on the Autoscaler to schedule a pod with conflicting ports.
   122  	if hc.waitOnFreePorts {
   123  		return false
   124  	}
   125  	for _, cond := range pod.Status.Conditions {
   126  		if cond.Type == corev1.PodScheduled && cond.Reason == corev1.PodReasonUnschedulable {
   127  			if strings.Contains(cond.Message, "free ports") {
   128  				hc.baseLogger.WithField("gs", pod.ObjectMeta.Name).WithField("conditions", pod.Status.Conditions).Debug("Pod Unschedulable With No Free Ports")
   129  				return true
   130  			}
   131  		}
   132  	}
   133  	return false
   134  }
   135  
   136  // failedPod checks if the Pod's phase is "Failed"
   137  func (hc *HealthController) failedPod(pod *corev1.Pod) bool {
   138  	// return false, since a failed pod only happens when sidecars are enabled.
   139  	if !runtime.FeatureEnabled(runtime.FeatureSidecarContainers) {
   140  		return false
   141  	}
   142  
   143  	return pod.Status.Phase == corev1.PodFailed
   144  }
   145  
   146  // evictedPod checks if the Pod was Evicted
   147  // could be caused by reaching limit on Ephemeral storage
   148  func (hc *HealthController) evictedPod(pod *corev1.Pod) bool {
   149  	evicted := pod.Status.Reason == "Evicted"
   150  	if evicted {
   151  		hc.baseLogger.WithField("gs", pod.ObjectMeta.Name).WithField("status", pod.Status).Debug("Pod Evicted")
   152  	}
   153  	return evicted
   154  }
   155  
   156  // failedContainer checks each container, and determines if the main gameserver
   157  // container has failed
   158  func (hc *HealthController) failedContainer(pod *corev1.Pod) bool {
   159  	// return false, since there is no gameserver restart before ready. When this graduates, you can delete this
   160  	// whole function.
   161  	if runtime.FeatureEnabled(runtime.FeatureSidecarContainers) {
   162  		return false
   163  	}
   164  
   165  	container := pod.Annotations[agonesv1.GameServerContainerAnnotation]
   166  	for _, cs := range pod.Status.ContainerStatuses {
   167  		if cs.Name == container {
   168  			// sometimes on a restart, the cs.State can be running and the last state will be merged
   169  			failed := cs.State.Terminated != nil || cs.LastTerminationState.Terminated != nil
   170  			if failed {
   171  				hc.baseLogger.WithField("gs", pod.ObjectMeta.Name).WithField("containerStatuses", pod.Status.ContainerStatuses).WithField("container", container).Debug("Container Failed")
   172  			}
   173  			return failed
   174  		}
   175  	}
   176  	return false
   177  }
   178  
   179  // Run processes the rate limited queue.
   180  // Will block until stop is closed
   181  func (hc *HealthController) Run(ctx context.Context, workers int) error {
   182  	hc.baseLogger.Debug("Wait for cache sync")
   183  	if !cache.WaitForCacheSync(ctx.Done(), hc.gameServerSynced, hc.podSynced) {
   184  		return errors.New("failed to wait for caches to sync")
   185  	}
   186  
   187  	hc.workerqueue.Run(ctx, workers)
   188  
   189  	return nil
   190  }
   191  
// loggerForGameServerKey returns the base logger augmented with the
// namespace/name key of the GameServer being processed.
func (hc *HealthController) loggerForGameServerKey(key string) *logrus.Entry {
	return logfields.AugmentLogEntry(hc.baseLogger, logfields.GameServerKey, key)
}
   195  
   196  func (hc *HealthController) loggerForGameServer(gs *agonesv1.GameServer) *logrus.Entry {
   197  	gsName := logfields.NilGameServer
   198  	if gs != nil {
   199  		gsName = gs.Namespace + "/" + gs.Name
   200  	}
   201  	return hc.loggerForGameServerKey(gsName).WithField("gs", gs)
   202  }
   203  
   204  // syncGameServer sets the GameServer to Unhealthy, if its state is Ready
   205  func (hc *HealthController) syncGameServer(ctx context.Context, key string) error {
   206  	hc.loggerForGameServerKey(key).Debug("Synchronising")
   207  
   208  	// Convert the namespace/name string into a distinct namespace and name
   209  	namespace, name, err := cache.SplitMetaNamespaceKey(key)
   210  	if err != nil {
   211  		// don't return an error, as we don't want this retried
   212  		runtime.HandleError(hc.loggerForGameServerKey(key), errors.Wrapf(err, "invalid resource key"))
   213  		return nil
   214  	}
   215  
   216  	gs, err := hc.gameServerLister.GameServers(namespace).Get(name)
   217  	if err != nil {
   218  		if k8serrors.IsNotFound(err) {
   219  			hc.loggerForGameServerKey(key).Debug("GameServer is no longer available for syncing")
   220  			return nil
   221  		}
   222  		return errors.Wrapf(err, "error retrieving GameServer %s from namespace %s", name, namespace)
   223  	}
   224  
   225  	// at this point we don't care, we're already Unhealthy / deleting
   226  	if gs.IsBeingDeleted() || gs.Status.State == agonesv1.GameServerStateUnhealthy || gs.Status.State == agonesv1.GameServerStateError {
   227  		return nil
   228  	}
   229  
   230  	// retrieve the pod for the gameserver
   231  	pod, err := hc.podLister.Pods(gs.ObjectMeta.Namespace).Get(gs.ObjectMeta.Name)
   232  	if err != nil {
   233  		if !k8serrors.IsNotFound(err) {
   234  			// If the pod exists but there is an error, go back into the queue.
   235  			return errors.Wrapf(err, "error retrieving Pod %s for GameServer to check status", gs.ObjectMeta.Name)
   236  		}
   237  		hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).Debug("Could not find Pod")
   238  	}
   239  
   240  	// Make sure that the pod has to be marked unhealthy
   241  	if pod != nil {
   242  		if skip, err := hc.skipUnhealthyGameContainer(gs, pod); err != nil || skip {
   243  			return err
   244  		}
   245  
   246  		// If the pod is not unhealthy any more, go back in the queue
   247  		if !hc.isUnhealthy(pod) {
   248  			hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("podStatus", pod.Status).Debug("GameServer is not unhealthy anymore")
   249  			return nil
   250  		}
   251  	}
   252  
   253  	hc.loggerForGameServer(gs).Debug("Issue with GameServer pod, marking as GameServerStateUnhealthy")
   254  	gsCopy := gs.DeepCopy()
   255  	gsCopy.Status.State = agonesv1.GameServerStateUnhealthy
   256  
   257  	if _, err := hc.gameServerGetter.GameServers(gs.ObjectMeta.Namespace).Update(ctx, gsCopy, metav1.UpdateOptions{}); err != nil {
   258  		return errors.Wrapf(err, "error updating GameServer %s/%s to unhealthy", gs.ObjectMeta.Name, gs.ObjectMeta.Namespace)
   259  	}
   260  
   261  	hc.recorder.Event(gs, corev1.EventTypeWarning, string(gsCopy.Status.State), "Issue with Gameserver pod")
   262  
   263  	return nil
   264  }
   265  
// skipUnhealthyGameContainer determines if it's appropriate to not move to Unhealthy when a Pod's
// gameserver container has crashed, or let it restart as per usual K8s operations.
// It does this by checking a combination of the current GameServer state and annotation data that stores
// which container instance was live if the GameServer has been marked as Ready.
// The logic is as follows:
//   - If the GameServer is not yet Ready, allow to restart (return true)
//   - If the GameServer is in a state past Ready, move to Unhealthy
func (hc *HealthController) skipUnhealthyGameContainer(gs *agonesv1.GameServer, pod *corev1.Pod) (bool, error) {
	if !metav1.IsControlledBy(pod, gs) {
		// This is not the Pod we are looking for 🤖
		return false, nil
	}

	// if Sidecar is enabled, always return false, since there is no skip - it's just whatever K8s/Agones wants to do.
	// on move to stable, this function can be deleted.
	if runtime.FeatureEnabled(runtime.FeatureSidecarContainers) {
		return false, nil
	}

	// If the GameServer is before Ready, both annotation values should be "".
	// If the GameServer is past Ready, both the annotations should be exactly the same.
	// If the annotations are different, then the data between the GameServer and the Pod is out of sync,
	// in which case, send it back to the queue to try again.
	gsReadyContainerID := gs.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation]
	if pod.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation] != gsReadyContainerID {
		return false, workerqueue.NewTraceError(errors.Errorf("pod and gameserver %s data are out of sync, retrying", gs.ObjectMeta.Name))
	}

	if gs.IsBeforeReady() {
		hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("state", gs.Status.State).Debug("skipUnhealthyGameContainer: Is Before Ready. Checking failed container")
		// If the reason for failure was a container failure, then we can skip moving to Unhealthy.
		// otherwise, we know it was one of the other reasons (eviction, lack of ports), so we should definitely go to Unhealthy.
		return hc.failedContainer(pod), nil
	}

	// finally, we need to check if the failed container happened after the gameserver was ready or before.
	for _, cs := range pod.Status.ContainerStatuses {
		if cs.Name == gs.Spec.Container {
			// Currently terminated: the crash definitely post-dates Ready, so don't skip.
			if cs.State.Terminated != nil {
				hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("podStatus", pod.Status).Debug("skipUnhealthyGameContainer: Container is terminated, returning false")
				return false, nil
			}
			if cs.LastTerminationState.Terminated != nil {
				// if the current container is running, and is the ready container, then we know this is some
				// other pod update, and we previously had a restart before we got to being Ready, and therefore
				// shouldn't move to Unhealthy.
				check := cs.ContainerID == gsReadyContainerID
				if !check {
					hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("gsMeta", gs.ObjectMeta).WithField("podStatus", pod.Status).Debug("skipUnhealthyGameContainer: Container crashed after Ready, returning false")
				}
				return check, nil
			}
			// Found the game container with no termination record; stop scanning.
			break
		}
	}

	hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("gsMeta", gs.ObjectMeta).WithField("podStatus", pod.Status).Debug("skipUnhealthyGameContainer: Game Container has not crashed, game container may be healthy")
	return false, nil
}