agones.dev/agones@v1.53.0/pkg/gameservers/missing.go (about)

     1  // Copyright 2020 Google LLC All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package gameservers
    16  
    17  import (
    18  	"context"
    19  
    20  	"agones.dev/agones/pkg/apis/agones"
    21  	agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
    22  	"agones.dev/agones/pkg/client/clientset/versioned"
    23  	"agones.dev/agones/pkg/client/clientset/versioned/scheme"
    24  	getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1"
    25  	"agones.dev/agones/pkg/client/informers/externalversions"
    26  	listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
    27  	"agones.dev/agones/pkg/util/logfields"
    28  	"agones.dev/agones/pkg/util/runtime"
    29  	"agones.dev/agones/pkg/util/workerqueue"
    30  	"github.com/heptiolabs/healthcheck"
    31  	"github.com/pkg/errors"
    32  	"github.com/sirupsen/logrus"
    33  	corev1 "k8s.io/api/core/v1"
    34  	k8serrors "k8s.io/apimachinery/pkg/api/errors"
    35  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    36  	"k8s.io/client-go/informers"
    37  	"k8s.io/client-go/kubernetes"
    38  	typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
    39  	corelisterv1 "k8s.io/client-go/listers/core/v1"
    40  	"k8s.io/client-go/tools/cache"
    41  	"k8s.io/client-go/tools/record"
    42  )
    43  
    44  // MissingPodController makes sure that any GameServer
    45  // that isn't in a Scheduled or Unhealthy state and is missing a Pod is
    46  // moved to Unhealthy.
    47  //
    48  // It's possible that a GameServer is missing its associated pod due to
    49  // unexpected controller downtime or if the Pod is deleted with no subsequent Delete event.
    50  //
    51  // Since resync on the controller is every 30 seconds, even if there is some time in which a GameServer
    52  // is in a broken state, it will eventually move to Unhealthy, and get replaced (if in a Fleet).
    53  type MissingPodController struct {
    54  	baseLogger       *logrus.Entry
    55  	podSynced        cache.InformerSynced
    56  	podLister        corelisterv1.PodLister
    57  	gameServerSynced cache.InformerSynced
    58  	gameServerGetter getterv1.GameServersGetter
    59  	gameServerLister listerv1.GameServerLister
    60  	workerqueue      *workerqueue.WorkerQueue
    61  	recorder         record.EventRecorder
    62  }
    63  
    64  // NewMissingPodController returns a MissingPodController
    65  func NewMissingPodController(health healthcheck.Handler,
    66  	kubeClient kubernetes.Interface,
    67  	agonesClient versioned.Interface,
    68  	kubeInformerFactory informers.SharedInformerFactory,
    69  	agonesInformerFactory externalversions.SharedInformerFactory) *MissingPodController {
    70  	podInformer := kubeInformerFactory.Core().V1().Pods().Informer()
    71  	gameServers := agonesInformerFactory.Agones().V1().GameServers()
    72  
    73  	c := &MissingPodController{
    74  		podSynced:        podInformer.HasSynced,
    75  		podLister:        kubeInformerFactory.Core().V1().Pods().Lister(),
    76  		gameServerSynced: gameServers.Informer().HasSynced,
    77  		gameServerGetter: agonesClient.AgonesV1(),
    78  		gameServerLister: gameServers.Lister(),
    79  	}
    80  
    81  	c.baseLogger = runtime.NewLoggerWithType(c)
    82  	c.workerqueue = workerqueue.NewWorkerQueue(c.syncGameServer, c.baseLogger, logfields.GameServerKey, agones.GroupName+".MissingPodController")
    83  	health.AddLivenessCheck("gameserver-missing-pod-workerqueue", healthcheck.Check(c.workerqueue.Healthy))
    84  
    85  	eventBroadcaster := record.NewBroadcaster()
    86  	eventBroadcaster.StartLogging(c.baseLogger.Debugf)
    87  	eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
    88  	c.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "missing-pod-controller"})
    89  
    90  	_, _ = gameServers.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
    91  		UpdateFunc: func(_, newObj interface{}) {
    92  			gs := newObj.(*agonesv1.GameServer)
    93  
    94  			if _, isDev := gs.GetDevAddress(); !isDev && !isBeforePodCreated(gs) && !gs.IsBeingDeleted() &&
    95  				!(gs.Status.State == agonesv1.GameServerStateUnhealthy) && !(gs.Status.State == agonesv1.GameServerStateError) {
    96  
    97  				// Only queue the Pod if there is an issue retrieving it. If it exists, don't queue it, since we know it's not missing.
    98  				// If there was an error accessing the Kubernetes control plane then enqueue it, so it can be rechecked when the control plane comes back up.
    99  				if pod, err := c.podLister.Pods(gs.ObjectMeta.Namespace).Get(gs.ObjectMeta.Name); err != nil || !isGameServerPod(pod) {
   100  					c.workerqueue.Enqueue(gs)
   101  				}
   102  			}
   103  		},
   104  	})
   105  
   106  	return c
   107  }
   108  
   109  // Run processes the rate limited queue.
   110  // Will block until stop is closed
   111  func (c *MissingPodController) Run(ctx context.Context, workers int) error {
   112  	c.baseLogger.Debug("Wait for cache sync")
   113  	if !cache.WaitForCacheSync(ctx.Done(), c.gameServerSynced, c.podSynced) {
   114  		return errors.New("failed to wait for caches to sync")
   115  	}
   116  
   117  	c.workerqueue.Run(ctx, workers)
   118  	return nil
   119  }
   120  
   121  func (c *MissingPodController) loggerForGameServerKey(key string) *logrus.Entry {
   122  	return logfields.AugmentLogEntry(c.baseLogger, logfields.GameServerKey, key)
   123  }
   124  
   125  // syncGameServer checks if a GameServer has a backing Pod, and if not,
   126  // moves it to Unhealthy
   127  func (c *MissingPodController) syncGameServer(ctx context.Context, key string) error {
   128  	namespace, name, err := cache.SplitMetaNamespaceKey(key)
   129  	if err != nil {
   130  		// don't return an error, as we don't want this retried
   131  		runtime.HandleError(c.loggerForGameServerKey(key), errors.Wrapf(err, "invalid resource key"))
   132  		return nil
   133  	}
   134  
   135  	// check if the pod exists
   136  	if pod, err := c.podLister.Pods(namespace).Get(name); err != nil {
   137  		if !k8serrors.IsNotFound(err) {
   138  			return errors.Wrapf(err, "error retrieving Pod %s from namespace %s", name, namespace)
   139  		}
   140  	} else if isGameServerPod(pod) {
   141  		// if the pod exists, all is well, and we can continue on our merry way.
   142  		return nil
   143  	}
   144  	c.loggerForGameServerKey(key).Debug("Pod is missing. Moving GameServer to Unhealthy.")
   145  
   146  	gs, err := c.gameServerLister.GameServers(namespace).Get(name)
   147  	if err != nil {
   148  		if k8serrors.IsNotFound(err) {
   149  			c.loggerForGameServerKey(key).Debug("GameServer is no longer available for syncing")
   150  			return nil
   151  		}
   152  		return errors.Wrapf(err, "error retrieving GameServer %s from namespace %s", name, namespace)
   153  	}
   154  
   155  	// already on the way out, so no need to do anything.
   156  	if gs.IsBeingDeleted() || gs.Status.State == agonesv1.GameServerStateUnhealthy {
   157  		c.loggerForGameServerKey(key).WithField("state", gs.Status.State).Debug("GameServer already being deleted/unhealthy. Skipping.")
   158  		return nil
   159  	}
   160  
   161  	gsCopy := gs.DeepCopy()
   162  	gsCopy.Status.State = agonesv1.GameServerStateUnhealthy
   163  	gs, err = c.gameServerGetter.GameServers(gsCopy.ObjectMeta.Namespace).Update(ctx, gsCopy, metav1.UpdateOptions{})
   164  	if err != nil {
   165  		return errors.Wrap(err, "error updating GameServer to Unhealthy")
   166  	}
   167  
   168  	c.recorder.Event(gs, corev1.EventTypeWarning, string(gs.Status.State), "Pod is missing")
   169  	return nil
   170  }