agones.dev/agones@v1.54.0/pkg/gameservers/missing.go (about) 1 // Copyright 2020 Google LLC All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package gameservers 16 17 import ( 18 "context" 19 20 "agones.dev/agones/pkg/apis/agones" 21 agonesv1 "agones.dev/agones/pkg/apis/agones/v1" 22 "agones.dev/agones/pkg/client/clientset/versioned" 23 "agones.dev/agones/pkg/client/clientset/versioned/scheme" 24 getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1" 25 "agones.dev/agones/pkg/client/informers/externalversions" 26 listerv1 "agones.dev/agones/pkg/client/listers/agones/v1" 27 "agones.dev/agones/pkg/util/logfields" 28 "agones.dev/agones/pkg/util/runtime" 29 "agones.dev/agones/pkg/util/workerqueue" 30 "github.com/heptiolabs/healthcheck" 31 "github.com/pkg/errors" 32 "github.com/sirupsen/logrus" 33 corev1 "k8s.io/api/core/v1" 34 k8serrors "k8s.io/apimachinery/pkg/api/errors" 35 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 36 "k8s.io/client-go/informers" 37 "k8s.io/client-go/kubernetes" 38 typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" 39 corelisterv1 "k8s.io/client-go/listers/core/v1" 40 "k8s.io/client-go/tools/cache" 41 "k8s.io/client-go/tools/record" 42 ) 43 44 // MissingPodController makes sure that any GameServer 45 // that isn't in a Scheduled or Unhealthy state and is missing a Pod is 46 // moved to Unhealthy. 47 // 48 // It's possible that a GameServer is missing its associated pod due to 49 // unexpected controller downtime or if the Pod is deleted with no subsequent Delete event. 50 // 51 // Since resync on the controller is every 30 seconds, even if there is some time in which a GameServer 52 // is in a broken state, it will eventually move to Unhealthy, and get replaced (if in a Fleet). 53 type MissingPodController struct { 54 baseLogger *logrus.Entry 55 podSynced cache.InformerSynced 56 podLister corelisterv1.PodLister 57 gameServerSynced cache.InformerSynced 58 gameServerGetter getterv1.GameServersGetter 59 gameServerLister listerv1.GameServerLister 60 workerqueue *workerqueue.WorkerQueue 61 recorder record.EventRecorder 62 } 63 64 // NewMissingPodController returns a MissingPodController 65 func NewMissingPodController(health healthcheck.Handler, 66 kubeClient kubernetes.Interface, 67 agonesClient versioned.Interface, 68 kubeInformerFactory informers.SharedInformerFactory, 69 agonesInformerFactory externalversions.SharedInformerFactory) *MissingPodController { 70 podInformer := kubeInformerFactory.Core().V1().Pods().Informer() 71 gameServers := agonesInformerFactory.Agones().V1().GameServers() 72 73 c := &MissingPodController{ 74 podSynced: podInformer.HasSynced, 75 podLister: kubeInformerFactory.Core().V1().Pods().Lister(), 76 gameServerSynced: gameServers.Informer().HasSynced, 77 gameServerGetter: agonesClient.AgonesV1(), 78 gameServerLister: gameServers.Lister(), 79 } 80 81 c.baseLogger = runtime.NewLoggerWithType(c) 82 c.workerqueue = workerqueue.NewWorkerQueue(c.syncGameServer, c.baseLogger, logfields.GameServerKey, agones.GroupName+".MissingPodController") 83 health.AddLivenessCheck("gameserver-missing-pod-workerqueue", healthcheck.Check(c.workerqueue.Healthy)) 84 85 eventBroadcaster := record.NewBroadcaster() 86 eventBroadcaster.StartLogging(c.baseLogger.Debugf) 87 eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) 88 c.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "missing-pod-controller"}) 89 90 _, _ = gameServers.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 91 UpdateFunc: func(_, newObj interface{}) { 92 gs := newObj.(*agonesv1.GameServer) 93 94 if _, isDev := gs.GetDevAddress(); !isDev && !isBeforePodCreated(gs) && !gs.IsBeingDeleted() && 95 !(gs.Status.State == agonesv1.GameServerStateUnhealthy) && !(gs.Status.State == agonesv1.GameServerStateError) { 96 97 // Only queue the Pod if there is an issue retrieving it. If it exists, don't queue it, since we know it's not missing. 98 // If there was an error accessing the Kubernetes control plane then enqueue it, so it can be rechecked when the control plane comes back up. 99 if pod, err := c.podLister.Pods(gs.ObjectMeta.Namespace).Get(gs.ObjectMeta.Name); err != nil || !isGameServerPod(pod) { 100 c.workerqueue.Enqueue(gs) 101 } 102 } 103 }, 104 }) 105 106 return c 107 } 108 109 // Run processes the rate limited queue. 110 // Will block until stop is closed 111 func (c *MissingPodController) Run(ctx context.Context, workers int) error { 112 c.baseLogger.Debug("Wait for cache sync") 113 if !cache.WaitForCacheSync(ctx.Done(), c.gameServerSynced, c.podSynced) { 114 return errors.New("failed to wait for caches to sync") 115 } 116 117 c.workerqueue.Run(ctx, workers) 118 return nil 119 } 120 121 func (c *MissingPodController) loggerForGameServerKey(key string) *logrus.Entry { 122 return logfields.AugmentLogEntry(c.baseLogger, logfields.GameServerKey, key) 123 } 124 125 // syncGameServer checks if a GameServer has a backing Pod, and if not, 126 // moves it to Unhealthy 127 func (c *MissingPodController) syncGameServer(ctx context.Context, key string) error { 128 namespace, name, err := cache.SplitMetaNamespaceKey(key) 129 if err != nil { 130 // don't return an error, as we don't want this retried 131 runtime.HandleError(c.loggerForGameServerKey(key), errors.Wrapf(err, "invalid resource key")) 132 return nil 133 } 134 135 // check if the pod exists 136 if pod, err := c.podLister.Pods(namespace).Get(name); err != nil { 137 if !k8serrors.IsNotFound(err) { 138 return errors.Wrapf(err, "error retrieving Pod %s from namespace %s", name, namespace) 139 } 140 } else if isGameServerPod(pod) { 141 // if the pod exists, all is well, and we can continue on our merry way. 142 return nil 143 } 144 c.loggerForGameServerKey(key).Debug("Pod is missing. Moving GameServer to Unhealthy.") 145 146 gs, err := c.gameServerLister.GameServers(namespace).Get(name) 147 if err != nil { 148 if k8serrors.IsNotFound(err) { 149 c.loggerForGameServerKey(key).Debug("GameServer is no longer available for syncing") 150 return nil 151 } 152 return errors.Wrapf(err, "error retrieving GameServer %s from namespace %s", name, namespace) 153 } 154 155 // already on the way out, so no need to do anything. 156 if gs.IsBeingDeleted() || gs.Status.State == agonesv1.GameServerStateUnhealthy { 157 c.loggerForGameServerKey(key).WithField("state", gs.Status.State).Debug("GameServer already being deleted/unhealthy. Skipping.") 158 return nil 159 } 160 161 gsCopy := gs.DeepCopy() 162 gsCopy.Status.State = agonesv1.GameServerStateUnhealthy 163 gs, err = c.gameServerGetter.GameServers(gsCopy.ObjectMeta.Namespace).Update(ctx, gsCopy, metav1.UpdateOptions{}) 164 if err != nil { 165 return errors.Wrap(err, "error updating GameServer to Unhealthy") 166 } 167 168 c.recorder.Event(gs, corev1.EventTypeWarning, string(gs.Status.State), "Pod is missing") 169 return nil 170 }