agones.dev/agones@v1.53.0/pkg/gameservers/health.go (about) 1 // Copyright 2018 Google LLC All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package gameservers 16 17 import ( 18 "context" 19 "strings" 20 21 "agones.dev/agones/pkg/apis/agones" 22 agonesv1 "agones.dev/agones/pkg/apis/agones/v1" 23 "agones.dev/agones/pkg/client/clientset/versioned" 24 getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1" 25 "agones.dev/agones/pkg/client/informers/externalversions" 26 listerv1 "agones.dev/agones/pkg/client/listers/agones/v1" 27 "agones.dev/agones/pkg/util/logfields" 28 "agones.dev/agones/pkg/util/runtime" 29 "agones.dev/agones/pkg/util/workerqueue" 30 "github.com/heptiolabs/healthcheck" 31 "github.com/pkg/errors" 32 "github.com/sirupsen/logrus" 33 corev1 "k8s.io/api/core/v1" 34 k8serrors "k8s.io/apimachinery/pkg/api/errors" 35 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 36 "k8s.io/client-go/informers" 37 "k8s.io/client-go/kubernetes" 38 "k8s.io/client-go/kubernetes/scheme" 39 typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" 40 corelisterv1 "k8s.io/client-go/listers/core/v1" 41 "k8s.io/client-go/tools/cache" 42 "k8s.io/client-go/tools/record" 43 ) 44 45 // HealthController watches Pods, and applies 46 // an Unhealthy state if certain pods crash, or can't be assigned a port, and other 47 // similar type conditions. 
type HealthController struct {
	baseLogger       *logrus.Entry
	podSynced        cache.InformerSynced // reports whether the Pod informer cache has synced
	podLister        corelisterv1.PodLister
	gameServerSynced cache.InformerSynced // reports whether the GameServer informer cache has synced
	gameServerGetter getterv1.GameServersGetter
	gameServerLister listerv1.GameServerLister
	workerqueue      *workerqueue.WorkerQueue // rate limited queue of GameServer keys to sync
	recorder         record.EventRecorder
	waitOnFreePorts  bool // when true, "no free ports" scheduling failures are not treated as unhealthy (see unschedulableWithNoFreePorts)
}

// NewHealthController returns a HealthController that watches GameServer Pods
// via the shared informer factories, and enqueues any Pod that looks unhealthy
// for processing by syncGameServer. A liveness check for the worker queue is
// registered on the supplied health handler.
func NewHealthController(
	health healthcheck.Handler,
	kubeClient kubernetes.Interface,
	agonesClient versioned.Interface,
	kubeInformerFactory informers.SharedInformerFactory,
	agonesInformerFactory externalversions.SharedInformerFactory,
	waitOnFreePorts bool) *HealthController {

	podInformer := kubeInformerFactory.Core().V1().Pods().Informer()
	gameserverInformer := agonesInformerFactory.Agones().V1().GameServers()
	hc := &HealthController{
		podSynced:        podInformer.HasSynced,
		podLister:        kubeInformerFactory.Core().V1().Pods().Lister(),
		gameServerSynced: gameserverInformer.Informer().HasSynced,
		gameServerGetter: agonesClient.AgonesV1(),
		gameServerLister: gameserverInformer.Lister(),
		waitOnFreePorts:  waitOnFreePorts,
	}

	hc.baseLogger = runtime.NewLoggerWithType(hc)
	hc.workerqueue = workerqueue.NewWorkerQueue(hc.syncGameServer, hc.baseLogger, logfields.GameServerKey, agones.GroupName+".HealthController")
	health.AddLivenessCheck("gameserver-health-workerqueue", healthcheck.Check(hc.workerqueue.Healthy))

	// events raised by this controller are recorded against the GameServer,
	// attributed to the "health-controller" component
	eventBroadcaster := record.NewBroadcaster()
	eventBroadcaster.StartLogging(hc.baseLogger.Debugf)
	eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
	hc.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "health-controller"})

	_, _ = podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		UpdateFunc: func(_, newObj interface{}) {
			// only enqueue GameServer Pods that are not already being deleted
			// and currently look unhealthy
			pod := newObj.(*corev1.Pod)
			if pod.ObjectMeta.DeletionTimestamp.IsZero() && isGameServerPod(pod) && hc.isUnhealthy(pod) {
				hc.workerqueue.Enqueue(pod)
			}
		},
		DeleteFunc: func(obj interface{}) {
			// Could be a DeletedFinalStateUnknown, in which case, just ignore it
			pod, ok := obj.(*corev1.Pod)
			if ok && isGameServerPod(pod) {
				hc.workerqueue.Enqueue(pod)
			}
		},
	})
	return hc
}

// isUnhealthy returns if the Pod event is going
// to cause the GameServer to become Unhealthy
func (hc *HealthController) isUnhealthy(pod *corev1.Pod) bool {
	return hc.evictedPod(pod) || hc.unschedulableWithNoFreePorts(pod) || hc.failedContainer(pod) || hc.failedPod(pod)
}

// unschedulableWithNoFreePorts checks if the reason the Pod couldn't be scheduled
// was because there weren't any free ports in the range specified
func (hc *HealthController) unschedulableWithNoFreePorts(pod *corev1.Pod) bool {
	// return false, when sidecars are enabled, since we are just going to rely mostly on Pod Status.
	if runtime.FeatureEnabled(runtime.FeatureSidecarContainers) {
		return false
	}

	// On some cloud products (GKE Autopilot), wait on the Autoscaler to schedule a pod with conflicting ports.
	if hc.waitOnFreePorts {
		return false
	}
	for _, cond := range pod.Status.Conditions {
		if cond.Type == corev1.PodScheduled && cond.Reason == corev1.PodReasonUnschedulable {
			// the scheduler's failure message is the only signal we have that
			// port exhaustion (rather than some other constraint) was the cause
			if strings.Contains(cond.Message, "free ports") {
				hc.baseLogger.WithField("gs", pod.ObjectMeta.Name).WithField("conditions", pod.Status.Conditions).Debug("Pod Unschedulable With No Free Ports")
				return true
			}
		}
	}
	return false
}

// failedPod checks if the Pod's phase is "Failed"
func (hc *HealthController) failedPod(pod *corev1.Pod) bool {
	// return false, since a failed pod only happens when sidecars are enabled.
	if !runtime.FeatureEnabled(runtime.FeatureSidecarContainers) {
		return false
	}

	return pod.Status.Phase == corev1.PodFailed
}

// evictedPod checks if the Pod was Evicted
// could be caused by reaching limit on Ephemeral storage
func (hc *HealthController) evictedPod(pod *corev1.Pod) bool {
	evicted := pod.Status.Reason == "Evicted"
	if evicted {
		hc.baseLogger.WithField("gs", pod.ObjectMeta.Name).WithField("status", pod.Status).Debug("Pod Evicted")
	}
	return evicted
}

// failedContainer checks each container, and determines if the main gameserver
// container has failed
func (hc *HealthController) failedContainer(pod *corev1.Pod) bool {
	// return false, since there is no gameserver restart before ready. When this graduates, you can delete this
	// whole function.
	if runtime.FeatureEnabled(runtime.FeatureSidecarContainers) {
		return false
	}

	// the annotation names which container in the Pod is the game server container
	container := pod.Annotations[agonesv1.GameServerContainerAnnotation]
	for _, cs := range pod.Status.ContainerStatuses {
		if cs.Name == container {
			// sometimes on a restart, the cs.State can be running and the last state will be merged
			failed := cs.State.Terminated != nil || cs.LastTerminationState.Terminated != nil
			if failed {
				hc.baseLogger.WithField("gs", pod.ObjectMeta.Name).WithField("containerStatuses", pod.Status.ContainerStatuses).WithField("container", container).Debug("Container Failed")
			}
			return failed
		}
	}
	return false
}

// Run processes the rate limited queue.
// Will block until stop is closed
func (hc *HealthController) Run(ctx context.Context, workers int) error {
	hc.baseLogger.Debug("Wait for cache sync")
	if !cache.WaitForCacheSync(ctx.Done(), hc.gameServerSynced, hc.podSynced) {
		return errors.New("failed to wait for caches to sync")
	}

	hc.workerqueue.Run(ctx, workers)

	return nil
}

// loggerForGameServerKey returns the base logger augmented with the
// namespace/name key of the GameServer being processed.
func (hc *HealthController) loggerForGameServerKey(key string) *logrus.Entry {
	return logfields.AugmentLogEntry(hc.baseLogger, logfields.GameServerKey, key)
}

// loggerForGameServer returns a logger keyed by the GameServer's
// namespace/name (or a nil marker when gs is nil), with the full
// GameServer attached as a field.
func (hc *HealthController) loggerForGameServer(gs *agonesv1.GameServer) *logrus.Entry {
	gsName := logfields.NilGameServer
	if gs != nil {
		gsName = gs.Namespace + "/" + gs.Name
	}
	return hc.loggerForGameServerKey(gsName).WithField("gs", gs)
}

// syncGameServer sets the GameServer to Unhealthy, if its state is Ready
func (hc *HealthController) syncGameServer(ctx context.Context, key string) error {
	hc.loggerForGameServerKey(key).Debug("Synchronising")

	// Convert the namespace/name string into a distinct namespace and name
	namespace, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		// don't return an error, as we don't want this retried
		runtime.HandleError(hc.loggerForGameServerKey(key), errors.Wrapf(err, "invalid resource key"))
		return nil
	}

	gs, err := hc.gameServerLister.GameServers(namespace).Get(name)
	if err != nil {
		if k8serrors.IsNotFound(err) {
			// GameServer already gone: nothing to mark unhealthy
			hc.loggerForGameServerKey(key).Debug("GameServer is no longer available for syncing")
			return nil
		}
		return errors.Wrapf(err, "error retrieving GameServer %s from namespace %s", name, namespace)
	}

	// at this point we don't care, we're already Unhealthy / deleting
	if gs.IsBeingDeleted() || gs.Status.State == agonesv1.GameServerStateUnhealthy || gs.Status.State == agonesv1.GameServerStateError {
		return nil
	}

	// retrieve the pod for the gameserver
	pod, err := hc.podLister.Pods(gs.ObjectMeta.Namespace).Get(gs.ObjectMeta.Name)
	if err != nil {
		if !k8serrors.IsNotFound(err) {
			// If the pod exists but there is an error, go back into the queue.
			return errors.Wrapf(err, "error retrieving Pod %s for GameServer to check status", gs.ObjectMeta.Name)
		}
		// a missing Pod is not fatal here: pod stays nil and we fall through
		// to marking the GameServer Unhealthy below
		hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).Debug("Could not find Pod")
	}

	// Make sure that the pod has to be marked unhealthy
	if pod != nil {
		if skip, err := hc.skipUnhealthyGameContainer(gs, pod); err != nil || skip {
			return err
		}

		// If the pod is not unhealthy any more, go back in the queue
		if !hc.isUnhealthy(pod) {
			hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("podStatus", pod.Status).Debug("GameServer is not unhealthy anymore")
			return nil
		}
	}

	hc.loggerForGameServer(gs).Debug("Issue with GameServer pod, marking as GameServerStateUnhealthy")
	gsCopy := gs.DeepCopy()
	gsCopy.Status.State = agonesv1.GameServerStateUnhealthy

	if _, err := hc.gameServerGetter.GameServers(gs.ObjectMeta.Namespace).Update(ctx, gsCopy, metav1.UpdateOptions{}); err != nil {
		return errors.Wrapf(err, "error updating GameServer %s/%s to unhealthy", gs.ObjectMeta.Name, gs.ObjectMeta.Namespace)
	}

	hc.recorder.Event(gs, corev1.EventTypeWarning, string(gsCopy.Status.State), "Issue with Gameserver pod")

	return nil
}

// skipUnhealthyGameContainer determines if it's appropriate to not move to Unhealthy when a Pod's
// gameserver container has crashed, or let it restart as per usual K8s operations.
// It does this by checking a combination of the current GameServer state and annotation data that stores
// which container instance was live if the GameServer has been marked as Ready.
// The logic is as follows:
// - If the GameServer is not yet Ready, allow to restart (return true)
// - If the GameServer is in a state past Ready, move to Unhealthy
func (hc *HealthController) skipUnhealthyGameContainer(gs *agonesv1.GameServer, pod *corev1.Pod) (bool, error) {
	if !metav1.IsControlledBy(pod, gs) {
		// This is not the Pod we are looking for 🤖
		return false, nil
	}

	// if Sidecar is enabled, always return false, since there is no skip - it's just whatever K8s/Agones wants to do.
	// on move to stable, this function can be deleted.
	if runtime.FeatureEnabled(runtime.FeatureSidecarContainers) {
		return false, nil
	}

	// If the GameServer is before Ready, both annotation values should be ""
	// If the GameServer is past Ready, both the annotations should be exactly the same.
	// If the annotations are different, then the data between the GameServer and the Pod is out of sync,
	// in which case, send it back to the queue to try again.
	gsReadyContainerID := gs.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation]
	if pod.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation] != gsReadyContainerID {
		return false, workerqueue.NewTraceError(errors.Errorf("pod and gameserver %s data are out of sync, retrying", gs.ObjectMeta.Name))
	}

	if gs.IsBeforeReady() {
		hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("state", gs.Status.State).Debug("skipUnhealthyGameContainer: Is Before Ready. Checking failed container")
		// If the reason for failure was a container failure, then we can skip moving to Unhealthy.
		// otherwise, we know it was one of the other reasons (eviction, lack of ports), so we should definitely go to Unhealthy.
		return hc.failedContainer(pod), nil
	}

	// finally, we need to check if the failed container happened after the gameserver was ready or before.
	for _, cs := range pod.Status.ContainerStatuses {
		if cs.Name == gs.Spec.Container {
			if cs.State.Terminated != nil {
				// game container is currently terminated after Ready: do not skip
				hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("podStatus", pod.Status).Debug("skipUnhealthyGameContainer: Container is terminated, returning false")
				return false, nil
			}
			if cs.LastTerminationState.Terminated != nil {
				// if the current container is running, and is the ready container, then we know this is some
				// other pod update, and we previously had a restart before we got to being Ready, and therefore
				// shouldn't move to Unhealthy.
				check := cs.ContainerID == gsReadyContainerID
				if !check {
					hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("gsMeta", gs.ObjectMeta).WithField("podStatus", pod.Status).Debug("skipUnhealthyGameContainer: Container crashed after Ready, returning false")
				}
				return check, nil
			}
			break
		}
	}

	hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("gsMeta", gs.ObjectMeta).WithField("podStatus", pod.Status).Debug("skipUnhealthyGameContainer: Game Container has not crashed, game container may be healthy")
	return false, nil
}