agones.dev/agones@v1.54.0/pkg/gameservers/migration.go (about) 1 // Copyright 2020 Google LLC All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package gameservers 16 17 import ( 18 "context" 19 "strings" 20 21 "agones.dev/agones/pkg/apis/agones" 22 agonesv1 "agones.dev/agones/pkg/apis/agones/v1" 23 "agones.dev/agones/pkg/client/clientset/versioned" 24 getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1" 25 "agones.dev/agones/pkg/client/informers/externalversions" 26 listerv1 "agones.dev/agones/pkg/client/listers/agones/v1" 27 "agones.dev/agones/pkg/util/logfields" 28 "agones.dev/agones/pkg/util/runtime" 29 "agones.dev/agones/pkg/util/workerqueue" 30 "github.com/heptiolabs/healthcheck" 31 "github.com/pkg/errors" 32 "github.com/sirupsen/logrus" 33 corev1 "k8s.io/api/core/v1" 34 k8sv1 "k8s.io/api/core/v1" 35 k8serrors "k8s.io/apimachinery/pkg/api/errors" 36 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 37 "k8s.io/client-go/informers" 38 "k8s.io/client-go/kubernetes" 39 "k8s.io/client-go/kubernetes/scheme" 40 typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" 41 corelisterv1 "k8s.io/client-go/listers/core/v1" 42 "k8s.io/client-go/tools/cache" 43 "k8s.io/client-go/tools/record" 44 ) 45 46 // MigrationController watches for if a Pod is migrated/a maintenance 47 // event happens on a node, and a Pod is recreated with a new Address for a 48 // GameServer 49 type MigrationController struct { 50 baseLogger *logrus.Entry 51 podSynced cache.InformerSynced 52 podLister corelisterv1.PodLister 53 gameServerSynced cache.InformerSynced 54 gameServerGetter getterv1.GameServersGetter 55 gameServerLister listerv1.GameServerLister 56 nodeLister corelisterv1.NodeLister 57 nodeSynced cache.InformerSynced 58 workerqueue *workerqueue.WorkerQueue 59 recorder record.EventRecorder 60 syncPodPortsToGameServer func(*agonesv1.GameServer, *corev1.Pod) error 61 } 62 63 // NewMigrationController returns a MigrationController 64 func NewMigrationController(health healthcheck.Handler, 65 kubeClient kubernetes.Interface, 66 agonesClient versioned.Interface, 67 kubeInformerFactory informers.SharedInformerFactory, 68 agonesInformerFactory externalversions.SharedInformerFactory, 69 syncPodPortsToGameServer func(*agonesv1.GameServer, *corev1.Pod) error, 70 ) *MigrationController { 71 72 podInformer := kubeInformerFactory.Core().V1().Pods().Informer() 73 gameserverInformer := agonesInformerFactory.Agones().V1().GameServers() 74 mc := &MigrationController{ 75 podSynced: podInformer.HasSynced, 76 podLister: kubeInformerFactory.Core().V1().Pods().Lister(), 77 gameServerSynced: gameserverInformer.Informer().HasSynced, 78 gameServerGetter: agonesClient.AgonesV1(), 79 gameServerLister: gameserverInformer.Lister(), 80 nodeLister: kubeInformerFactory.Core().V1().Nodes().Lister(), 81 nodeSynced: kubeInformerFactory.Core().V1().Nodes().Informer().HasSynced, 82 syncPodPortsToGameServer: syncPodPortsToGameServer, 83 } 84 85 mc.baseLogger = runtime.NewLoggerWithType(mc) 86 mc.workerqueue = workerqueue.NewWorkerQueue(mc.syncGameServer, mc.baseLogger, logfields.GameServerKey, agones.GroupName+".MigrationController") 87 health.AddLivenessCheck("gameserver-migration-workerqueue", healthcheck.Check(mc.workerqueue.Healthy)) 88 89 eventBroadcaster := record.NewBroadcaster() 90 eventBroadcaster.StartLogging(mc.baseLogger.Debugf) 91 eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) 92 mc.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "migration-controller"}) 93 94 _, _ = podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ 95 AddFunc: func(obj interface{}) { 96 pod := obj.(*corev1.Pod) 97 if _, _, ok, err := mc.isMigratingGameServerPod(pod); err != nil || ok { 98 mc.workerqueue.Enqueue(pod) 99 } 100 }, 101 UpdateFunc: func(_, newObj interface{}) { 102 pod := newObj.(*corev1.Pod) 103 if _, _, ok, err := mc.isMigratingGameServerPod(pod); err != nil || ok { 104 mc.workerqueue.Enqueue(pod) 105 } 106 }, 107 }) 108 return mc 109 } 110 111 // Run processes the rate limited queue. 112 // Will block until stop is closed 113 func (mc *MigrationController) Run(ctx context.Context, workers int) error { 114 mc.baseLogger.Debug("Wait for cache sync") 115 if !cache.WaitForCacheSync(ctx.Done(), mc.nodeSynced, mc.gameServerSynced, mc.podSynced) { 116 return errors.New("failed to wait for caches to sync") 117 } 118 119 mc.workerqueue.Run(ctx, workers) 120 return nil 121 } 122 123 func (mc *MigrationController) loggerForGameServerKey(key string) *logrus.Entry { 124 return logfields.AugmentLogEntry(mc.baseLogger, logfields.GameServerKey, key) 125 } 126 127 func (mc *MigrationController) loggerForGameServer(gs *agonesv1.GameServer) *logrus.Entry { 128 gsName := logfields.NilGameServer 129 if gs != nil { 130 gsName = gs.Namespace + "/" + gs.Name 131 } 132 return mc.loggerForGameServerKey(gsName).WithField("gs", gs) 133 } 134 135 func (mc *MigrationController) isMigratingGameServerPod(pod *k8sv1.Pod) (*agonesv1.GameServer, *k8sv1.Node, bool, error) { 136 if pod.Spec.NodeName == "" || !pod.ObjectMeta.DeletionTimestamp.IsZero() || !isGameServerPod(pod) { 137 return nil, nil, false, nil 138 } 139 140 key := pod.Namespace + "/" + pod.Name 141 gs, err := mc.gameServerLister.GameServers(pod.ObjectMeta.Namespace).Get(pod.ObjectMeta.Name) 142 if err != nil { 143 if k8serrors.IsNotFound(err) { 144 mc.loggerForGameServerKey(key).Debug("GameServer is no longer available for syncing") 145 return nil, nil, false, nil 146 } 147 return nil, nil, false, errors.Wrapf(err, "error retrieving GameServer %s from namespace %s", pod.ObjectMeta.Name, pod.ObjectMeta.Namespace) 148 } 149 150 // Either the address has not been set, or we're being deleted already 151 if gs.Status.NodeName == "" || gs.IsBeingDeleted() || gs.Status.State == agonesv1.GameServerStateUnhealthy { 152 return nil, nil, false, nil 153 } 154 155 if pod.Spec.NodeName == "" { 156 return nil, nil, false, workerqueue.NewTraceError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name)) 157 } 158 159 node, err := mc.nodeLister.Get(pod.Spec.NodeName) 160 if err != nil { 161 if k8serrors.IsNotFound(err) { 162 mc.loggerForGameServerKey(key).WithField("node", pod.Spec.NodeName).Debug("Node is no longer available for syncing") 163 return nil, nil, false, nil 164 } 165 return nil, nil, false, errors.Wrapf(err, "error retrieving node %s for Pod %s", pod.Spec.NodeName, pod.ObjectMeta.Name) 166 } 167 168 // if the node is being terminated, then also escape, because the Pod is going to be Terminated if it hasn't been 169 // already. 170 if !node.ObjectMeta.DeletionTimestamp.IsZero() { 171 return nil, nil, false, nil 172 } 173 174 // if the nodes match, and the default GameServer Address matches one of the node addresses - escape, since 175 // migration isn't happening. 176 if pod.Spec.NodeName == gs.Status.NodeName && mc.anyAddressMatch(node, gs) { 177 return nil, nil, false, nil 178 } 179 180 return gs, node, true, nil 181 } 182 183 // syncGameServer will check if the Pod for the GameServer 184 // has been migrated to a new node (or a node with the same name, but different address) 185 // and will either update it, or move it to Unhealthy, depending on its State 186 func (mc *MigrationController) syncGameServer(ctx context.Context, key string) error { 187 namespace, name, err := cache.SplitMetaNamespaceKey(key) 188 if err != nil { 189 // don't return an error, as we don't want this retried 190 runtime.HandleError(mc.loggerForGameServerKey(key), errors.Wrapf(err, "invalid resource key")) 191 return nil 192 } 193 194 pod, err := mc.podLister.Pods(namespace).Get(name) 195 if err != nil { 196 if k8serrors.IsNotFound(err) { 197 mc.loggerForGameServerKey(key).Debug("Pod is no longer available for syncing") 198 return nil 199 } 200 return errors.Wrapf(err, "error retrieving Pod %s from namespace %s", name, namespace) 201 } 202 203 gs, node, ok, err := mc.isMigratingGameServerPod(pod) 204 // if there is an error, retry, but if not migrating then escape and continue on with your life doing other things. 205 if err != nil || !ok { 206 return err 207 } 208 209 // If the GameServer has yet to become ready, we will reapply the Address and Port 210 // otherwise, we move it to Unhealthy so that a new GameServer will be recreated. 211 gsCopy := gs.DeepCopy() 212 var eventMsg string 213 if gsCopy.IsBeforeReady() { 214 gsCopy, err = applyGameServerAddressAndPort(gsCopy, node, pod, mc.syncPodPortsToGameServer) 215 if err != nil { 216 return err 217 } 218 eventMsg = "Address updated due to Node migration" 219 } else { 220 gsCopy.Status.State = agonesv1.GameServerStateUnhealthy 221 eventMsg = "Node migration occurred" 222 } 223 224 if gs, err = mc.gameServerGetter.GameServers(gsCopy.ObjectMeta.Namespace).Update(ctx, gsCopy, metav1.UpdateOptions{}); err != nil { 225 return err 226 } 227 228 mc.loggerForGameServer(gs).Debug("GameServer migration occurred") 229 mc.recorder.Event(gs, corev1.EventTypeWarning, string(gsCopy.Status.State), eventMsg) 230 231 return nil 232 } 233 234 func (mc *MigrationController) anyAddressMatch(node *k8sv1.Node, gs *agonesv1.GameServer) bool { 235 var nodeAddresses []string 236 for _, a := range node.Status.Addresses { 237 if a.Address == gs.Status.Address { 238 return true 239 } 240 nodeAddresses = append(nodeAddresses, a.Address) 241 } 242 mc.loggerForGameServer(gs). 243 WithField("gs", gs.Name). 244 WithField("gs.Status.Address", gs.Status.Address). 245 WithField("node.Status.Addresses", strings.Join(nodeAddresses, ",")). 246 Warn("GameServer/Node address mismatch") 247 return false 248 }