agones.dev/agones@v1.54.0/pkg/gameservers/migration.go (about)

     1  // Copyright 2020 Google LLC All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package gameservers
    16  
    17  import (
    18  	"context"
    19  	"strings"
    20  
    21  	"agones.dev/agones/pkg/apis/agones"
    22  	agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
    23  	"agones.dev/agones/pkg/client/clientset/versioned"
    24  	getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1"
    25  	"agones.dev/agones/pkg/client/informers/externalversions"
    26  	listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
    27  	"agones.dev/agones/pkg/util/logfields"
    28  	"agones.dev/agones/pkg/util/runtime"
    29  	"agones.dev/agones/pkg/util/workerqueue"
    30  	"github.com/heptiolabs/healthcheck"
    31  	"github.com/pkg/errors"
    32  	"github.com/sirupsen/logrus"
    33  	corev1 "k8s.io/api/core/v1"
    34  	k8sv1 "k8s.io/api/core/v1"
    35  	k8serrors "k8s.io/apimachinery/pkg/api/errors"
    36  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    37  	"k8s.io/client-go/informers"
    38  	"k8s.io/client-go/kubernetes"
    39  	"k8s.io/client-go/kubernetes/scheme"
    40  	typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
    41  	corelisterv1 "k8s.io/client-go/listers/core/v1"
    42  	"k8s.io/client-go/tools/cache"
    43  	"k8s.io/client-go/tools/record"
    44  )
    45  
    46  // MigrationController watches for if a Pod is migrated/a maintenance
    47  // event happens on a node, and a Pod is recreated with a new Address for a
    48  // GameServer
    49  type MigrationController struct {
    50  	baseLogger               *logrus.Entry
    51  	podSynced                cache.InformerSynced
    52  	podLister                corelisterv1.PodLister
    53  	gameServerSynced         cache.InformerSynced
    54  	gameServerGetter         getterv1.GameServersGetter
    55  	gameServerLister         listerv1.GameServerLister
    56  	nodeLister               corelisterv1.NodeLister
    57  	nodeSynced               cache.InformerSynced
    58  	workerqueue              *workerqueue.WorkerQueue
    59  	recorder                 record.EventRecorder
    60  	syncPodPortsToGameServer func(*agonesv1.GameServer, *corev1.Pod) error
    61  }
    62  
    63  // NewMigrationController returns a MigrationController
    64  func NewMigrationController(health healthcheck.Handler,
    65  	kubeClient kubernetes.Interface,
    66  	agonesClient versioned.Interface,
    67  	kubeInformerFactory informers.SharedInformerFactory,
    68  	agonesInformerFactory externalversions.SharedInformerFactory,
    69  	syncPodPortsToGameServer func(*agonesv1.GameServer, *corev1.Pod) error,
    70  ) *MigrationController {
    71  
    72  	podInformer := kubeInformerFactory.Core().V1().Pods().Informer()
    73  	gameserverInformer := agonesInformerFactory.Agones().V1().GameServers()
    74  	mc := &MigrationController{
    75  		podSynced:                podInformer.HasSynced,
    76  		podLister:                kubeInformerFactory.Core().V1().Pods().Lister(),
    77  		gameServerSynced:         gameserverInformer.Informer().HasSynced,
    78  		gameServerGetter:         agonesClient.AgonesV1(),
    79  		gameServerLister:         gameserverInformer.Lister(),
    80  		nodeLister:               kubeInformerFactory.Core().V1().Nodes().Lister(),
    81  		nodeSynced:               kubeInformerFactory.Core().V1().Nodes().Informer().HasSynced,
    82  		syncPodPortsToGameServer: syncPodPortsToGameServer,
    83  	}
    84  
    85  	mc.baseLogger = runtime.NewLoggerWithType(mc)
    86  	mc.workerqueue = workerqueue.NewWorkerQueue(mc.syncGameServer, mc.baseLogger, logfields.GameServerKey, agones.GroupName+".MigrationController")
    87  	health.AddLivenessCheck("gameserver-migration-workerqueue", healthcheck.Check(mc.workerqueue.Healthy))
    88  
    89  	eventBroadcaster := record.NewBroadcaster()
    90  	eventBroadcaster.StartLogging(mc.baseLogger.Debugf)
    91  	eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
    92  	mc.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "migration-controller"})
    93  
    94  	_, _ = podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
    95  		AddFunc: func(obj interface{}) {
    96  			pod := obj.(*corev1.Pod)
    97  			if _, _, ok, err := mc.isMigratingGameServerPod(pod); err != nil || ok {
    98  				mc.workerqueue.Enqueue(pod)
    99  			}
   100  		},
   101  		UpdateFunc: func(_, newObj interface{}) {
   102  			pod := newObj.(*corev1.Pod)
   103  			if _, _, ok, err := mc.isMigratingGameServerPod(pod); err != nil || ok {
   104  				mc.workerqueue.Enqueue(pod)
   105  			}
   106  		},
   107  	})
   108  	return mc
   109  }
   110  
   111  // Run processes the rate limited queue.
   112  // Will block until stop is closed
   113  func (mc *MigrationController) Run(ctx context.Context, workers int) error {
   114  	mc.baseLogger.Debug("Wait for cache sync")
   115  	if !cache.WaitForCacheSync(ctx.Done(), mc.nodeSynced, mc.gameServerSynced, mc.podSynced) {
   116  		return errors.New("failed to wait for caches to sync")
   117  	}
   118  
   119  	mc.workerqueue.Run(ctx, workers)
   120  	return nil
   121  }
   122  
   123  func (mc *MigrationController) loggerForGameServerKey(key string) *logrus.Entry {
   124  	return logfields.AugmentLogEntry(mc.baseLogger, logfields.GameServerKey, key)
   125  }
   126  
   127  func (mc *MigrationController) loggerForGameServer(gs *agonesv1.GameServer) *logrus.Entry {
   128  	gsName := logfields.NilGameServer
   129  	if gs != nil {
   130  		gsName = gs.Namespace + "/" + gs.Name
   131  	}
   132  	return mc.loggerForGameServerKey(gsName).WithField("gs", gs)
   133  }
   134  
   135  func (mc *MigrationController) isMigratingGameServerPod(pod *k8sv1.Pod) (*agonesv1.GameServer, *k8sv1.Node, bool, error) {
   136  	if pod.Spec.NodeName == "" || !pod.ObjectMeta.DeletionTimestamp.IsZero() || !isGameServerPod(pod) {
   137  		return nil, nil, false, nil
   138  	}
   139  
   140  	key := pod.Namespace + "/" + pod.Name
   141  	gs, err := mc.gameServerLister.GameServers(pod.ObjectMeta.Namespace).Get(pod.ObjectMeta.Name)
   142  	if err != nil {
   143  		if k8serrors.IsNotFound(err) {
   144  			mc.loggerForGameServerKey(key).Debug("GameServer is no longer available for syncing")
   145  			return nil, nil, false, nil
   146  		}
   147  		return nil, nil, false, errors.Wrapf(err, "error retrieving GameServer %s from namespace %s", pod.ObjectMeta.Name, pod.ObjectMeta.Namespace)
   148  	}
   149  
   150  	// Either the address has not been set, or we're being deleted already
   151  	if gs.Status.NodeName == "" || gs.IsBeingDeleted() || gs.Status.State == agonesv1.GameServerStateUnhealthy {
   152  		return nil, nil, false, nil
   153  	}
   154  
   155  	if pod.Spec.NodeName == "" {
   156  		return nil, nil, false, workerqueue.NewTraceError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name))
   157  	}
   158  
   159  	node, err := mc.nodeLister.Get(pod.Spec.NodeName)
   160  	if err != nil {
   161  		if k8serrors.IsNotFound(err) {
   162  			mc.loggerForGameServerKey(key).WithField("node", pod.Spec.NodeName).Debug("Node is no longer available for syncing")
   163  			return nil, nil, false, nil
   164  		}
   165  		return nil, nil, false, errors.Wrapf(err, "error retrieving node %s for Pod %s", pod.Spec.NodeName, pod.ObjectMeta.Name)
   166  	}
   167  
   168  	// if the node is being terminated, then also escape, because the Pod is going to be Terminated if it hasn't been
   169  	// already.
   170  	if !node.ObjectMeta.DeletionTimestamp.IsZero() {
   171  		return nil, nil, false, nil
   172  	}
   173  
   174  	// if the nodes match, and the default GameServer Address matches one of the node addresses - escape, since
   175  	// migration isn't happening.
   176  	if pod.Spec.NodeName == gs.Status.NodeName && mc.anyAddressMatch(node, gs) {
   177  		return nil, nil, false, nil
   178  	}
   179  
   180  	return gs, node, true, nil
   181  }
   182  
   183  // syncGameServer will check if the Pod for the GameServer
   184  // has been migrated to a new node (or a node with the same name, but different address)
   185  // and will either update it, or move it to Unhealthy, depending on its State
   186  func (mc *MigrationController) syncGameServer(ctx context.Context, key string) error {
   187  	namespace, name, err := cache.SplitMetaNamespaceKey(key)
   188  	if err != nil {
   189  		// don't return an error, as we don't want this retried
   190  		runtime.HandleError(mc.loggerForGameServerKey(key), errors.Wrapf(err, "invalid resource key"))
   191  		return nil
   192  	}
   193  
   194  	pod, err := mc.podLister.Pods(namespace).Get(name)
   195  	if err != nil {
   196  		if k8serrors.IsNotFound(err) {
   197  			mc.loggerForGameServerKey(key).Debug("Pod is no longer available for syncing")
   198  			return nil
   199  		}
   200  		return errors.Wrapf(err, "error retrieving Pod %s from namespace %s", name, namespace)
   201  	}
   202  
   203  	gs, node, ok, err := mc.isMigratingGameServerPod(pod)
   204  	// if there is an error, retry, but if not migrating then escape and continue on with your life doing other things.
   205  	if err != nil || !ok {
   206  		return err
   207  	}
   208  
   209  	// If the GameServer has yet to become ready, we will reapply the Address and Port
   210  	// otherwise, we move it to Unhealthy so that a new GameServer will be recreated.
   211  	gsCopy := gs.DeepCopy()
   212  	var eventMsg string
   213  	if gsCopy.IsBeforeReady() {
   214  		gsCopy, err = applyGameServerAddressAndPort(gsCopy, node, pod, mc.syncPodPortsToGameServer)
   215  		if err != nil {
   216  			return err
   217  		}
   218  		eventMsg = "Address updated due to Node migration"
   219  	} else {
   220  		gsCopy.Status.State = agonesv1.GameServerStateUnhealthy
   221  		eventMsg = "Node migration occurred"
   222  	}
   223  
   224  	if gs, err = mc.gameServerGetter.GameServers(gsCopy.ObjectMeta.Namespace).Update(ctx, gsCopy, metav1.UpdateOptions{}); err != nil {
   225  		return err
   226  	}
   227  
   228  	mc.loggerForGameServer(gs).Debug("GameServer migration occurred")
   229  	mc.recorder.Event(gs, corev1.EventTypeWarning, string(gsCopy.Status.State), eventMsg)
   230  
   231  	return nil
   232  }
   233  
   234  func (mc *MigrationController) anyAddressMatch(node *k8sv1.Node, gs *agonesv1.GameServer) bool {
   235  	var nodeAddresses []string
   236  	for _, a := range node.Status.Addresses {
   237  		if a.Address == gs.Status.Address {
   238  			return true
   239  		}
   240  		nodeAddresses = append(nodeAddresses, a.Address)
   241  	}
   242  	mc.loggerForGameServer(gs).
   243  		WithField("gs", gs.Name).
   244  		WithField("gs.Status.Address", gs.Status.Address).
   245  		WithField("node.Status.Addresses", strings.Join(nodeAddresses, ",")).
   246  		Warn("GameServer/Node address mismatch")
   247  	return false
   248  }