agones.dev/agones@v1.53.0/pkg/cloudproduct/gke/gke.go (about)

     1  // Copyright 2022 Google LLC All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //	http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package gke implements the GKE cloud product (specifically Autopilot for now)
    16  package gke
    17  
    18  import (
    19  	"context"
    20  	"encoding/json"
    21  	"fmt"
    22  
    23  	"agones.dev/agones/pkg/apis"
    24  	agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
    25  	"agones.dev/agones/pkg/client/informers/externalversions"
    26  	"agones.dev/agones/pkg/cloudproduct/eviction"
    27  	"agones.dev/agones/pkg/portallocator"
    28  	"agones.dev/agones/pkg/util/runtime"
    29  	"cloud.google.com/go/compute/metadata"
    30  	"github.com/pkg/errors"
    31  	corev1 "k8s.io/api/core/v1"
    32  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    33  	"k8s.io/apimachinery/pkg/util/validation/field"
    34  	"k8s.io/client-go/informers"
    35  	"k8s.io/client-go/kubernetes"
    36  )
    37  
const (
	// Annotation keys exchanged with GKE Autopilot. The host port assignment
	// annotation is written onto the GameServer's Pod template by the port
	// allocator and read back from the Pod when syncing ports; the primary
	// container annotation is set on the Pod during mutation.
	hostPortAssignmentAnnotation = "autopilot.gke.io/host-port-assignment"
	primaryContainerAnnotation   = "autopilot.gke.io/primary-container"

	// Messages attached to the field errors produced by the Autopilot
	// validation hooks in this file.
	errPortPolicyMustBeDynamicOrNone = "portPolicy must be Dynamic or None on GKE Autopilot"
	errRangeInvalid                  = "range must not be used on GKE Autopilot"
	errSchedulingMustBePacked        = "scheduling strategy must be Packed on GKE Autopilot"
	errEvictionSafeOnUpgradeInvalid  = "eviction.safe OnUpgrade not supported on GKE Autopilot"
)
    47  
var (
	// autopilotMutatingWebhooks lists the names of the MutatingWebhookConfigurations
	// that Detect looks up; finding any one of them is treated as running on Autopilot.
	autopilotMutatingWebhooks = []string{
		"workload-defaulter.config.common-webhooks.networking.gke.io", // pre-1.26
		"sasecret-redacter.config.common-webhooks.networking.gke.io",  // 1.26+
	}
	// noWorkloadDefaulter is the "reason" logged by Detect when none of the
	// webhooks above could be fetched.
	noWorkloadDefaulter = fmt.Sprintf("found no MutatingWebhookConfigurations matching %v", autopilotMutatingWebhooks)

	// logger is the package-level logger, tagged with the source "gke".
	logger = runtime.NewLoggerWithSource("gke")
)
    57  
// gkeAutopilot is the GKE Autopilot cloud product. The hook methods in this
// file (validation, Pod mutation, eviction and port allocation) hang off it.
type gkeAutopilot struct {
	// useExtendedDurationPods is captured from the GKEAutopilotExtendedDurationPods
	// feature gate by Autopilot(). When true, SetEviction delegates to the generic
	// eviction package and eviction.safe OnUpgrade is not rejected by validation.
	useExtendedDurationPods bool
}
    61  
// hostPortAssignment is the JSON structure of the `host-port-assignment` annotation.
// Allocate writes it (Min/Max only) onto the GameServer's Pod template, and
// SyncPodPortsToGameServer reads it back from the Pod to pick up PortsAssigned.
//
//nolint:govet // API-like, keep consistent
type hostPortAssignment struct {
	// Min and Max are filled from the port allocator's minPort and maxPort.
	Min           int32           `json:"min,omitempty"`
	Max           int32           `json:"max,omitempty"`
	// PortsAssigned maps the host port originally on the GameServer port
	// to the host port that replaces it.
	PortsAssigned map[int32]int32 `json:"portsAssigned,omitempty"` // old -> new
}
    70  
    71  // Detect whether we're running on GKE and/or Autopilot and return the appropriate
    72  // cloud product string.
    73  func Detect(ctx context.Context, kc *kubernetes.Clientset) string {
    74  	if !metadata.OnGCE() {
    75  		return ""
    76  	}
    77  	// Look for the workload defaulter - this is the current best method to detect Autopilot
    78  	found := false
    79  	for _, webhook := range autopilotMutatingWebhooks {
    80  		if _, err := kc.AdmissionregistrationV1().MutatingWebhookConfigurations().Get(
    81  			ctx, webhook, metav1.GetOptions{}); err != nil {
    82  			logger.WithError(err).WithField("webhook", webhook).Info("Detecting Autopilot MutatingWebhookConfiguration")
    83  		} else {
    84  			found = true
    85  			break
    86  		}
    87  	}
    88  	if !found {
    89  		logger.WithField("reason", noWorkloadDefaulter).Info(
    90  			"Assuming GKE Standard and defaulting to generic provider (expected if not on GKE Autopilot)")
    91  		return "" // GKE standard, but we don't need an interface for it just yet.
    92  	}
    93  	logger.Info("Running on GKE Autopilot (skip detection with --cloud-product=gke-autopilot)")
    94  	return "gke-autopilot"
    95  }
    96  
    97  // Autopilot returns a GKE Autopilot cloud product
    98  //
    99  //nolint:revive // ignore the unexported return; implements ControllerHooksInterface
   100  func Autopilot() *gkeAutopilot {
   101  	return &gkeAutopilot{useExtendedDurationPods: runtime.FeatureEnabled(runtime.FeatureGKEAutopilotExtendedDurationPods)}
   102  }
   103  
   104  func (*gkeAutopilot) SyncPodPortsToGameServer(gs *agonesv1.GameServer, pod *corev1.Pod) error {
   105  	// If applyGameServerAddressAndPort has already filled in Status, SyncPodPortsToGameServer
   106  	// has already run. Skip syncing from the Pod again - this avoids having to reason
   107  	// about whether we're re-applying the old->new mapping.
   108  	if len(gs.Status.Ports) == len(gs.Spec.Ports) {
   109  		return nil
   110  	}
   111  	annotation, ok := pod.ObjectMeta.Annotations[hostPortAssignmentAnnotation]
   112  	if !ok {
   113  		return nil
   114  	}
   115  	var hpa hostPortAssignment
   116  	if err := json.Unmarshal([]byte(annotation), &hpa); err != nil {
   117  		return errors.Wrapf(err, "could not unmarshal annotation %s (value %q)", hostPortAssignmentAnnotation, annotation)
   118  	}
   119  	for i, p := range gs.Spec.Ports {
   120  		if newPort, ok := hpa.PortsAssigned[p.HostPort]; ok {
   121  			gs.Spec.Ports[i].HostPort = newPort
   122  		}
   123  	}
   124  	return nil
   125  }
   126  
   127  func (*gkeAutopilot) NewPortAllocator(portRanges map[string]portallocator.PortRange,
   128  	_ informers.SharedInformerFactory,
   129  	_ externalversions.SharedInformerFactory,
   130  ) portallocator.Interface {
   131  	defPortRange := portRanges[agonesv1.DefaultPortRange]
   132  	return &autopilotPortAllocator{minPort: defPortRange.MinPort, maxPort: defPortRange.MaxPort}
   133  }
   134  
   135  func (*gkeAutopilot) WaitOnFreePorts() bool { return true }
   136  
   137  func checkPassthroughPortPolicy(portPolicy agonesv1.PortPolicy) bool {
   138  	// if feature is not enabled and port is Passthrough return true because that should be an invalid port
   139  	// if feature is not enabled and port is not Passthrough you can return false because there's no error  but check for None port
   140  	// if feature is enabled and port is passthrough return false because there is no error
   141  	// if feature is enabled and port is not passthrough return false because there is no error but check for None port
   142  	return (!runtime.FeatureEnabled(runtime.FeatureAutopilotPassthroughPort) && portPolicy == agonesv1.Passthrough) || portPolicy == agonesv1.Static
   143  }
   144  
   145  func (g *gkeAutopilot) ValidateGameServerSpec(gss *agonesv1.GameServerSpec, fldPath *field.Path) field.ErrorList {
   146  	allErrs := g.ValidateScheduling(gss.Scheduling, fldPath.Child("scheduling"))
   147  	for i, p := range gss.Ports {
   148  		if p.PortPolicy != agonesv1.Dynamic && (p.PortPolicy != agonesv1.None || !runtime.FeatureEnabled(runtime.FeaturePortPolicyNone)) && checkPassthroughPortPolicy(p.PortPolicy) {
   149  			allErrs = append(allErrs, field.Invalid(fldPath.Child("ports").Index(i).Child("portPolicy"), string(p.PortPolicy), errPortPolicyMustBeDynamicOrNone))
   150  		}
   151  		if p.Range != agonesv1.DefaultPortRange && (p.PortPolicy != agonesv1.None || !runtime.FeatureEnabled(runtime.FeaturePortPolicyNone)) {
   152  			allErrs = append(allErrs, field.Invalid(fldPath.Child("ports").Index(i).Child("range"), p.Range, errRangeInvalid))
   153  		}
   154  	}
   155  	// See SetEviction comment below for why we block EvictionSafeOnUpgrade, if Extended Duration pods aren't supported.
   156  	if !g.useExtendedDurationPods && gss.Eviction.Safe == agonesv1.EvictionSafeOnUpgrade {
   157  		allErrs = append(allErrs, field.Invalid(fldPath.Child("eviction").Child("safe"), string(gss.Eviction.Safe), errEvictionSafeOnUpgradeInvalid))
   158  	}
   159  	return allErrs
   160  }
   161  
   162  func (*gkeAutopilot) ValidateScheduling(ss apis.SchedulingStrategy, fldPath *field.Path) field.ErrorList {
   163  	if ss != apis.Packed {
   164  		return field.ErrorList{
   165  			field.Invalid(fldPath, string(ss), errSchedulingMustBePacked),
   166  		}
   167  	}
   168  	return nil
   169  }
   170  
   171  func (*gkeAutopilot) MutateGameServerPod(gss *agonesv1.GameServerSpec, pod *corev1.Pod) error {
   172  	setPassthroughLabel(gss, pod)
   173  	setPrimaryContainer(pod, gss.Container)
   174  	podSpecSeccompUnconfined(&pod.Spec)
   175  	return nil
   176  }
   177  
   178  // setPassthroughLabel sets the agones.dev/port: "autopilot-passthrough" label to the game server container.
   179  // This will help to back the container port from the allocated port using an objectSelector of this label
   180  // in GameServers that are using Passthrough Port Policy
   181  func setPassthroughLabel(gs *agonesv1.GameServerSpec, pod *corev1.Pod) {
   182  	if runtime.FeatureEnabled(runtime.FeatureAutopilotPassthroughPort) && hasPortPolicy(gs, agonesv1.Passthrough) {
   183  		pod.ObjectMeta.Labels[agonesv1.GameServerPortPolicyPodLabel] = "autopilot-passthrough"
   184  	}
   185  }
   186  
   187  // setPrimaryContainer sets the autopilot.gke.io/primary-container annotation to the game server container.
   188  // This acts as a hint to Autopilot for which container to add resources to during resource adjustment.
   189  // See https://cloud.google.com/kubernetes-engine/docs/concepts/autopilot-resource-requests#autopilot-resource-management
   190  // for more details.
   191  func setPrimaryContainer(pod *corev1.Pod, containerName string) {
   192  	if _, ok := pod.ObjectMeta.Annotations[primaryContainerAnnotation]; ok {
   193  		return
   194  	}
   195  	pod.ObjectMeta.Annotations[primaryContainerAnnotation] = containerName
   196  }
   197  
   198  // podSpecSeccompUnconfined sets to seccomp profile to `Unconfined` to avoid serious performance
   199  // degradation possible with seccomp. We only set the pod level seccompProfile, and only set
   200  // it if it hasn't been set - users can then override at either the pod or container level
   201  // in the GameServer spec.
   202  func podSpecSeccompUnconfined(podSpec *corev1.PodSpec) {
   203  	if podSpec.SecurityContext != nil && podSpec.SecurityContext.SeccompProfile != nil {
   204  		return
   205  	}
   206  	if podSpec.SecurityContext == nil {
   207  		podSpec.SecurityContext = &corev1.PodSecurityContext{}
   208  	}
   209  	podSpec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{Type: corev1.SeccompProfileTypeUnconfined}
   210  }
   211  
   212  func (g *gkeAutopilot) SetEviction(ev *agonesv1.Eviction, pod *corev1.Pod) error {
   213  	if g.useExtendedDurationPods {
   214  		return eviction.SetEviction(ev, pod)
   215  	}
   216  	return setEvictionNoExtended(ev, pod)
   217  }
   218  
   219  // setEvictionNoExtended sets disruption controls based on GameServer.Status.Eviction. For Autopilot:
   220  //   - Since the safe-to-evict pod annotation is not supported if "false", we delete it (if it's set
   221  //     to anything else, we allow it - Autopilot only rejects "false").
   222  //   - OnUpgrade is not supported and rejected by validation above. Since we can't support
   223  //     safe-to-evict=false but can support a restrictive PDB, we can support Never and Always, but
   224  //     OnUpgrade doesn't make sense on Autopilot today. - an overly restrictive PDB prevents
   225  //     any sort of graceful eviction.
   226  func setEvictionNoExtended(ev *agonesv1.Eviction, pod *corev1.Pod) error {
   227  	if safeAnnotation := pod.ObjectMeta.Annotations[agonesv1.PodSafeToEvictAnnotation]; safeAnnotation == agonesv1.False {
   228  		delete(pod.ObjectMeta.Annotations, agonesv1.PodSafeToEvictAnnotation)
   229  	}
   230  	if ev == nil {
   231  		return errors.New("No eviction value set. Should be the default value")
   232  	}
   233  	if _, exists := pod.ObjectMeta.Labels[agonesv1.SafeToEvictLabel]; !exists {
   234  		switch ev.Safe {
   235  		case agonesv1.EvictionSafeAlways:
   236  			// For EvictionSafeAlways, we use a label value that does not match the
   237  			// agones-gameserver-safe-to-evict-false PDB. But we go ahead and label
   238  			// it, in case someone wants to adopt custom logic for this group of
   239  			// game servers.
   240  			pod.ObjectMeta.Labels[agonesv1.SafeToEvictLabel] = agonesv1.True
   241  		case agonesv1.EvictionSafeNever:
   242  			pod.ObjectMeta.Labels[agonesv1.SafeToEvictLabel] = agonesv1.False
   243  		default:
   244  			return errors.Errorf("eviction.safe == %s, which webhook should have rejected on Autopilot", ev.Safe)
   245  		}
   246  	}
   247  	return nil
   248  }
   249  
   250  func hasPortPolicy(gs *agonesv1.GameServerSpec, portPolicy agonesv1.PortPolicy) bool {
   251  	for _, p := range gs.Ports {
   252  		if p.PortPolicy == portPolicy {
   253  			return true
   254  		}
   255  	}
   256  	return false
   257  }
   258  
// autopilotPortAllocator is the port allocator returned by NewPortAllocator for
// GKE Autopilot. It does not pick concrete host ports itself: Allocate assigns
// placeholder host ports and passes the range below on through the
// host-port-assignment annotation.
type autopilotPortAllocator struct {
	// minPort and maxPort come from the default port range and are emitted as
	// the Min/Max fields of the host-port-assignment annotation.
	minPort int32
	maxPort int32
}
   263  
   264  func (*autopilotPortAllocator) Run(_ context.Context) error       { return nil }
   265  func (*autopilotPortAllocator) DeAllocate(_ *agonesv1.GameServer) {}
   266  
   267  func checkPassthroughPortPolicyForAutopilot(portPolicy agonesv1.PortPolicy) bool {
   268  	// Autopilot can have Dynamic or Passthrough
   269  	// if feature is not enabled and port is Passthrough -> true
   270  	// if feature is not enabled and port is not Passthrough -> true
   271  	// if feature is enabled and port is Passthrough -> false
   272  	// if feature is enabled and port is not Passthrough -> true
   273  	return !(runtime.FeatureEnabled(runtime.FeatureAutopilotPassthroughPort) && portPolicy == agonesv1.Passthrough)
   274  }
   275  
   276  func (apa *autopilotPortAllocator) Allocate(gs *agonesv1.GameServer) *agonesv1.GameServer {
   277  	if len(gs.Spec.Ports) == 0 {
   278  		return gs // Nothing to do.
   279  	}
   280  
   281  	var ports []agonesv1.GameServerPort
   282  	for i, p := range gs.Spec.Ports {
   283  		if p.PortPolicy != agonesv1.Dynamic && checkPassthroughPortPolicyForAutopilot(p.PortPolicy) {
   284  			logger.WithField("gs", gs.Name).WithField("portPolicy", p.PortPolicy).Error(
   285  				"GameServer has invalid PortPolicy for Autopilot - this should have been rejected by webhooks. Refusing to assign ports.")
   286  			return gs
   287  		}
   288  		p.HostPort = int32(i + 1) // Autopilot expects _some_ host port - use a value unique to this GameServer Port.
   289  
   290  		if p.Protocol == agonesv1.ProtocolTCPUDP {
   291  			tcp := p
   292  			tcp.Name = p.Name + "-tcp"
   293  			tcp.Protocol = corev1.ProtocolTCP
   294  			ports = append(ports, tcp)
   295  
   296  			p.Name += "-udp"
   297  			p.Protocol = corev1.ProtocolUDP
   298  		}
   299  		ports = append(ports, p)
   300  	}
   301  
   302  	hpa := hostPortAssignment{Min: apa.minPort, Max: apa.maxPort}
   303  	hpaJSON, err := json.Marshal(hpa)
   304  	if err != nil {
   305  		logger.WithError(err).WithField("hostPort", hpa).WithField("gs", gs.Name).Error("Internal error marshalling hostPortAssignment for GameServer")
   306  		// In error cases, return the original gs - on Autopilot this will result in a policy failure.
   307  		return gs
   308  	}
   309  
   310  	// No errors past here.
   311  	gs.Spec.Ports = ports
   312  	if gs.Spec.Template.ObjectMeta.Annotations == nil {
   313  		gs.Spec.Template.ObjectMeta.Annotations = make(map[string]string)
   314  	}
   315  	gs.Spec.Template.ObjectMeta.Annotations[hostPortAssignmentAnnotation] = string(hpaJSON)
   316  	return gs
   317  }