agones.dev/agones@v1.54.0/pkg/cloudproduct/gke/gke.go (about)

     1  // Copyright 2022 Google LLC All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //	http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package gke implements the GKE cloud product (specifically Autopilot for now)
    16  package gke
    17  
    18  import (
    19  	"context"
    20  	"encoding/json"
    21  	"fmt"
    22  
    23  	"agones.dev/agones/pkg/apis"
    24  	agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
    25  	"agones.dev/agones/pkg/client/informers/externalversions"
    26  	"agones.dev/agones/pkg/cloudproduct/eviction"
    27  	"agones.dev/agones/pkg/portallocator"
    28  	"agones.dev/agones/pkg/util/runtime"
    29  	"cloud.google.com/go/compute/metadata"
    30  	"github.com/pkg/errors"
    31  	corev1 "k8s.io/api/core/v1"
    32  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    33  	"k8s.io/apimachinery/pkg/util/validation/field"
    34  	"k8s.io/client-go/informers"
    35  	"k8s.io/client-go/kubernetes"
    36  )
    37  
const (
	// Annotation keys in the autopilot.gke.io namespace used by this package.
	// hostPortAssignmentAnnotation carries a JSON-encoded hostPortAssignment; it is
	// written onto the GameServer pod template by the port allocator and read back
	// from the Pod when syncing ports.
	hostPortAssignmentAnnotation = "autopilot.gke.io/host-port-assignment"
	// primaryContainerAnnotation names the game server container on the Pod.
	primaryContainerAnnotation = "autopilot.gke.io/primary-container"

	// Messages attached to field errors produced by the validation helpers in this file.
	errRangeInvalid                 = "range must not be used on GKE Autopilot"
	errSchedulingMustBePacked       = "scheduling strategy must be Packed on GKE Autopilot"
	errEvictionSafeOnUpgradeInvalid = "eviction.safe OnUpgrade not supported on GKE Autopilot"
)
    46  
var (
	// autopilotMutatingWebhooks lists the MutatingWebhookConfiguration names that
	// Detect looks up; finding any one of them is treated as running on Autopilot.
	autopilotMutatingWebhooks = []string{
		"workload-defaulter.config.common-webhooks.networking.gke.io", // pre-1.26
		"sasecret-redacter.config.common-webhooks.networking.gke.io",  // 1.26+
	}
	// noWorkloadDefaulter is the "reason" logged by Detect when none of the
	// webhooks above could be fetched.
	noWorkloadDefaulter = fmt.Sprintf("found no MutatingWebhookConfigurations matching %v", autopilotMutatingWebhooks)

	// logger is the package-wide logger, tagged with source "gke".
	logger = runtime.NewLoggerWithSource("gke")
)
    56  
// gkeAutopilot is the GKE Autopilot cloud product implementation returned by Autopilot.
type gkeAutopilot struct {
	// useExtendedDurationPods mirrors the FeatureGKEAutopilotExtendedDurationPods
	// feature gate at construction time. When true, eviction handling is delegated
	// to the generic eviction package and EvictionSafeOnUpgrade passes validation.
	useExtendedDurationPods bool
}
    60  
// hostPortAssignment is the JSON structure of the `host-port-assignment` annotation.
//
// Min and Max are filled in from the port allocator's configured range when the
// annotation is written. PortsAssigned is only read by this package (it is looked
// up by a port's current HostPort to obtain the replacement HostPort); presumably
// it is populated by Autopilot itself - confirm against the GKE documentation.
//
//nolint:govet // API-like, keep consistent
type hostPortAssignment struct {
	Min           int32           `json:"min,omitempty"`
	Max           int32           `json:"max,omitempty"`
	PortsAssigned map[int32]int32 `json:"portsAssigned,omitempty"` // old -> new
}
    69  
    70  // Detect whether we're running on GKE and/or Autopilot and return the appropriate
    71  // cloud product string.
    72  func Detect(ctx context.Context, kc *kubernetes.Clientset) string {
    73  	if !metadata.OnGCE() {
    74  		return ""
    75  	}
    76  	// Look for the workload defaulter - this is the current best method to detect Autopilot
    77  	found := false
    78  	for _, webhook := range autopilotMutatingWebhooks {
    79  		if _, err := kc.AdmissionregistrationV1().MutatingWebhookConfigurations().Get(
    80  			ctx, webhook, metav1.GetOptions{}); err != nil {
    81  			logger.WithError(err).WithField("webhook", webhook).Info("Detecting Autopilot MutatingWebhookConfiguration")
    82  		} else {
    83  			found = true
    84  			break
    85  		}
    86  	}
    87  	if !found {
    88  		logger.WithField("reason", noWorkloadDefaulter).Info(
    89  			"Assuming GKE Standard and defaulting to generic provider (expected if not on GKE Autopilot)")
    90  		return "" // GKE standard, but we don't need an interface for it just yet.
    91  	}
    92  	logger.Info("Running on GKE Autopilot (skip detection with --cloud-product=gke-autopilot)")
    93  	return "gke-autopilot"
    94  }
    95  
    96  // Autopilot returns a GKE Autopilot cloud product
    97  //
    98  //nolint:revive // ignore the unexported return; implements ControllerHooksInterface
    99  func Autopilot() *gkeAutopilot {
   100  	return &gkeAutopilot{useExtendedDurationPods: runtime.FeatureEnabled(runtime.FeatureGKEAutopilotExtendedDurationPods)}
   101  }
   102  
   103  func (*gkeAutopilot) SyncPodPortsToGameServer(gs *agonesv1.GameServer, pod *corev1.Pod) error {
   104  	// If applyGameServerAddressAndPort has already filled in Status, SyncPodPortsToGameServer
   105  	// has already run. Skip syncing from the Pod again - this avoids having to reason
   106  	// about whether we're re-applying the old->new mapping.
   107  	if len(gs.Status.Ports) == len(gs.Spec.Ports) {
   108  		return nil
   109  	}
   110  	annotation, ok := pod.ObjectMeta.Annotations[hostPortAssignmentAnnotation]
   111  	if !ok {
   112  		return nil
   113  	}
   114  	var hpa hostPortAssignment
   115  	if err := json.Unmarshal([]byte(annotation), &hpa); err != nil {
   116  		return errors.Wrapf(err, "could not unmarshal annotation %s (value %q)", hostPortAssignmentAnnotation, annotation)
   117  	}
   118  	for i, p := range gs.Spec.Ports {
   119  		if newPort, ok := hpa.PortsAssigned[p.HostPort]; ok {
   120  			gs.Spec.Ports[i].HostPort = newPort
   121  		}
   122  	}
   123  	return nil
   124  }
   125  
   126  func (*gkeAutopilot) NewPortAllocator(portRanges map[string]portallocator.PortRange,
   127  	_ informers.SharedInformerFactory,
   128  	_ externalversions.SharedInformerFactory,
   129  ) portallocator.Interface {
   130  	defPortRange := portRanges[agonesv1.DefaultPortRange]
   131  	return &autopilotPortAllocator{minPort: defPortRange.MinPort, maxPort: defPortRange.MaxPort}
   132  }
   133  
   134  func (*gkeAutopilot) WaitOnFreePorts() bool { return true }
   135  
   136  func (g *gkeAutopilot) ValidateGameServerSpec(gss *agonesv1.GameServerSpec, fldPath *field.Path) field.ErrorList {
   137  	allErrs := g.ValidateScheduling(gss.Scheduling, fldPath.Child("scheduling"))
   138  
   139  	// Loop through ports and use the helper function for validation
   140  	for i, p := range gss.Ports {
   141  		allErrs = append(allErrs, validatePortPolicy(p, i, fldPath)...)
   142  
   143  	}
   144  
   145  	// See SetEviction comment below for why we block EvictionSafeOnUpgrade, if Extended Duration pods aren't supported.
   146  	if !g.useExtendedDurationPods && gss.Eviction.Safe == agonesv1.EvictionSafeOnUpgrade {
   147  		allErrs = append(allErrs, field.Invalid(fldPath.Child("eviction").Child("safe"), string(gss.Eviction.Safe), errEvictionSafeOnUpgradeInvalid))
   148  	}
   149  	return allErrs
   150  }
   151  
   152  func (*gkeAutopilot) ValidateScheduling(ss apis.SchedulingStrategy, fldPath *field.Path) field.ErrorList {
   153  	if ss != apis.Packed {
   154  		return field.ErrorList{
   155  			field.Invalid(fldPath, string(ss), errSchedulingMustBePacked),
   156  		}
   157  	}
   158  	return nil
   159  }
   160  
// MutateGameServerPod applies the Autopilot-specific adjustments to a GameServer Pod:
// it labels the Pod when any port uses the Passthrough policy, annotates which
// container is the game server container (gss.Container), and defaults the pod-level
// seccomp profile to Unconfined when none is set. It always returns nil.
func (*gkeAutopilot) MutateGameServerPod(gss *agonesv1.GameServerSpec, pod *corev1.Pod) error {
	setPassthroughLabel(gss, pod)
	setPrimaryContainer(pod, gss.Container)
	podSpecSeccompUnconfined(&pod.Spec)
	return nil
}
   167  
   168  // setPassthroughLabel sets the agones.dev/port: "autopilot-passthrough" label to the game server container.
   169  // This will help to back the container port from the allocated port using an objectSelector of this label
   170  // in GameServers that are using Passthrough Port Policy
   171  func setPassthroughLabel(gs *agonesv1.GameServerSpec, pod *corev1.Pod) {
   172  	if hasPortPolicy(gs, agonesv1.Passthrough) {
   173  		pod.ObjectMeta.Labels[agonesv1.GameServerPortPolicyPodLabel] = "autopilot-passthrough"
   174  	}
   175  }
   176  
   177  // setPrimaryContainer sets the autopilot.gke.io/primary-container annotation to the game server container.
   178  // This acts as a hint to Autopilot for which container to add resources to during resource adjustment.
   179  // See https://cloud.google.com/kubernetes-engine/docs/concepts/autopilot-resource-requests#autopilot-resource-management
   180  // for more details.
   181  func setPrimaryContainer(pod *corev1.Pod, containerName string) {
   182  	if _, ok := pod.ObjectMeta.Annotations[primaryContainerAnnotation]; ok {
   183  		return
   184  	}
   185  	pod.ObjectMeta.Annotations[primaryContainerAnnotation] = containerName
   186  }
   187  
   188  // podSpecSeccompUnconfined sets to seccomp profile to `Unconfined` to avoid serious performance
   189  // degradation possible with seccomp. We only set the pod level seccompProfile, and only set
   190  // it if it hasn't been set - users can then override at either the pod or container level
   191  // in the GameServer spec.
   192  func podSpecSeccompUnconfined(podSpec *corev1.PodSpec) {
   193  	if podSpec.SecurityContext != nil && podSpec.SecurityContext.SeccompProfile != nil {
   194  		return
   195  	}
   196  	if podSpec.SecurityContext == nil {
   197  		podSpec.SecurityContext = &corev1.PodSecurityContext{}
   198  	}
   199  	podSpec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{Type: corev1.SeccompProfileTypeUnconfined}
   200  }
   201  
   202  func (g *gkeAutopilot) SetEviction(ev *agonesv1.Eviction, pod *corev1.Pod) error {
   203  	if g.useExtendedDurationPods {
   204  		return eviction.SetEviction(ev, pod)
   205  	}
   206  	return setEvictionNoExtended(ev, pod)
   207  }
   208  
   209  // setEvictionNoExtended sets disruption controls based on GameServer.Status.Eviction. For Autopilot:
   210  //   - Since the safe-to-evict pod annotation is not supported if "false", we delete it (if it's set
   211  //     to anything else, we allow it - Autopilot only rejects "false").
   212  //   - OnUpgrade is not supported and rejected by validation above. Since we can't support
   213  //     safe-to-evict=false but can support a restrictive PDB, we can support Never and Always, but
   214  //     OnUpgrade doesn't make sense on Autopilot today. - an overly restrictive PDB prevents
   215  //     any sort of graceful eviction.
   216  func setEvictionNoExtended(ev *agonesv1.Eviction, pod *corev1.Pod) error {
   217  	if safeAnnotation := pod.ObjectMeta.Annotations[agonesv1.PodSafeToEvictAnnotation]; safeAnnotation == agonesv1.False {
   218  		delete(pod.ObjectMeta.Annotations, agonesv1.PodSafeToEvictAnnotation)
   219  	}
   220  	if ev == nil {
   221  		return errors.New("No eviction value set. Should be the default value")
   222  	}
   223  	if _, exists := pod.ObjectMeta.Labels[agonesv1.SafeToEvictLabel]; !exists {
   224  		switch ev.Safe {
   225  		case agonesv1.EvictionSafeAlways:
   226  			// For EvictionSafeAlways, we use a label value that does not match the
   227  			// agones-gameserver-safe-to-evict-false PDB. But we go ahead and label
   228  			// it, in case someone wants to adopt custom logic for this group of
   229  			// game servers.
   230  			pod.ObjectMeta.Labels[agonesv1.SafeToEvictLabel] = agonesv1.True
   231  		case agonesv1.EvictionSafeNever:
   232  			pod.ObjectMeta.Labels[agonesv1.SafeToEvictLabel] = agonesv1.False
   233  		default:
   234  			return errors.Errorf("eviction.safe == %s, which webhook should have rejected on Autopilot", ev.Safe)
   235  		}
   236  	}
   237  	return nil
   238  }
   239  
   240  func hasPortPolicy(gs *agonesv1.GameServerSpec, portPolicy agonesv1.PortPolicy) bool {
   241  	for _, p := range gs.Ports {
   242  		if p.PortPolicy == portPolicy {
   243  			return true
   244  		}
   245  	}
   246  	return false
   247  }
   248  
// autopilotPortAllocator is the port allocator used on GKE Autopilot. It keeps no
// allocation state: it only records the port range that is advertised through the
// host-port-assignment annotation when a GameServer is allocated ports.
type autopilotPortAllocator struct {
	minPort int32 // written as "min" in the host-port-assignment annotation
	maxPort int32 // written as "max" in the host-port-assignment annotation
}
   253  
   254  func (*autopilotPortAllocator) Run(_ context.Context) error       { return nil }
   255  func (*autopilotPortAllocator) DeAllocate(_ *agonesv1.GameServer) {}
   256  
   257  func (apa *autopilotPortAllocator) Allocate(gs *agonesv1.GameServer) *agonesv1.GameServer {
   258  	if len(gs.Spec.Ports) == 0 {
   259  		return gs // Nothing to do.
   260  	}
   261  
   262  	var ports []agonesv1.GameServerPort
   263  	for i, p := range gs.Spec.Ports {
   264  		if !(p.PortPolicy == agonesv1.Dynamic || p.PortPolicy == agonesv1.Passthrough) {
   265  			logger.WithField("gs", gs.Name).WithField("portPolicy", p.PortPolicy).Error(
   266  				"GameServer has invalid PortPolicy for Autopilot - this should have been rejected by webhooks. Refusing to assign ports.")
   267  			return gs
   268  		}
   269  		p.HostPort = int32(i + 1) // Autopilot expects _some_ host port - use a value unique to this GameServer Port.
   270  
   271  		if p.Protocol == agonesv1.ProtocolTCPUDP {
   272  			tcp := p
   273  			tcp.Name = p.Name + "-tcp"
   274  			tcp.Protocol = corev1.ProtocolTCP
   275  			ports = append(ports, tcp)
   276  
   277  			p.Name += "-udp"
   278  			p.Protocol = corev1.ProtocolUDP
   279  		}
   280  		ports = append(ports, p)
   281  	}
   282  
   283  	hpa := hostPortAssignment{Min: apa.minPort, Max: apa.maxPort}
   284  	hpaJSON, err := json.Marshal(hpa)
   285  	if err != nil {
   286  		logger.WithError(err).WithField("hostPort", hpa).WithField("gs", gs.Name).Error("Internal error marshalling hostPortAssignment for GameServer")
   287  		// In error cases, return the original gs - on Autopilot this will result in a policy failure.
   288  		return gs
   289  	}
   290  
   291  	// No errors past here.
   292  	gs.Spec.Ports = ports
   293  	if gs.Spec.Template.ObjectMeta.Annotations == nil {
   294  		gs.Spec.Template.ObjectMeta.Annotations = make(map[string]string)
   295  	}
   296  	gs.Spec.Template.ObjectMeta.Annotations[hostPortAssignmentAnnotation] = string(hpaJSON)
   297  	return gs
   298  }
   299  
   300  // validatePortPolicy is a helper function to validate a single GameServerPort's PortPolicy
   301  // for GKE Autopilot constraints.
   302  func validatePortPolicy(p agonesv1.GameServerPort, i int, fldPath *field.Path) field.ErrorList {
   303  	var allErrs field.ErrorList
   304  	portPath := fldPath.Child("ports").Index(i)
   305  
   306  	switch p.PortPolicy {
   307  	case agonesv1.Dynamic, agonesv1.Passthrough:
   308  		// These policies are always valid on GKE Autopilot.
   309  	case agonesv1.None:
   310  		// "None" is valid only if the feature gate FeaturePortPolicyNone is enabled.
   311  		if !runtime.FeatureEnabled(runtime.FeaturePortPolicyNone) {
   312  			allErrs = append(allErrs, field.Invalid(portPath.Child("portPolicy"), p.PortPolicy, "PortPolicy 'None' is not enabled"))
   313  		}
   314  	default:
   315  		// Any other port policy, such as "Static", is considered invalid on GKE Autopilot.
   316  		allErrs = append(allErrs, field.Invalid(portPath.Child("portPolicy"), p.PortPolicy, "portPolicy must be Dynamic, Passthrough, or None on GKE Autopilot"))
   317  	}
   318  
   319  	if p.Range != agonesv1.DefaultPortRange && (p.PortPolicy != agonesv1.None || !runtime.FeatureEnabled(runtime.FeaturePortPolicyNone)) {
   320  		allErrs = append(allErrs, field.Invalid(fldPath.Child("ports").Index(i).Child("range"), p.Range, errRangeInvalid))
   321  	}
   322  
   323  	return allErrs
   324  }