agones.dev/agones@v1.54.0/pkg/cloudproduct/gke/gke.go (about) 1 // Copyright 2022 Google LLC All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package gke implements the GKE cloud product (specifically Autopilot for now) 16 package gke 17 18 import ( 19 "context" 20 "encoding/json" 21 "fmt" 22 23 "agones.dev/agones/pkg/apis" 24 agonesv1 "agones.dev/agones/pkg/apis/agones/v1" 25 "agones.dev/agones/pkg/client/informers/externalversions" 26 "agones.dev/agones/pkg/cloudproduct/eviction" 27 "agones.dev/agones/pkg/portallocator" 28 "agones.dev/agones/pkg/util/runtime" 29 "cloud.google.com/go/compute/metadata" 30 "github.com/pkg/errors" 31 corev1 "k8s.io/api/core/v1" 32 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 33 "k8s.io/apimachinery/pkg/util/validation/field" 34 "k8s.io/client-go/informers" 35 "k8s.io/client-go/kubernetes" 36 ) 37 38 const ( 39 hostPortAssignmentAnnotation = "autopilot.gke.io/host-port-assignment" 40 primaryContainerAnnotation = "autopilot.gke.io/primary-container" 41 42 errRangeInvalid = "range must not be used on GKE Autopilot" 43 errSchedulingMustBePacked = "scheduling strategy must be Packed on GKE Autopilot" 44 errEvictionSafeOnUpgradeInvalid = "eviction.safe OnUpgrade not supported on GKE Autopilot" 45 ) 46 47 var ( 48 autopilotMutatingWebhooks = []string{ 49 "workload-defaulter.config.common-webhooks.networking.gke.io", // pre-1.26 50 "sasecret-redacter.config.common-webhooks.networking.gke.io", // 1.26+ 51 } 52 noWorkloadDefaulter = fmt.Sprintf("found no MutatingWebhookConfigurations matching %v", autopilotMutatingWebhooks) 53 54 logger = runtime.NewLoggerWithSource("gke") 55 ) 56 57 type gkeAutopilot struct { 58 useExtendedDurationPods bool 59 } 60 61 // hostPortAssignment is the JSON structure of the `host-port-assignment` annotation 62 // 63 //nolint:govet // API-like, keep consistent 64 type hostPortAssignment struct { 65 Min int32 `json:"min,omitempty"` 66 Max int32 `json:"max,omitempty"` 67 PortsAssigned map[int32]int32 `json:"portsAssigned,omitempty"` // old -> new 68 } 69 70 // Detect whether we're running on GKE and/or Autopilot and return the appropriate 71 // cloud product string. 72 func Detect(ctx context.Context, kc *kubernetes.Clientset) string { 73 if !metadata.OnGCE() { 74 return "" 75 } 76 // Look for the workload defaulter - this is the current best method to detect Autopilot 77 found := false 78 for _, webhook := range autopilotMutatingWebhooks { 79 if _, err := kc.AdmissionregistrationV1().MutatingWebhookConfigurations().Get( 80 ctx, webhook, metav1.GetOptions{}); err != nil { 81 logger.WithError(err).WithField("webhook", webhook).Info("Detecting Autopilot MutatingWebhookConfiguration") 82 } else { 83 found = true 84 break 85 } 86 } 87 if !found { 88 logger.WithField("reason", noWorkloadDefaulter).Info( 89 "Assuming GKE Standard and defaulting to generic provider (expected if not on GKE Autopilot)") 90 return "" // GKE standard, but we don't need an interface for it just yet. 91 } 92 logger.Info("Running on GKE Autopilot (skip detection with --cloud-product=gke-autopilot)") 93 return "gke-autopilot" 94 } 95 96 // Autopilot returns a GKE Autopilot cloud product 97 // 98 //nolint:revive // ignore the unexported return; implements ControllerHooksInterface 99 func Autopilot() *gkeAutopilot { 100 return &gkeAutopilot{useExtendedDurationPods: runtime.FeatureEnabled(runtime.FeatureGKEAutopilotExtendedDurationPods)} 101 } 102 103 func (*gkeAutopilot) SyncPodPortsToGameServer(gs *agonesv1.GameServer, pod *corev1.Pod) error { 104 // If applyGameServerAddressAndPort has already filled in Status, SyncPodPortsToGameServer 105 // has already run. Skip syncing from the Pod again - this avoids having to reason 106 // about whether we're re-applying the old->new mapping. 107 if len(gs.Status.Ports) == len(gs.Spec.Ports) { 108 return nil 109 } 110 annotation, ok := pod.ObjectMeta.Annotations[hostPortAssignmentAnnotation] 111 if !ok { 112 return nil 113 } 114 var hpa hostPortAssignment 115 if err := json.Unmarshal([]byte(annotation), &hpa); err != nil { 116 return errors.Wrapf(err, "could not unmarshal annotation %s (value %q)", hostPortAssignmentAnnotation, annotation) 117 } 118 for i, p := range gs.Spec.Ports { 119 if newPort, ok := hpa.PortsAssigned[p.HostPort]; ok { 120 gs.Spec.Ports[i].HostPort = newPort 121 } 122 } 123 return nil 124 } 125 126 func (*gkeAutopilot) NewPortAllocator(portRanges map[string]portallocator.PortRange, 127 _ informers.SharedInformerFactory, 128 _ externalversions.SharedInformerFactory, 129 ) portallocator.Interface { 130 defPortRange := portRanges[agonesv1.DefaultPortRange] 131 return &autopilotPortAllocator{minPort: defPortRange.MinPort, maxPort: defPortRange.MaxPort} 132 } 133 134 func (*gkeAutopilot) WaitOnFreePorts() bool { return true } 135 136 func (g *gkeAutopilot) ValidateGameServerSpec(gss *agonesv1.GameServerSpec, fldPath *field.Path) field.ErrorList { 137 allErrs := g.ValidateScheduling(gss.Scheduling, fldPath.Child("scheduling")) 138 139 // Loop through ports and use the helper function for validation 140 for i, p := range gss.Ports { 141 allErrs = append(allErrs, validatePortPolicy(p, i, fldPath)...) 142 143 } 144 145 // See SetEviction comment below for why we block EvictionSafeOnUpgrade, if Extended Duration pods aren't supported. 146 if !g.useExtendedDurationPods && gss.Eviction.Safe == agonesv1.EvictionSafeOnUpgrade { 147 allErrs = append(allErrs, field.Invalid(fldPath.Child("eviction").Child("safe"), string(gss.Eviction.Safe), errEvictionSafeOnUpgradeInvalid)) 148 } 149 return allErrs 150 } 151 152 func (*gkeAutopilot) ValidateScheduling(ss apis.SchedulingStrategy, fldPath *field.Path) field.ErrorList { 153 if ss != apis.Packed { 154 return field.ErrorList{ 155 field.Invalid(fldPath, string(ss), errSchedulingMustBePacked), 156 } 157 } 158 return nil 159 } 160 161 func (*gkeAutopilot) MutateGameServerPod(gss *agonesv1.GameServerSpec, pod *corev1.Pod) error { 162 setPassthroughLabel(gss, pod) 163 setPrimaryContainer(pod, gss.Container) 164 podSpecSeccompUnconfined(&pod.Spec) 165 return nil 166 } 167 168 // setPassthroughLabel sets the agones.dev/port: "autopilot-passthrough" label to the game server container. 169 // This will help to back the container port from the allocated port using an objectSelector of this label 170 // in GameServers that are using Passthrough Port Policy 171 func setPassthroughLabel(gs *agonesv1.GameServerSpec, pod *corev1.Pod) { 172 if hasPortPolicy(gs, agonesv1.Passthrough) { 173 pod.ObjectMeta.Labels[agonesv1.GameServerPortPolicyPodLabel] = "autopilot-passthrough" 174 } 175 } 176 177 // setPrimaryContainer sets the autopilot.gke.io/primary-container annotation to the game server container. 178 // This acts as a hint to Autopilot for which container to add resources to during resource adjustment. 179 // See https://cloud.google.com/kubernetes-engine/docs/concepts/autopilot-resource-requests#autopilot-resource-management 180 // for more details. 181 func setPrimaryContainer(pod *corev1.Pod, containerName string) { 182 if _, ok := pod.ObjectMeta.Annotations[primaryContainerAnnotation]; ok { 183 return 184 } 185 pod.ObjectMeta.Annotations[primaryContainerAnnotation] = containerName 186 } 187 188 // podSpecSeccompUnconfined sets to seccomp profile to `Unconfined` to avoid serious performance 189 // degradation possible with seccomp. We only set the pod level seccompProfile, and only set 190 // it if it hasn't been set - users can then override at either the pod or container level 191 // in the GameServer spec. 192 func podSpecSeccompUnconfined(podSpec *corev1.PodSpec) { 193 if podSpec.SecurityContext != nil && podSpec.SecurityContext.SeccompProfile != nil { 194 return 195 } 196 if podSpec.SecurityContext == nil { 197 podSpec.SecurityContext = &corev1.PodSecurityContext{} 198 } 199 podSpec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{Type: corev1.SeccompProfileTypeUnconfined} 200 } 201 202 func (g *gkeAutopilot) SetEviction(ev *agonesv1.Eviction, pod *corev1.Pod) error { 203 if g.useExtendedDurationPods { 204 return eviction.SetEviction(ev, pod) 205 } 206 return setEvictionNoExtended(ev, pod) 207 } 208 209 // setEvictionNoExtended sets disruption controls based on GameServer.Status.Eviction. For Autopilot: 210 // - Since the safe-to-evict pod annotation is not supported if "false", we delete it (if it's set 211 // to anything else, we allow it - Autopilot only rejects "false"). 212 // - OnUpgrade is not supported and rejected by validation above. Since we can't support 213 // safe-to-evict=false but can support a restrictive PDB, we can support Never and Always, but 214 // OnUpgrade doesn't make sense on Autopilot today. - an overly restrictive PDB prevents 215 // any sort of graceful eviction. 216 func setEvictionNoExtended(ev *agonesv1.Eviction, pod *corev1.Pod) error { 217 if safeAnnotation := pod.ObjectMeta.Annotations[agonesv1.PodSafeToEvictAnnotation]; safeAnnotation == agonesv1.False { 218 delete(pod.ObjectMeta.Annotations, agonesv1.PodSafeToEvictAnnotation) 219 } 220 if ev == nil { 221 return errors.New("No eviction value set. Should be the default value") 222 } 223 if _, exists := pod.ObjectMeta.Labels[agonesv1.SafeToEvictLabel]; !exists { 224 switch ev.Safe { 225 case agonesv1.EvictionSafeAlways: 226 // For EvictionSafeAlways, we use a label value that does not match the 227 // agones-gameserver-safe-to-evict-false PDB. But we go ahead and label 228 // it, in case someone wants to adopt custom logic for this group of 229 // game servers. 230 pod.ObjectMeta.Labels[agonesv1.SafeToEvictLabel] = agonesv1.True 231 case agonesv1.EvictionSafeNever: 232 pod.ObjectMeta.Labels[agonesv1.SafeToEvictLabel] = agonesv1.False 233 default: 234 return errors.Errorf("eviction.safe == %s, which webhook should have rejected on Autopilot", ev.Safe) 235 } 236 } 237 return nil 238 } 239 240 func hasPortPolicy(gs *agonesv1.GameServerSpec, portPolicy agonesv1.PortPolicy) bool { 241 for _, p := range gs.Ports { 242 if p.PortPolicy == portPolicy { 243 return true 244 } 245 } 246 return false 247 } 248 249 type autopilotPortAllocator struct { 250 minPort int32 251 maxPort int32 252 } 253 254 func (*autopilotPortAllocator) Run(_ context.Context) error { return nil } 255 func (*autopilotPortAllocator) DeAllocate(_ *agonesv1.GameServer) {} 256 257 func (apa *autopilotPortAllocator) Allocate(gs *agonesv1.GameServer) *agonesv1.GameServer { 258 if len(gs.Spec.Ports) == 0 { 259 return gs // Nothing to do. 260 } 261 262 var ports []agonesv1.GameServerPort 263 for i, p := range gs.Spec.Ports { 264 if !(p.PortPolicy == agonesv1.Dynamic || p.PortPolicy == agonesv1.Passthrough) { 265 logger.WithField("gs", gs.Name).WithField("portPolicy", p.PortPolicy).Error( 266 "GameServer has invalid PortPolicy for Autopilot - this should have been rejected by webhooks. Refusing to assign ports.") 267 return gs 268 } 269 p.HostPort = int32(i + 1) // Autopilot expects _some_ host port - use a value unique to this GameServer Port. 270 271 if p.Protocol == agonesv1.ProtocolTCPUDP { 272 tcp := p 273 tcp.Name = p.Name + "-tcp" 274 tcp.Protocol = corev1.ProtocolTCP 275 ports = append(ports, tcp) 276 277 p.Name += "-udp" 278 p.Protocol = corev1.ProtocolUDP 279 } 280 ports = append(ports, p) 281 } 282 283 hpa := hostPortAssignment{Min: apa.minPort, Max: apa.maxPort} 284 hpaJSON, err := json.Marshal(hpa) 285 if err != nil { 286 logger.WithError(err).WithField("hostPort", hpa).WithField("gs", gs.Name).Error("Internal error marshalling hostPortAssignment for GameServer") 287 // In error cases, return the original gs - on Autopilot this will result in a policy failure. 288 return gs 289 } 290 291 // No errors past here. 292 gs.Spec.Ports = ports 293 if gs.Spec.Template.ObjectMeta.Annotations == nil { 294 gs.Spec.Template.ObjectMeta.Annotations = make(map[string]string) 295 } 296 gs.Spec.Template.ObjectMeta.Annotations[hostPortAssignmentAnnotation] = string(hpaJSON) 297 return gs 298 } 299 300 // validatePortPolicy is a helper function to validate a single GameServerPort's PortPolicy 301 // for GKE Autopilot constraints. 302 func validatePortPolicy(p agonesv1.GameServerPort, i int, fldPath *field.Path) field.ErrorList { 303 var allErrs field.ErrorList 304 portPath := fldPath.Child("ports").Index(i) 305 306 switch p.PortPolicy { 307 case agonesv1.Dynamic, agonesv1.Passthrough: 308 // These policies are always valid on GKE Autopilot. 309 case agonesv1.None: 310 // "None" is valid only if the feature gate FeaturePortPolicyNone is enabled. 311 if !runtime.FeatureEnabled(runtime.FeaturePortPolicyNone) { 312 allErrs = append(allErrs, field.Invalid(portPath.Child("portPolicy"), p.PortPolicy, "PortPolicy 'None' is not enabled")) 313 } 314 default: 315 // Any other port policy, such as "Static", is considered invalid on GKE Autopilot. 316 allErrs = append(allErrs, field.Invalid(portPath.Child("portPolicy"), p.PortPolicy, "portPolicy must be Dynamic, Passthrough, or None on GKE Autopilot")) 317 } 318 319 if p.Range != agonesv1.DefaultPortRange && (p.PortPolicy != agonesv1.None || !runtime.FeatureEnabled(runtime.FeaturePortPolicyNone)) { 320 allErrs = append(allErrs, field.Invalid(fldPath.Child("ports").Index(i).Child("range"), p.Range, errRangeInvalid)) 321 } 322 323 return allErrs 324 }