agones.dev/agones@v1.53.0/pkg/cloudproduct/gke/gke.go (about) 1 // Copyright 2022 Google LLC All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package gke implements the GKE cloud product (specifically Autopilot for now) 16 package gke 17 18 import ( 19 "context" 20 "encoding/json" 21 "fmt" 22 23 "agones.dev/agones/pkg/apis" 24 agonesv1 "agones.dev/agones/pkg/apis/agones/v1" 25 "agones.dev/agones/pkg/client/informers/externalversions" 26 "agones.dev/agones/pkg/cloudproduct/eviction" 27 "agones.dev/agones/pkg/portallocator" 28 "agones.dev/agones/pkg/util/runtime" 29 "cloud.google.com/go/compute/metadata" 30 "github.com/pkg/errors" 31 corev1 "k8s.io/api/core/v1" 32 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 33 "k8s.io/apimachinery/pkg/util/validation/field" 34 "k8s.io/client-go/informers" 35 "k8s.io/client-go/kubernetes" 36 ) 37 38 const ( 39 hostPortAssignmentAnnotation = "autopilot.gke.io/host-port-assignment" 40 primaryContainerAnnotation = "autopilot.gke.io/primary-container" 41 42 errPortPolicyMustBeDynamicOrNone = "portPolicy must be Dynamic or None on GKE Autopilot" 43 errRangeInvalid = "range must not be used on GKE Autopilot" 44 errSchedulingMustBePacked = "scheduling strategy must be Packed on GKE Autopilot" 45 errEvictionSafeOnUpgradeInvalid = "eviction.safe OnUpgrade not supported on GKE Autopilot" 46 ) 47 48 var ( 49 autopilotMutatingWebhooks = []string{ 50 "workload-defaulter.config.common-webhooks.networking.gke.io", // pre-1.26 51 "sasecret-redacter.config.common-webhooks.networking.gke.io", // 1.26+ 52 } 53 noWorkloadDefaulter = fmt.Sprintf("found no MutatingWebhookConfigurations matching %v", autopilotMutatingWebhooks) 54 55 logger = runtime.NewLoggerWithSource("gke") 56 ) 57 58 type gkeAutopilot struct { 59 useExtendedDurationPods bool 60 } 61 62 // hostPortAssignment is the JSON structure of the `host-port-assignment` annotation 63 // 64 //nolint:govet // API-like, keep consistent 65 type hostPortAssignment struct { 66 Min int32 `json:"min,omitempty"` 67 Max int32 `json:"max,omitempty"` 68 PortsAssigned map[int32]int32 `json:"portsAssigned,omitempty"` // old -> new 69 } 70 71 // Detect whether we're running on GKE and/or Autopilot and return the appropriate 72 // cloud product string. 73 func Detect(ctx context.Context, kc *kubernetes.Clientset) string { 74 if !metadata.OnGCE() { 75 return "" 76 } 77 // Look for the workload defaulter - this is the current best method to detect Autopilot 78 found := false 79 for _, webhook := range autopilotMutatingWebhooks { 80 if _, err := kc.AdmissionregistrationV1().MutatingWebhookConfigurations().Get( 81 ctx, webhook, metav1.GetOptions{}); err != nil { 82 logger.WithError(err).WithField("webhook", webhook).Info("Detecting Autopilot MutatingWebhookConfiguration") 83 } else { 84 found = true 85 break 86 } 87 } 88 if !found { 89 logger.WithField("reason", noWorkloadDefaulter).Info( 90 "Assuming GKE Standard and defaulting to generic provider (expected if not on GKE Autopilot)") 91 return "" // GKE standard, but we don't need an interface for it just yet. 92 } 93 logger.Info("Running on GKE Autopilot (skip detection with --cloud-product=gke-autopilot)") 94 return "gke-autopilot" 95 } 96 97 // Autopilot returns a GKE Autopilot cloud product 98 // 99 //nolint:revive // ignore the unexported return; implements ControllerHooksInterface 100 func Autopilot() *gkeAutopilot { 101 return &gkeAutopilot{useExtendedDurationPods: runtime.FeatureEnabled(runtime.FeatureGKEAutopilotExtendedDurationPods)} 102 } 103 104 func (*gkeAutopilot) SyncPodPortsToGameServer(gs *agonesv1.GameServer, pod *corev1.Pod) error { 105 // If applyGameServerAddressAndPort has already filled in Status, SyncPodPortsToGameServer 106 // has already run. Skip syncing from the Pod again - this avoids having to reason 107 // about whether we're re-applying the old->new mapping. 108 if len(gs.Status.Ports) == len(gs.Spec.Ports) { 109 return nil 110 } 111 annotation, ok := pod.ObjectMeta.Annotations[hostPortAssignmentAnnotation] 112 if !ok { 113 return nil 114 } 115 var hpa hostPortAssignment 116 if err := json.Unmarshal([]byte(annotation), &hpa); err != nil { 117 return errors.Wrapf(err, "could not unmarshal annotation %s (value %q)", hostPortAssignmentAnnotation, annotation) 118 } 119 for i, p := range gs.Spec.Ports { 120 if newPort, ok := hpa.PortsAssigned[p.HostPort]; ok { 121 gs.Spec.Ports[i].HostPort = newPort 122 } 123 } 124 return nil 125 } 126 127 func (*gkeAutopilot) NewPortAllocator(portRanges map[string]portallocator.PortRange, 128 _ informers.SharedInformerFactory, 129 _ externalversions.SharedInformerFactory, 130 ) portallocator.Interface { 131 defPortRange := portRanges[agonesv1.DefaultPortRange] 132 return &autopilotPortAllocator{minPort: defPortRange.MinPort, maxPort: defPortRange.MaxPort} 133 } 134 135 func (*gkeAutopilot) WaitOnFreePorts() bool { return true } 136 137 func checkPassthroughPortPolicy(portPolicy agonesv1.PortPolicy) bool { 138 // if feature is not enabled and port is Passthrough return true because that should be an invalid port 139 // if feature is not enabled and port is not Passthrough you can return false because there's no error but check for None port 140 // if feature is enabled and port is passthrough return false because there is no error 141 // if feature is enabled and port is not passthrough return false because there is no error but check for None port 142 return (!runtime.FeatureEnabled(runtime.FeatureAutopilotPassthroughPort) && portPolicy == agonesv1.Passthrough) || portPolicy == agonesv1.Static 143 } 144 145 func (g *gkeAutopilot) ValidateGameServerSpec(gss *agonesv1.GameServerSpec, fldPath *field.Path) field.ErrorList { 146 allErrs := g.ValidateScheduling(gss.Scheduling, fldPath.Child("scheduling")) 147 for i, p := range gss.Ports { 148 if p.PortPolicy != agonesv1.Dynamic && (p.PortPolicy != agonesv1.None || !runtime.FeatureEnabled(runtime.FeaturePortPolicyNone)) && checkPassthroughPortPolicy(p.PortPolicy) { 149 allErrs = append(allErrs, field.Invalid(fldPath.Child("ports").Index(i).Child("portPolicy"), string(p.PortPolicy), errPortPolicyMustBeDynamicOrNone)) 150 } 151 if p.Range != agonesv1.DefaultPortRange && (p.PortPolicy != agonesv1.None || !runtime.FeatureEnabled(runtime.FeaturePortPolicyNone)) { 152 allErrs = append(allErrs, field.Invalid(fldPath.Child("ports").Index(i).Child("range"), p.Range, errRangeInvalid)) 153 } 154 } 155 // See SetEviction comment below for why we block EvictionSafeOnUpgrade, if Extended Duration pods aren't supported. 156 if !g.useExtendedDurationPods && gss.Eviction.Safe == agonesv1.EvictionSafeOnUpgrade { 157 allErrs = append(allErrs, field.Invalid(fldPath.Child("eviction").Child("safe"), string(gss.Eviction.Safe), errEvictionSafeOnUpgradeInvalid)) 158 } 159 return allErrs 160 } 161 162 func (*gkeAutopilot) ValidateScheduling(ss apis.SchedulingStrategy, fldPath *field.Path) field.ErrorList { 163 if ss != apis.Packed { 164 return field.ErrorList{ 165 field.Invalid(fldPath, string(ss), errSchedulingMustBePacked), 166 } 167 } 168 return nil 169 } 170 171 func (*gkeAutopilot) MutateGameServerPod(gss *agonesv1.GameServerSpec, pod *corev1.Pod) error { 172 setPassthroughLabel(gss, pod) 173 setPrimaryContainer(pod, gss.Container) 174 podSpecSeccompUnconfined(&pod.Spec) 175 return nil 176 } 177 178 // setPassthroughLabel sets the agones.dev/port: "autopilot-passthrough" label to the game server container. 179 // This will help to back the container port from the allocated port using an objectSelector of this label 180 // in GameServers that are using Passthrough Port Policy 181 func setPassthroughLabel(gs *agonesv1.GameServerSpec, pod *corev1.Pod) { 182 if runtime.FeatureEnabled(runtime.FeatureAutopilotPassthroughPort) && hasPortPolicy(gs, agonesv1.Passthrough) { 183 pod.ObjectMeta.Labels[agonesv1.GameServerPortPolicyPodLabel] = "autopilot-passthrough" 184 } 185 } 186 187 // setPrimaryContainer sets the autopilot.gke.io/primary-container annotation to the game server container. 188 // This acts as a hint to Autopilot for which container to add resources to during resource adjustment. 189 // See https://cloud.google.com/kubernetes-engine/docs/concepts/autopilot-resource-requests#autopilot-resource-management 190 // for more details. 191 func setPrimaryContainer(pod *corev1.Pod, containerName string) { 192 if _, ok := pod.ObjectMeta.Annotations[primaryContainerAnnotation]; ok { 193 return 194 } 195 pod.ObjectMeta.Annotations[primaryContainerAnnotation] = containerName 196 } 197 198 // podSpecSeccompUnconfined sets to seccomp profile to `Unconfined` to avoid serious performance 199 // degradation possible with seccomp. We only set the pod level seccompProfile, and only set 200 // it if it hasn't been set - users can then override at either the pod or container level 201 // in the GameServer spec. 202 func podSpecSeccompUnconfined(podSpec *corev1.PodSpec) { 203 if podSpec.SecurityContext != nil && podSpec.SecurityContext.SeccompProfile != nil { 204 return 205 } 206 if podSpec.SecurityContext == nil { 207 podSpec.SecurityContext = &corev1.PodSecurityContext{} 208 } 209 podSpec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{Type: corev1.SeccompProfileTypeUnconfined} 210 } 211 212 func (g *gkeAutopilot) SetEviction(ev *agonesv1.Eviction, pod *corev1.Pod) error { 213 if g.useExtendedDurationPods { 214 return eviction.SetEviction(ev, pod) 215 } 216 return setEvictionNoExtended(ev, pod) 217 } 218 219 // setEvictionNoExtended sets disruption controls based on GameServer.Status.Eviction. For Autopilot: 220 // - Since the safe-to-evict pod annotation is not supported if "false", we delete it (if it's set 221 // to anything else, we allow it - Autopilot only rejects "false"). 222 // - OnUpgrade is not supported and rejected by validation above. Since we can't support 223 // safe-to-evict=false but can support a restrictive PDB, we can support Never and Always, but 224 // OnUpgrade doesn't make sense on Autopilot today. - an overly restrictive PDB prevents 225 // any sort of graceful eviction. 226 func setEvictionNoExtended(ev *agonesv1.Eviction, pod *corev1.Pod) error { 227 if safeAnnotation := pod.ObjectMeta.Annotations[agonesv1.PodSafeToEvictAnnotation]; safeAnnotation == agonesv1.False { 228 delete(pod.ObjectMeta.Annotations, agonesv1.PodSafeToEvictAnnotation) 229 } 230 if ev == nil { 231 return errors.New("No eviction value set. Should be the default value") 232 } 233 if _, exists := pod.ObjectMeta.Labels[agonesv1.SafeToEvictLabel]; !exists { 234 switch ev.Safe { 235 case agonesv1.EvictionSafeAlways: 236 // For EvictionSafeAlways, we use a label value that does not match the 237 // agones-gameserver-safe-to-evict-false PDB. But we go ahead and label 238 // it, in case someone wants to adopt custom logic for this group of 239 // game servers. 240 pod.ObjectMeta.Labels[agonesv1.SafeToEvictLabel] = agonesv1.True 241 case agonesv1.EvictionSafeNever: 242 pod.ObjectMeta.Labels[agonesv1.SafeToEvictLabel] = agonesv1.False 243 default: 244 return errors.Errorf("eviction.safe == %s, which webhook should have rejected on Autopilot", ev.Safe) 245 } 246 } 247 return nil 248 } 249 250 func hasPortPolicy(gs *agonesv1.GameServerSpec, portPolicy agonesv1.PortPolicy) bool { 251 for _, p := range gs.Ports { 252 if p.PortPolicy == portPolicy { 253 return true 254 } 255 } 256 return false 257 } 258 259 type autopilotPortAllocator struct { 260 minPort int32 261 maxPort int32 262 } 263 264 func (*autopilotPortAllocator) Run(_ context.Context) error { return nil } 265 func (*autopilotPortAllocator) DeAllocate(_ *agonesv1.GameServer) {} 266 267 func checkPassthroughPortPolicyForAutopilot(portPolicy agonesv1.PortPolicy) bool { 268 // Autopilot can have Dynamic or Passthrough 269 // if feature is not enabled and port is Passthrough -> true 270 // if feature is not enabled and port is not Passthrough -> true 271 // if feature is enabled and port is Passthrough -> false 272 // if feature is enabled and port is not Passthrough -> true 273 return !(runtime.FeatureEnabled(runtime.FeatureAutopilotPassthroughPort) && portPolicy == agonesv1.Passthrough) 274 } 275 276 func (apa *autopilotPortAllocator) Allocate(gs *agonesv1.GameServer) *agonesv1.GameServer { 277 if len(gs.Spec.Ports) == 0 { 278 return gs // Nothing to do. 279 } 280 281 var ports []agonesv1.GameServerPort 282 for i, p := range gs.Spec.Ports { 283 if p.PortPolicy != agonesv1.Dynamic && checkPassthroughPortPolicyForAutopilot(p.PortPolicy) { 284 logger.WithField("gs", gs.Name).WithField("portPolicy", p.PortPolicy).Error( 285 "GameServer has invalid PortPolicy for Autopilot - this should have been rejected by webhooks. Refusing to assign ports.") 286 return gs 287 } 288 p.HostPort = int32(i + 1) // Autopilot expects _some_ host port - use a value unique to this GameServer Port. 289 290 if p.Protocol == agonesv1.ProtocolTCPUDP { 291 tcp := p 292 tcp.Name = p.Name + "-tcp" 293 tcp.Protocol = corev1.ProtocolTCP 294 ports = append(ports, tcp) 295 296 p.Name += "-udp" 297 p.Protocol = corev1.ProtocolUDP 298 } 299 ports = append(ports, p) 300 } 301 302 hpa := hostPortAssignment{Min: apa.minPort, Max: apa.maxPort} 303 hpaJSON, err := json.Marshal(hpa) 304 if err != nil { 305 logger.WithError(err).WithField("hostPort", hpa).WithField("gs", gs.Name).Error("Internal error marshalling hostPortAssignment for GameServer") 306 // In error cases, return the original gs - on Autopilot this will result in a policy failure. 307 return gs 308 } 309 310 // No errors past here. 311 gs.Spec.Ports = ports 312 if gs.Spec.Template.ObjectMeta.Annotations == nil { 313 gs.Spec.Template.ObjectMeta.Annotations = make(map[string]string) 314 } 315 gs.Spec.Template.ObjectMeta.Annotations[hostPortAssignmentAnnotation] = string(hpaJSON) 316 return gs 317 }