# sigs.k8s.io/cluster-api-provider-aws@v1.5.5/templates/cluster-template-eks-managedmachinepool-gpu.yaml
#
# clusterctl flavor template: EKS cluster with a GPU-backed managed machine
# pool. The `gpu: "nvidia"` cluster label is matched by the ClusterResourceSet
# below, which installs the NVIDIA device plugin DaemonSet onto the workload
# cluster. `${...}` placeholders are substituted by clusterctl at generate time.
---
apiVersion: cluster.x-k8s.io/v1beta1
kind: Cluster
metadata:
  name: "${CLUSTER_NAME}"
  labels:
    # Matched by the crs-nvidia ClusterResourceSet's clusterSelector.
    gpu: "nvidia"
spec:
  clusterNetwork:
    pods:
      cidrBlocks: ["192.168.0.0/16"]
  # EKS: the AWSManagedControlPlane serves as both infrastructure and
  # control-plane reference.
  infrastructureRef:
    kind: AWSManagedControlPlane
    apiVersion: controlplane.cluster.x-k8s.io/v1beta1
    name: "${CLUSTER_NAME}-control-plane"
  controlPlaneRef:
    kind: AWSManagedControlPlane
    apiVersion: controlplane.cluster.x-k8s.io/v1beta1
    name: "${CLUSTER_NAME}-control-plane"
---
kind: AWSManagedControlPlane
apiVersion: controlplane.cluster.x-k8s.io/v1beta1
metadata:
  name: "${CLUSTER_NAME}-control-plane"
spec:
  region: "${AWS_REGION}"
  sshKeyName: "${AWS_SSH_KEY_NAME}"
  version: "${KUBERNETES_VERSION}"
  addons:
    # vpc-cni is required for pod networking on EKS; "overwrite" lets CAPA
    # replace a self-managed install of the addon if one already exists.
    - name: "vpc-cni"
      version: "${VPC_ADDON_VERSION:=v1.7.5-eksbuild.1}"
      conflictResolution: "overwrite"
---
apiVersion: cluster.x-k8s.io/v1beta1
kind: MachinePool
metadata:
  name: "${CLUSTER_NAME}-pool-0"
spec:
  clusterName: "${CLUSTER_NAME}"
  replicas: ${WORKER_MACHINE_COUNT}
  template:
    spec:
      clusterName: "${CLUSTER_NAME}"
      bootstrap:
        # EKS managed node groups bootstrap themselves; an empty secret name
        # tells Cluster API no bootstrap data is needed.
        dataSecretName: ""
      infrastructureRef:
        name: "${CLUSTER_NAME}-pool-0"
        apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
        kind: AWSManagedMachinePool
---
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: AWSManagedMachinePool
metadata:
  name: "${CLUSTER_NAME}-pool-0"
spec:
  # GPU-enabled Amazon Linux 2 EKS-optimized AMI on a GPU instance type.
  amiType: "AL2_x86_64_GPU"
  instanceType: "g4dn.xlarge"
---
apiVersion: addons.cluster.x-k8s.io/v1beta1
kind: ClusterResourceSet
metadata:
  name: crs-nvidia
spec:
  strategy: "ApplyOnce"
  clusterSelector:
    matchLabels:
      gpu: "nvidia"
  resources:
    - name: nvidia-addon
      kind: ConfigMap
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: nvidia-addon
data:
  # Applied verbatim to the workload cluster by the ClusterResourceSet above.
  nvidia-device-plugin.yaml: |
    # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
    #
    # Licensed under the Apache License, Version 2.0 (the "License");
    # you may not use this file except in compliance with the License.
    # You may obtain a copy of the License at
    #
    #     http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.

    apiVersion: apps/v1
    kind: DaemonSet
    metadata:
      name: nvidia-device-plugin-daemonset
      namespace: kube-system
    spec:
      selector:
        matchLabels:
          name: nvidia-device-plugin-ds
      updateStrategy:
        type: RollingUpdate
      template:
        metadata:
          # This annotation is deprecated. Kept here for backward compatibility
          # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
          annotations:
            scheduler.alpha.kubernetes.io/critical-pod: ""
          labels:
            name: nvidia-device-plugin-ds
        spec:
          tolerations:
            # This toleration is deprecated. Kept here for backward compatibility
            # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
            - key: CriticalAddonsOnly
              operator: Exists
            - key: nvidia.com/gpu
              operator: Exists
              effect: NoSchedule
          # Mark this pod as a critical add-on; when enabled, the critical add-on
          # scheduler reserves resources for critical add-on pods so that they can
          # be rescheduled after a failure.
          # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
          priorityClassName: "system-node-critical"
          containers:
            - image: nvidia/k8s-device-plugin:v0.8.0
              name: nvidia-device-plugin-ctr
              args: ["--fail-on-init-error=false"]
              securityContext:
                allowPrivilegeEscalation: false
                capabilities:
                  drop: ["ALL"]
              volumeMounts:
                - name: device-plugin
                  mountPath: /var/lib/kubelet/device-plugins
          volumes:
            - name: device-plugin
              hostPath:
                path: /var/lib/kubelet/device-plugins