# Source: sigs.k8s.io/cluster-api-provider-aws@v1.5.5/templates/cluster-template-eks-managedmachinepool-gpu.yaml

     1  ---
     2  apiVersion: cluster.x-k8s.io/v1beta1
     3  kind: Cluster
     4  metadata:
     5    name: "${CLUSTER_NAME}"
     6    labels:
     7      gpu: "nvidia"
     8  spec:
     9    clusterNetwork:
    10      pods:
    11        cidrBlocks: ["192.168.0.0/16"]
    12    infrastructureRef:
    13      kind: AWSManagedControlPlane
    14      apiVersion: controlplane.cluster.x-k8s.io/v1beta1
    15      name: "${CLUSTER_NAME}-control-plane"
    16    controlPlaneRef:
    17      kind: AWSManagedControlPlane
    18      apiVersion: controlplane.cluster.x-k8s.io/v1beta1
    19      name: "${CLUSTER_NAME}-control-plane"
    20  ---
    21  kind: AWSManagedControlPlane
    22  apiVersion: controlplane.cluster.x-k8s.io/v1beta1
    23  metadata:
    24    name: "${CLUSTER_NAME}-control-plane"
    25  spec:
    26    region: "${AWS_REGION}"
    27    sshKeyName: "${AWS_SSH_KEY_NAME}"
    28    version: "${KUBERNETES_VERSION}"
    29    addons:
    30      - name: "vpc-cni"
    31        version: "${VPC_ADDON_VERSION:=v1.7.5-eksbuild.1}"
    32        conflictResolution: "overwrite"
    33  ---
    34  apiVersion: cluster.x-k8s.io/v1beta1
    35  kind: MachinePool
    36  metadata:
    37    name: "${CLUSTER_NAME}-pool-0"
    38  spec:
    39    clusterName: "${CLUSTER_NAME}"
    40    replicas: ${WORKER_MACHINE_COUNT}
    41    template:
    42      spec:
    43        clusterName: "${CLUSTER_NAME}"
    44        bootstrap:
    45          dataSecretName: ""
    46        infrastructureRef:
    47          name: "${CLUSTER_NAME}-pool-0"
    48          apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
    49          kind: AWSManagedMachinePool
    50  ---
    51  apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
    52  kind: AWSManagedMachinePool
    53  metadata:
    54    name: "${CLUSTER_NAME}-pool-0"
    55  spec:
    56    amiType: "AL2_x86_64_GPU"
    57    instanceType: "g4dn.xlarge"
    58  ---
    59  apiVersion: addons.cluster.x-k8s.io/v1beta1
    60  kind: ClusterResourceSet
    61  metadata:
    62   name: crs-nvidia
    63  spec:
    64   strategy: "ApplyOnce"
    65   clusterSelector:
    66     matchLabels:
    67       gpu: "nvidia"
    68   resources:
    69     - name: nvidia-addon
    70       kind: ConfigMap
    71  ---
    72  apiVersion: v1
    73  kind: ConfigMap
    74  metadata:
    75    name: nvidia-addon
    76  data:
    77    nvidia-device-plugin.yaml: |
    78      # Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
    79      #
    80      # Licensed under the Apache License, Version 2.0 (the "License");
    81      # you may not use this file except in compliance with the License.
    82      # You may obtain a copy of the License at
    83      #
    84      #     http://www.apache.org/licenses/LICENSE-2.0
    85      #
    86      # Unless required by applicable law or agreed to in writing, software
    87      # distributed under the License is distributed on an "AS IS" BASIS,
    88      # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    89      # See the License for the specific language governing permissions and
    90      # limitations under the License.
    91  
    92      apiVersion: apps/v1
    93      kind: DaemonSet
    94      metadata:
    95        name: nvidia-device-plugin-daemonset
    96        namespace: kube-system
    97      spec:
    98        selector:
    99          matchLabels:
   100            name: nvidia-device-plugin-ds
   101        updateStrategy:
   102          type: RollingUpdate
   103        template:
   104          metadata:
   105            # This annotation is deprecated. Kept here for backward compatibility
   106            # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
   107            annotations:
   108              scheduler.alpha.kubernetes.io/critical-pod: ""
   109            labels:
   110              name: nvidia-device-plugin-ds
   111          spec:
   112            tolerations:
   113            # This toleration is deprecated. Kept here for backward compatibility
   114            # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
   115            - key: CriticalAddonsOnly
   116              operator: Exists
   117            - key: nvidia.com/gpu
   118              operator: Exists
   119              effect: NoSchedule
   120            # Mark this pod as a critical add-on; when enabled, the critical add-on
   121            # scheduler reserves resources for critical add-on pods so that they can
   122            # be rescheduled after a failure.
   123            # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
   124            priorityClassName: "system-node-critical"
   125            containers:
   126            - image: nvidia/k8s-device-plugin:v0.8.0
   127              name: nvidia-device-plugin-ctr
   128              args: ["--fail-on-init-error=false"]
   129              securityContext:
   130                allowPrivilegeEscalation: false
   131                capabilities:
   132                  drop: ["ALL"]
   133              volumeMounts:
   134                - name: device-plugin
   135                  mountPath: /var/lib/kubelet/device-plugins
   136            volumes:
   137              - name: device-plugin
   138                hostPath:
   139                  path: /var/lib/kubelet/device-plugins