k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/config/jobs/kubernetes/sig-scalability/sig-scalability-release-blocking-jobs.yaml (about)

     1  periodics:  # Each job entry below is triggered either by a cron spec or by a fixed interval
     2  # Release-signal job. NOTE(review): testgrid-dashboards below lists sig-release-master-informing, not -blocking - confirm which is intended.
     3  # Runs only every other day; the frequency was cut to reduce infrastructure costs.
     4  - cron: '1 17 2-31/2 * *' # Even-numbered days of the month at 17:01 UTC (09:01 PST)
     5    name: ci-kubernetes-e2e-gce-scale-correctness
     6    cluster: k8s-infra-prow-build
     7    labels:
     8      preset-service-account: "true"
     9      preset-k8s-ssh: "true"
    10      preset-e2e-scalability-common: "true"
    11      preset-e2e-scalability-periodics: "true"
    12      preset-e2e-scalability-periodics-master: "true"
    13    decorate: true
    14    decoration_config:
    15      timeout: 270m  # 30m more than the --timeout=240m passed in args below
    16    annotations:
    17      testgrid-num-failures-to-alert: '2'
    18      testgrid-alert-email: kubernetes-sig-scale@googlegroups.com, kubernetes-scalability-tickets@google.com, release-team@kubernetes.io
    19      testgrid-dashboards: sig-release-master-informing, sig-scalability-gce, google-gce
    20      testgrid-tab-name: gce-master-scale-correctness
    21      testgrid-base-options: 'exclude-filter-by-regex=^(kubetest\.Test|ci-kubernetes-e2e-gce-scale-correctness\.Overall)$'
    22      description: "Uses kubetest to run correctness tests against a 5000-node cluster created with cluster/kube-up.sh"
    23    spec:
    24      containers:
    25      - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20240515-17c6d50e24-master
    26        command:
    27        - runner.sh
    28        - /workspace/scenarios/kubernetes_e2e.py
    29        args:
    30        - --cluster=gce-scale-cluster
    31        - --env=CONCURRENT_SERVICE_SYNCS=20 # handle 20 LoadBalancer Services in parallel to cope with existing CI load (see #122286)
    32        - --env=HEAPSTER_MACHINE_TYPE=e2-standard-32
    33        - --extract=ci/fast/latest-fast
    34        - --extract-ci-bucket=k8s-release-dev
    35        - --env=KUBE_CONTROLLER_MANAGER_TEST_ARGS=--endpointslice-updates-batch-period=500ms --endpoint-updates-batch-period=500ms
    36        # Overrides the CONTROLLER_MANAGER_TEST_ARGS value set by preset-e2e-scalability-periodics.
    37        - --env=CONTROLLER_MANAGER_TEST_ARGS=--profiling --contention-profiling --kube-api-qps=100 --kube-api-burst=100
    38        - --gcp-master-image=gci
    39        - --gcp-node-image=gci
    40        - --gcp-node-size=e2-small
    41        - --gcp-nodes=5000
    42        - --gcp-project-type=scalability-scale-project
    43        - --gcp-ssh-proxy-instance-name=gce-scale-cluster-master
    44        - --gcp-zone=us-east1-b
    45        - --ginkgo-parallel=40
    46        - --provider=gce
    47        - --test_args=--ginkgo.skip=\[Driver:.gcepd\]|\[Serial\]|\[Disruptive\]|\[Flaky\]|\[Feature:([^L].*|L[^o].*|Lo[^a].*|Loa[^d].*)\] --minStartupPods=8 --node-schedulable-timeout=90m  # the [Feature:...] term is written to skip feature-tagged tests unless the tag name starts with "Load"
    48        - --timeout=240m
    49        - --use-logexporter
    50        - --logexporter-gcs-path=gs://k8s-infra-scalability-tests-logs/$(JOB_NAME)/$(BUILD_ID)
    51        resources:
    52          requests:
    53            cpu: 6
    54            memory: "39Gi"
    55          limits:  # same values as requests
    56            cpu: 6
    57            memory: "39Gi"
    58  
    59  # Release-signal job. NOTE(review): testgrid-dashboards below lists sig-release-master-informing, not -blocking - confirm which is intended.
    60  # Runs only every other day; the frequency was cut to reduce infrastructure costs.
    61  - cron: '1 17 1-31/2 * *' # Odd-numbered days of the month at 17:01 UTC (09:01 PST)
    62    name: ci-kubernetes-e2e-gce-scale-performance
    63    tags:
    64    - "perfDashPrefix: gce-5000Nodes"
    65    - "perfDashBuildsCount: 270"
    66    - "perfDashJobType: performance"
    67    cluster: k8s-infra-prow-build
    68    labels:
    69      preset-service-account: "true"
    70      preset-k8s-ssh: "true"
    71      preset-e2e-scalability-common: "true"
    72      preset-e2e-scalability-periodics: "true"
    73      preset-e2e-scalability-periodics-master: "true"
    74    decorate: true
    75    decoration_config:
    76      timeout: 450m  # 30m more than the --timeout=420m passed in args below
    77    extra_refs:
    78    - org: kubernetes
    79      repo: kubernetes
    80      base_ref: master
    81      path_alias: k8s.io/kubernetes
    82    - org: kubernetes
    83      repo: perf-tests
    84      base_ref: master
    85      path_alias: k8s.io/perf-tests
    86    annotations:
    87      testgrid-alert-email: kubernetes-sig-scale@googlegroups.com, kubernetes-scalability-tickets@google.com, release-team@kubernetes.io
    88      testgrid-dashboards: sig-release-master-informing, sig-scalability-gce, google-gce
    89      testgrid-tab-name: gce-master-scale-performance
    90      description: "Uses kubetest to run k8s.io/perf-tests/run-e2e.sh against a 5000-node cluster created with cluster/kube-up.sh"
    91    spec:
    92      containers:
    93      - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20240515-17c6d50e24-master
    94        command:
    95        - runner.sh
    96        - /workspace/scenarios/kubernetes_e2e.py
    97        args:
    98        - --cluster=gce-scale-cluster
    99        - --env=HEAPSTER_MACHINE_TYPE=e2-standard-32
   100        # TODO(mborsz): Adjust or remove this override once the coredns
   101        # memory usage regression is understood.
   102        - --env=KUBE_DNS_MEMORY_LIMIT=300Mi
   103        - --extract=ci/fast/latest-fast
   104        - --extract-ci-bucket=k8s-release-dev
   105        - --gcp-nodes=5000
   106        - --gcp-project-type=scalability-scale-project
   107        - --gcp-zone=us-east1-b
   108        - --provider=gce
   109        - --metadata-sources=cl2-metadata.json
   110        - --env=CL2_LOAD_TEST_THROUGHPUT=50
   111        - --env=CL2_DELETE_TEST_THROUGHPUT=50
   112        - --env=CL2_RATE_LIMIT_POD_CREATION=false
   113        - --env=KUBE_CONTROLLER_MANAGER_TEST_ARGS=--endpointslice-updates-batch-period=500ms --endpoint-updates-batch-period=500ms
   114        # Overrides the CONTROLLER_MANAGER_TEST_ARGS value set by preset-e2e-scalability-periodics.
   115        - --env=CONTROLLER_MANAGER_TEST_ARGS=--profiling --contention-profiling --kube-api-qps=100 --kube-api-burst=100
   116        # Overrides the SCHEDULER_TEST_ARGS value set by preset-e2e-scalability-periodics.
   117        # TODO(#1311): Clean this up after the experiment - it is expected to
   118        #   greatly reduce pod-startup-latency across the whole test.
   119        #   Because individual controllers have separate QPS limits, this lets the
   120        #   scheduler keep up with the deployment, daemonset and job controllers
   121        #   all creating pods at once.
   122        - --env=SCHEDULER_TEST_ARGS=--profiling --contention-profiling --kube-api-qps=500 --kube-api-burst=500
   123        # With APF only the sum of --max-requests-inflight and --max-mutating-requests-inflight matters, so the whole budget (640) is given to the former and the latter is set to 0.
   124        - --env=APISERVER_TEST_ARGS=--max-requests-inflight=640 --max-mutating-requests-inflight=0
   125        - --env=CL2_ENABLE_API_AVAILABILITY_MEASUREMENT=true
   126        - --env=CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD=99.5
   127        - --test=false  # NOTE(review): presumably turns off the default e2e suite so that only --test-cmd runs - confirm
   128        - --test-cmd=$GOPATH/src/k8s.io/perf-tests/run-e2e.sh
   129        - --test-cmd-args=cluster-loader2
   130        - --test-cmd-args=--experimental-gcp-snapshot-prometheus-disk=true
   131        - --test-cmd-args=--experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID)
   132        - --test-cmd-args=--experimental-prometheus-snapshot-to-report-dir=true
   133        - --test-cmd-args=--nodes=5000
   134        - --test-cmd-args=--prometheus-scrape-node-exporter
   135        - --test-cmd-args=--provider=gce
   136        - --test-cmd-args=--report-dir=$(ARTIFACTS)
   137        - --test-cmd-args=--testconfig=testing/load/config.yaml
   138        - --test-cmd-args=--testconfig=testing/huge-service/config.yaml
   139        - --test-cmd-args=--testconfig=testing/access-tokens/config.yaml
   140        - --test-cmd-args=--testoverrides=./testing/experiments/enable_restart_count_check.yaml
   141        - --test-cmd-args=--testoverrides=./testing/experiments/ignore_known_gce_container_restarts.yaml
   142        - --test-cmd-args=--testoverrides=./testing/overrides/5000_nodes.yaml
   143        - --test-cmd-name=ClusterLoaderV2
   144        - --timeout=420m
   145        - --use-logexporter
   146        - --logexporter-gcs-path=gs://k8s-infra-scalability-tests-logs/$(JOB_NAME)/$(BUILD_ID)
   147        resources:
   148          requests:
   149            cpu: 6
   150            memory: "16Gi"
   151          limits:  # same values as requests
   152            cpu: 6
   153            memory: "16Gi"
   154  
   155  - interval: 30m  # 100-node ClusterLoaderV2 job (see description); scheduled by interval rather than cron
   156    cluster: k8s-infra-prow-build
   157    name: ci-kubernetes-e2e-gci-gce-scalability
   158    tags:
   159    - "perfDashPrefix: gce-100Nodes-master"
   160    - "perfDashJobType: performance"
   161    - "perfDashBuildsCount: 500"
   162    labels:
   163      preset-service-account: "true"
   164      preset-k8s-ssh: "true"
   165      preset-e2e-scalability-common: "true"
   166      preset-e2e-scalability-periodics: "true"
   167      preset-e2e-scalability-periodics-master: "true"
   168    decorate: true
   169    decoration_config:
   170      timeout: 140m  # 20m more than the --timeout=120m passed in args below
   171    extra_refs:
   172    - org: kubernetes
   173      repo: kubernetes
   174      base_ref: master
   175      path_alias: k8s.io/kubernetes
   176    - org: kubernetes
   177      repo: perf-tests
   178      base_ref: master
   179      path_alias: k8s.io/perf-tests
   180    annotations:
   181      fork-per-release: "true"
   182      fork-per-release-cron: 0 */6 * * *, 0 0/12 * * *, 0 4-16/12 * * *, 0 8-20/12 * * *, 0 8-20/24 * * *  # NOTE(review): presumably one cron spec per forked release-branch copy of this job - confirm
   183      fork-per-release-deletions: "preset-e2e-scalability-periodics-master"
   184      fork-per-release-replacements: "--extract=ci/fast/latest-fast -> --extract=ci/latest-{{.Version}}, gce-100Nodes-master -> gce-100Nodes-{{.Version}}"
   185      testgrid-dashboards: sig-release-master-blocking, sig-scalability-gce, google-gce, google-gci
   186      testgrid-tab-name: gce-cos-master-scalability-100
   187      testgrid-alert-email: kubernetes-sig-scale@googlegroups.com, kubernetes-scalability-tickets@google.com
   188      description: "Uses kubetest to run k8s.io/perf-tests/run-e2e.sh against a 100-node cluster created with cluster/kube-up.sh"
   189      testgrid-num-failures-to-alert: '2'
   190    spec:
   191      containers:
   192      - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20240515-17c6d50e24-master
   193        command:
   194        - runner.sh
   195        - /workspace/scenarios/kubernetes_e2e.py
   196        args:
   197        - --check-leaked-resources
   198        - --cluster=e2e-big
   199        - --env=APISERVER_TEST_ARGS=--max-requests-inflight=80 --max-mutating-requests-inflight=0 --profiling --contention-profiling
   200        - --env=HEAPSTER_MACHINE_TYPE=e2-standard-8
   201        - --extract=ci/fast/latest-fast
   202        - --extract-ci-bucket=k8s-release-dev
   203        - --gcp-node-image=gci
   204        - --gcp-nodes=100
   205        - --gcp-project-type=scalability-project
   206        - --gcp-zone=us-east1-b
   207        - --provider=gce
   208        - --metadata-sources=cl2-metadata.json
   209        - --env=CL2_ENABLE_DNS_PROGRAMMING=true
   210        - --env=CL2_SCHEDULER_THROUGHPUT_THRESHOLD=0
   211        - --env=CL2_ENABLE_API_AVAILABILITY_MEASUREMENT=true
   212        - --env=CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD=99.5
   213        - --test=false  # NOTE(review): presumably turns off the default e2e suite so that only --test-cmd runs - confirm
   214        - --test-cmd=$GOPATH/src/k8s.io/perf-tests/run-e2e.sh
   215        - --test-cmd-args=cluster-loader2
   216        - --test-cmd-args=--experimental-gcp-snapshot-prometheus-disk=true
   217        - --test-cmd-args=--experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID)
   218        - --test-cmd-args=--experimental-prometheus-snapshot-to-report-dir=true
   219        - --test-cmd-args=--nodes=100
   220        - --test-cmd-args=--prometheus-scrape-kubelets=true
   221        - --test-cmd-args=--prometheus-scrape-node-exporter
   222        - --test-cmd-args=--provider=gce
   223        - --test-cmd-args=--report-dir=$(ARTIFACTS)
   224        - --test-cmd-args=--testconfig=testing/load/config.yaml
   225        - --test-cmd-args=--testconfig=testing/huge-service/config.yaml
   226        - --test-cmd-args=--testconfig=testing/access-tokens/config.yaml
   227        # TODO(oxddr): re-enable the two chaosmonkey overrides below once their impact on tests is understood, https://github.com/kubernetes/kubernetes/issues/89051
   228        # - --test-cmd-args=--testoverrides=./testing/chaosmonkey/override.yaml
   229        # - --test-cmd-args=--testoverrides=./testing/chaosmonkey/ignore_node_killer_container_restarts_100.yaml
   230        - --test-cmd-args=--testoverrides=./testing/experiments/enable_restart_count_check.yaml
   231        - --test-cmd-args=--testoverrides=./testing/experiments/use_simple_latency_query.yaml
   232        - --test-cmd-args=--testoverrides=./testing/overrides/load_throughput.yaml
   233        - --test-cmd-name=ClusterLoaderV2
   234        - --timeout=120m
   235        - --use-logexporter
   236        - --logexporter-gcs-path=gs://sig-scalability-logs/$(JOB_NAME)/$(BUILD_ID)  # NOTE(review): the two 5000-node jobs in this file export to gs://k8s-infra-scalability-tests-logs - confirm this bucket is intended
   237        resources:
   238          requests:
   239            cpu: 2
   240            memory: 6Gi
   241          limits:  # same values as requests
   242            cpu: 2
   243            memory: 6Gi