# Prow periodic jobs for SIG Scalability
# (config/jobs/kubernetes/sig-scalability/sig-scalability-release-blocking-jobs.yaml
# in k8s.io/test-infra).
#
# Three jobs are defined here:
#   * ci-kubernetes-e2e-gce-scale-correctness - e2e correctness tests, 5000 nodes
#   * ci-kubernetes-e2e-gce-scale-performance - ClusterLoaderV2 load tests, 5000 nodes
#   * ci-kubernetes-e2e-gci-gce-scalability   - ClusterLoaderV2 load tests, 100 nodes
periodics:
# This is a sig-release-master-blocking job.
# NOTE(review): testgrid-dashboards below lists sig-release-master-informing,
# not sig-release-master-blocking - confirm which one is intended.
# The frequency was cut to reduce infrastructure costs.
- cron: '1 17 2-31/2 * *'  # Run on even days at 9:01PST (17:01 UTC)
  name: ci-kubernetes-e2e-gce-scale-correctness
  cluster: k8s-infra-prow-build
  labels:
    preset-service-account: "true"
    preset-k8s-ssh: "true"
    preset-e2e-scalability-common: "true"
    preset-e2e-scalability-periodics: "true"
    preset-e2e-scalability-periodics-master: "true"
  decorate: true
  decoration_config:
    # Job-level timeout; 30m longer than the kubetest --timeout=240m below.
    timeout: 270m
  annotations:
    testgrid-num-failures-to-alert: '2'
    testgrid-alert-email: kubernetes-sig-scale@googlegroups.com, kubernetes-scalability-tickets@google.com, release-team@kubernetes.io
    testgrid-dashboards: sig-release-master-informing, sig-scalability-gce, google-gce
    testgrid-tab-name: gce-master-scale-correctness
    testgrid-base-options: 'exclude-filter-by-regex=^(kubetest\.Test|ci-kubernetes-e2e-gce-scale-correctness\.Overall)$'
    description: "Uses kubetest to run correctness tests against a 5000-node cluster created with cluster/kube-up.sh"
  spec:
    containers:
    - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20240515-17c6d50e24-master
      command:
      - runner.sh
      - /workspace/scenarios/kubernetes_e2e.py
      args:
      - --cluster=gce-scale-cluster
      - --env=CONCURRENT_SERVICE_SYNCS=20  # support 20 LoadBalancer Services in parallel to deal with existing CI load #122286
      - --env=HEAPSTER_MACHINE_TYPE=e2-standard-32
      - --extract=ci/fast/latest-fast
      - --extract-ci-bucket=k8s-release-dev
      - --env=KUBE_CONTROLLER_MANAGER_TEST_ARGS=--endpointslice-updates-batch-period=500ms --endpoint-updates-batch-period=500ms
      # Overrides CONTROLLER_MANAGER_TEST_ARGS from preset-e2e-scalability-periodics.
      - --env=CONTROLLER_MANAGER_TEST_ARGS=--profiling --contention-profiling --kube-api-qps=100 --kube-api-burst=100
      - --gcp-master-image=gci
      - --gcp-node-image=gci
      - --gcp-node-size=e2-small
      - --gcp-nodes=5000
      - --gcp-project-type=scalability-scale-project
      - --gcp-ssh-proxy-instance-name=gce-scale-cluster-master
      - --gcp-zone=us-east1-b
      - --ginkgo-parallel=40
      - --provider=gce
      # Skips the GCE PD driver, [Serial], [Disruptive] and [Flaky] tests, plus
      # every [Feature:...] tag whose name does not begin with "Load" (the
      # four-way alternation rejects any name that diverges from "Load" within
      # its first four characters).
      - --test_args=--ginkgo.skip=\[Driver:.gcepd\]|\[Serial\]|\[Disruptive\]|\[Flaky\]|\[Feature:([^L].*|L[^o].*|Lo[^a].*|Loa[^d].*)\] --minStartupPods=8 --node-schedulable-timeout=90m
      - --timeout=240m
      - --use-logexporter
      - --logexporter-gcs-path=gs://k8s-infra-scalability-tests-logs/$(JOB_NAME)/$(BUILD_ID)
      resources:
        requests:
          cpu: 6
          memory: "39Gi"
        limits:
          cpu: 6
          memory: "39Gi"

# This is a sig-release-master-blocking job.
# NOTE(review): testgrid-dashboards below lists sig-release-master-informing,
# not sig-release-master-blocking - confirm which one is intended.
# The frequency was cut to reduce infrastructure costs.
- cron: '1 17 1-31/2 * *'  # Run on odd days at 9:01PST (17:01 UTC)
  name: ci-kubernetes-e2e-gce-scale-performance
  tags:
  - "perfDashPrefix: gce-5000Nodes"
  - "perfDashBuildsCount: 270"
  - "perfDashJobType: performance"
  cluster: k8s-infra-prow-build
  labels:
    preset-service-account: "true"
    preset-k8s-ssh: "true"
    preset-e2e-scalability-common: "true"
    preset-e2e-scalability-periodics: "true"
    preset-e2e-scalability-periodics-master: "true"
  decorate: true
  decoration_config:
    # Job-level timeout; 30m longer than the kubetest --timeout=420m below.
    timeout: 450m
  extra_refs:
  - org: kubernetes
    repo: kubernetes
    base_ref: master
    path_alias: k8s.io/kubernetes
  - org: kubernetes
    repo: perf-tests
    base_ref: master
    path_alias: k8s.io/perf-tests
  annotations:
    testgrid-alert-email: kubernetes-sig-scale@googlegroups.com, kubernetes-scalability-tickets@google.com, release-team@kubernetes.io
    testgrid-dashboards: sig-release-master-informing, sig-scalability-gce, google-gce
    testgrid-tab-name: gce-master-scale-performance
    description: "Uses kubetest to run k8s.io/perf-tests/run-e2e.sh against a 5000-node cluster created with cluster/kube-up.sh"
  spec:
    containers:
    - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20240515-17c6d50e24-master
      command:
      - runner.sh
      - /workspace/scenarios/kubernetes_e2e.py
      args:
      - --cluster=gce-scale-cluster
      - --env=HEAPSTER_MACHINE_TYPE=e2-standard-32
      # TODO(mborsz): Adjust or remove this change once we understand coredns
      # memory usage regression.
      - --env=KUBE_DNS_MEMORY_LIMIT=300Mi
      - --extract=ci/fast/latest-fast
      - --extract-ci-bucket=k8s-release-dev
      - --gcp-nodes=5000
      - --gcp-project-type=scalability-scale-project
      - --gcp-zone=us-east1-b
      - --provider=gce
      - --metadata-sources=cl2-metadata.json
      - --env=CL2_LOAD_TEST_THROUGHPUT=50
      - --env=CL2_DELETE_TEST_THROUGHPUT=50
      - --env=CL2_RATE_LIMIT_POD_CREATION=false
      - --env=KUBE_CONTROLLER_MANAGER_TEST_ARGS=--endpointslice-updates-batch-period=500ms --endpoint-updates-batch-period=500ms
      # Overrides CONTROLLER_MANAGER_TEST_ARGS from preset-e2e-scalability-periodics.
      - --env=CONTROLLER_MANAGER_TEST_ARGS=--profiling --contention-profiling --kube-api-qps=100 --kube-api-burst=100
      # Overrides SCHEDULER_TEST_ARGS from preset-e2e-scalability-periodics.
      # TODO(#1311): Clean this up after the experiment - it should allow
      # to hugely decrease pod-startup-latency across the whole test.
      # Given that individual controllers have separate QPS limits, we allow
      # scheduler to keep up with the load from deployment, daemonset and job
      # performing pod creations at once.
      - --env=SCHEDULER_TEST_ARGS=--profiling --contention-profiling --kube-api-qps=500 --kube-api-burst=500
      # With APF only sum of --max-requests-inflight and --max-mutating-requests-inflight matters, so set --max-mutating-requests-inflight to 0.
      - --env=APISERVER_TEST_ARGS=--max-requests-inflight=640 --max-mutating-requests-inflight=0
      - --env=CL2_ENABLE_API_AVAILABILITY_MEASUREMENT=true
      - --env=CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD=99.5
      # --test=false together with --test-cmd swaps the default test run for
      # the perf-tests run-e2e.sh entry point checked out via extra_refs.
      # NOTE(review): flag semantics are defined by kubetest, not visible here - confirm.
      - --test=false
      - --test-cmd=$GOPATH/src/k8s.io/perf-tests/run-e2e.sh
      - --test-cmd-args=cluster-loader2
      - --test-cmd-args=--experimental-gcp-snapshot-prometheus-disk=true
      - --test-cmd-args=--experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID)
      - --test-cmd-args=--experimental-prometheus-snapshot-to-report-dir=true
      - --test-cmd-args=--nodes=5000
      - --test-cmd-args=--prometheus-scrape-node-exporter
      - --test-cmd-args=--provider=gce
      - --test-cmd-args=--report-dir=$(ARTIFACTS)
      - --test-cmd-args=--testconfig=testing/load/config.yaml
      - --test-cmd-args=--testconfig=testing/huge-service/config.yaml
      - --test-cmd-args=--testconfig=testing/access-tokens/config.yaml
      - --test-cmd-args=--testoverrides=./testing/experiments/enable_restart_count_check.yaml
      - --test-cmd-args=--testoverrides=./testing/experiments/ignore_known_gce_container_restarts.yaml
      - --test-cmd-args=--testoverrides=./testing/overrides/5000_nodes.yaml
      - --test-cmd-name=ClusterLoaderV2
      - --timeout=420m
      - --use-logexporter
      - --logexporter-gcs-path=gs://k8s-infra-scalability-tests-logs/$(JOB_NAME)/$(BUILD_ID)
      resources:
        requests:
          cpu: 6
          memory: "16Gi"
        limits:
          cpu: 6
          memory: "16Gi"

# 100-node ClusterLoaderV2 run against master, started every 30 minutes.
- interval: 30m
  cluster: k8s-infra-prow-build
  name: ci-kubernetes-e2e-gci-gce-scalability
  tags:
  - "perfDashPrefix: gce-100Nodes-master"
  - "perfDashJobType: performance"
  - "perfDashBuildsCount: 500"
  labels:
    preset-service-account: "true"
    preset-k8s-ssh: "true"
    preset-e2e-scalability-common: "true"
    preset-e2e-scalability-periodics: "true"
    preset-e2e-scalability-periodics-master: "true"
  decorate: true
  decoration_config:
    # Job-level timeout; 20m longer than the kubetest --timeout=120m below.
    timeout: 140m
  extra_refs:
  - org: kubernetes
    repo: kubernetes
    base_ref: master
    path_alias: k8s.io/kubernetes
  - org: kubernetes
    repo: perf-tests
    base_ref: master
    path_alias: k8s.io/perf-tests
  annotations:
    # The fork-per-release-* annotations describe how this job is copied for
    # release branches: the listed label is deleted and the "a -> b" pairs are
    # substituted into the copy.
    # NOTE(review): exact semantics are defined by the forking tool, not
    # visible here - confirm.
    fork-per-release: "true"
    fork-per-release-cron: 0 */6 * * *, 0 0/12 * * *, 0 4-16/12 * * *, 0 8-20/12 * * *, 0 8-20/24 * * *
    fork-per-release-deletions: "preset-e2e-scalability-periodics-master"
    fork-per-release-replacements: "--extract=ci/fast/latest-fast -> --extract=ci/latest-{{.Version}}, gce-100Nodes-master -> gce-100Nodes-{{.Version}}"
    testgrid-dashboards: sig-release-master-blocking, sig-scalability-gce, google-gce, google-gci
    testgrid-tab-name: gce-cos-master-scalability-100
    testgrid-alert-email: kubernetes-sig-scale@googlegroups.com, kubernetes-scalability-tickets@google.com
    description: "Uses kubetest to run k8s.io/perf-tests/run-e2e.sh against a 100-node cluster created with cluster/kube-up.sh"
    testgrid-num-failures-to-alert: '2'
  spec:
    containers:
    - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20240515-17c6d50e24-master
      command:
      - runner.sh
      - /workspace/scenarios/kubernetes_e2e.py
      args:
      - --check-leaked-resources
      - --cluster=e2e-big
      - --env=APISERVER_TEST_ARGS=--max-requests-inflight=80 --max-mutating-requests-inflight=0 --profiling --contention-profiling
      - --env=HEAPSTER_MACHINE_TYPE=e2-standard-8
      - --extract=ci/fast/latest-fast
      - --extract-ci-bucket=k8s-release-dev
      - --gcp-node-image=gci
      - --gcp-nodes=100
      - --gcp-project-type=scalability-project
      - --gcp-zone=us-east1-b
      - --provider=gce
      - --metadata-sources=cl2-metadata.json
      - --env=CL2_ENABLE_DNS_PROGRAMMING=true
      - --env=CL2_SCHEDULER_THROUGHPUT_THRESHOLD=0
      - --env=CL2_ENABLE_API_AVAILABILITY_MEASUREMENT=true
      - --env=CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD=99.5
      - --test=false
      - --test-cmd=$GOPATH/src/k8s.io/perf-tests/run-e2e.sh
      - --test-cmd-args=cluster-loader2
      - --test-cmd-args=--experimental-gcp-snapshot-prometheus-disk=true
      - --test-cmd-args=--experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID)
      - --test-cmd-args=--experimental-prometheus-snapshot-to-report-dir=true
      - --test-cmd-args=--nodes=100
      - --test-cmd-args=--prometheus-scrape-kubelets=true
      - --test-cmd-args=--prometheus-scrape-node-exporter
      - --test-cmd-args=--provider=gce
      - --test-cmd-args=--report-dir=$(ARTIFACTS)
      - --test-cmd-args=--testconfig=testing/load/config.yaml
      - --test-cmd-args=--testconfig=testing/huge-service/config.yaml
      - --test-cmd-args=--testconfig=testing/access-tokens/config.yaml
      # TODO(oxddr): re-enable this once we understand its impact on tests, https://github.com/kubernetes/kubernetes/issues/89051
      # - --test-cmd-args=--testoverrides=./testing/chaosmonkey/override.yaml
      # - --test-cmd-args=--testoverrides=./testing/chaosmonkey/ignore_node_killer_container_restarts_100.yaml
      - --test-cmd-args=--testoverrides=./testing/experiments/enable_restart_count_check.yaml
      - --test-cmd-args=--testoverrides=./testing/experiments/use_simple_latency_query.yaml
      - --test-cmd-args=--testoverrides=./testing/overrides/load_throughput.yaml
      - --test-cmd-name=ClusterLoaderV2
      - --timeout=120m
      - --use-logexporter
      # NOTE(review): the two 5000-node jobs in this file export logs to
      # gs://k8s-infra-scalability-tests-logs; this job uses
      # gs://sig-scalability-logs - confirm the difference is intended.
      - --logexporter-gcs-path=gs://sig-scalability-logs/$(JOB_NAME)/$(BUILD_ID)
      resources:
        requests:
          cpu: 2
          memory: 6Gi
        limits:
          cpu: 2
          memory: 6Gi