volcano.sh/volcano@v1.9.0/installer/volcano-monitoring-v1.9.0.yaml (about) 1 --- 2 # Source: volcano/templates/prometheus.yaml 3 apiVersion: v1 4 kind: ConfigMap 5 metadata: 6 name: prometheus-server-conf 7 labels: 8 name: prometheus-server-conf 9 namespace: volcano-monitoring 10 data: 11 prometheus.rules: |- 12 groups: 13 - name: devopscube demo alert 14 rules: 15 - alert: High Pod Memory 16 expr: sum(container_memory_usage_bytes) > 1 17 for: 1m 18 labels: 19 severity: slack 20 annotations: 21 summary: High Memory Usage 22 prometheus.yml: |- 23 global: 24 scrape_interval: 5s 25 evaluation_interval: 5s 26 rule_files: 27 - /etc/prometheus/prometheus.rules 28 alerting: 29 alertmanagers: 30 - scheme: http 31 static_configs: 32 - targets: 33 - "alertmanager.monitoring.svc:9093" 34 35 scrape_configs: 36 - job_name: 'kubernetes-apiservers' 37 38 kubernetes_sd_configs: 39 - role: endpoints 40 scheme: https 41 42 tls_config: 43 ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 44 bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 45 46 relabel_configs: 47 - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] 48 action: keep 49 regex: default;kubernetes;https 50 51 - job_name: 'kubernetes-nodes' 52 53 scheme: https 54 55 tls_config: 56 ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 57 bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 58 59 kubernetes_sd_configs: 60 - role: node 61 62 relabel_configs: 63 - action: labelmap 64 regex: __meta_kubernetes_node_label_(.+) 65 - target_label: __address__ 66 replacement: kubernetes.default.svc:443 67 - source_labels: [__meta_kubernetes_node_name] 68 regex: (.+) 69 target_label: __metrics_path__ 70 replacement: /api/v1/nodes/${1}/proxy/metrics 71 72 73 - job_name: 'kubernetes-pods' 74 75 kubernetes_sd_configs: 76 - role: pod 77 78 relabel_configs: 79 - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] 80 action: keep 81 regex: true 82 - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] 83 action: replace 84 target_label: __metrics_path__ 85 regex: (.+) 86 - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] 87 action: replace 88 regex: ([^:]+)(?::\d+)?;(\d+) 89 replacement: $1:$2 90 target_label: __address__ 91 - action: labelmap 92 regex: __meta_kubernetes_pod_label_(.+) 93 - source_labels: [__meta_kubernetes_namespace] 94 action: replace 95 target_label: kubernetes_namespace 96 - source_labels: [__meta_kubernetes_pod_name] 97 action: replace 98 target_label: kubernetes_pod_name 99 100 - job_name: 'kube-state-metrics' 101 static_configs: 102 - targets: ['kube-state-metrics.volcano-monitoring.svc.cluster.local:8080'] 103 104 - job_name: 'kubernetes-cadvisor' 105 106 scheme: https 107 108 tls_config: 109 ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 110 bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 111 112 kubernetes_sd_configs: 113 - role: node 114 115 relabel_configs: 116 - action: labelmap 117 regex: __meta_kubernetes_node_label_(.+) 118 - target_label: __address__ 119 replacement: kubernetes.default.svc:443 120 - source_labels: [__meta_kubernetes_node_name] 121 regex: (.+) 122 target_label: __metrics_path__ 123 replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor 124 125 - job_name: 'kubernetes-service-endpoints' 126 127 kubernetes_sd_configs: 128 - role: endpoints 129 130 relabel_configs: 131 - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] 132 action: keep 133 regex: true 134 - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] 135 action: replace 136 target_label: __scheme__ 137 regex: (https?) 138 - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] 139 action: replace 140 target_label: __metrics_path__ 141 regex: (.+) 142 - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] 143 action: replace 144 target_label: __address__ 145 regex: ([^:]+)(?::\d+)?;(\d+) 146 replacement: $1:$2 147 - action: labelmap 148 regex: __meta_kubernetes_service_label_(.+) 149 - source_labels: [__meta_kubernetes_namespace] 150 action: replace 151 target_label: kubernetes_namespace 152 - source_labels: [__meta_kubernetes_service_name] 153 action: replace 154 target_label: kubernetes_name 155 --- 156 # Source: volcano/templates/prometheus.yaml 157 apiVersion: rbac.authorization.k8s.io/v1 158 kind: ClusterRole 159 metadata: 160 name: prometheus-volcano 161 rules: 162 - apiGroups: [""] 163 resources: 164 - nodes 165 - nodes/proxy 166 - services 167 - endpoints 168 - pods 169 verbs: ["get", "list", "watch"] 170 - apiGroups: 171 - extensions 172 resources: 173 - ingresses 174 verbs: ["get", "list", "watch"] 175 - nonResourceURLs: ["/metrics"] 176 verbs: ["get"] 177 --- 178 # Source: volcano/templates/prometheus.yaml 179 apiVersion: rbac.authorization.k8s.io/v1 180 kind: ClusterRoleBinding 181 metadata: 182 name: prometheus-volcano 183 roleRef: 184 apiGroup: rbac.authorization.k8s.io 185 kind: ClusterRole 186 name: prometheus-volcano 187 subjects: 188 - kind: ServiceAccount 189 name: default 190 namespace: volcano-monitoring 191 --- 192 # Source: volcano/templates/prometheus.yaml 193 apiVersion: v1 194 kind: Service 195 metadata: 196 name: prometheus-service 197 namespace: volcano-monitoring 198 annotations: 199 prometheus.io/scrape: 'true' 200 prometheus.io/port: '9090' 201 202 spec: 203 selector: 204 app: prometheus-server 205 type: NodePort 206 ports: 207 - port: 8080 208 targetPort: 9090 209 nodePort: 30003 210 --- 211 # Source: volcano/templates/prometheus.yaml 212 apiVersion: apps/v1 213 kind: Deployment 214 metadata: 215 name: prometheus-deployment 216 namespace: volcano-monitoring 217 labels: 218 app: prometheus-server 219 spec: 220 replicas: 1 221 selector: 222 matchLabels: 223 app: prometheus-server 224 template: 225 metadata: 226 labels: 227 app: prometheus-server 228 spec: 229 containers: 230 - name: prometheus 231 image: prom/prometheus 232 args: 233 - "--config.file=/etc/prometheus/prometheus.yml" 234 - "--storage.tsdb.path=/prometheus/" 235 ports: 236 - containerPort: 9090 237 volumeMounts: 238 - name: prometheus-config-volume 239 mountPath: /etc/prometheus/ 240 - name: prometheus-storage-volume 241 mountPath: /prometheus/ 242 volumes: 243 - name: prometheus-config-volume 244 configMap: 245 defaultMode: 420 246 name: prometheus-server-conf 247 248 - name: prometheus-storage-volume 249 emptyDir: {} 250 --- 251 # Source: volcano/templates/kubestatemetrics.yaml 252 apiVersion: v1 253 kind: ServiceAccount 254 metadata: 255 labels: 256 app.kubernetes.io/name: kube-state-metrics 257 name: kube-state-metrics 258 namespace: volcano-monitoring 259 --- 260 # Source: volcano/templates/kubestatemetrics.yaml 261 apiVersion: rbac.authorization.k8s.io/v1 262 kind: ClusterRole 263 metadata: 264 labels: 265 app.kubernetes.io/name: kube-state-metrics 266 name: kube-state-metrics 267 rules: 268 - apiGroups: 269 - "" 270 resources: 271 - configmaps 272 - secrets 273 - nodes 274 - pods 275 - services 276 - resourcequotas 277 - replicationcontrollers 278 - limitranges 279 - persistentvolumeclaims 280 - persistentvolumes 281 - namespaces 282 - endpoints 283 verbs: 284 - list 285 - watch 286 - apiGroups: 287 - extensions 288 resources: 289 - daemonsets 290 - deployments 291 - replicasets 292 - ingresses 293 verbs: 294 - list 295 - watch 296 - apiGroups: 297 - apps 298 resources: 299 - statefulsets 300 - daemonsets 301 - deployments 302 - replicasets 303 verbs: 304 - list 305 - watch 306 - apiGroups: 307 - batch 308 resources: 309 - cronjobs 310 - jobs 311 verbs: 312 - list 313 - watch 314 - apiGroups: 315 - autoscaling 316 resources: 317 - horizontalpodautoscalers 318 verbs: 319 - list 320 - watch 321 - apiGroups: 322 - authentication.k8s.io 323 resources: 324 - tokenreviews 325 verbs: 326 - create 327 - apiGroups: 328 - authorization.k8s.io 329 resources: 330 - subjectaccessreviews 331 verbs: 332 - create 333 - apiGroups: 334 - policy 335 resources: 336 - poddisruptionbudgets 337 verbs: 338 - list 339 - watch 340 - apiGroups: 341 - certificates.k8s.io 342 resources: 343 - certificatesigningrequests 344 verbs: 345 - list 346 - watch 347 - apiGroups: 348 - storage.k8s.io 349 resources: 350 - storageclasses 351 - volumeattachments 352 verbs: 353 - list 354 - watch 355 - apiGroups: 356 - admissionregistration.k8s.io 357 resources: 358 - mutatingwebhookconfigurations 359 - validatingwebhookconfigurations 360 verbs: 361 - list 362 - watch 363 - apiGroups: 364 - networking.k8s.io 365 resources: 366 - networkpolicies 367 verbs: 368 - list 369 - watch 370 --- 371 # Source: volcano/templates/kubestatemetrics.yaml 372 apiVersion: rbac.authorization.k8s.io/v1 373 kind: ClusterRoleBinding 374 metadata: 375 labels: 376 app.kubernetes.io/name: kube-state-metrics 377 name: kube-state-metrics 378 roleRef: 379 apiGroup: rbac.authorization.k8s.io 380 kind: ClusterRole 381 name: kube-state-metrics 382 subjects: 383 - kind: ServiceAccount 384 name: kube-state-metrics 385 namespace: volcano-monitoring 386 --- 387 # Source: volcano/templates/kubestatemetrics.yaml 388 apiVersion: v1 389 kind: Service 390 metadata: 391 labels: 392 app.kubernetes.io/name: kube-state-metrics 393 name: kube-state-metrics 394 namespace: volcano-monitoring 395 annotations: 396 prometheus.io/path: /metrics 397 prometheus.io/port: "8080" 398 prometheus.io/scrape: "true" 399 spec: 400 ports: 401 - name: http-metrics 402 port: 8080 403 targetPort: http-metrics 404 - name: telemetry 405 port: 8081 406 targetPort: telemetry 407 selector: 408 k8s-app: kube-state-metrics 409 --- 410 # Source: volcano/templates/kubestatemetrics.yaml 411 apiVersion: apps/v1 412 kind: Deployment 413 metadata: 414 name: kube-state-metrics 415 namespace: volcano-monitoring 416 labels: 417 k8s-app: kube-state-metrics 418 spec: 419 progressDeadlineSeconds: 600 420 replicas: 1 421 selector: 422 matchLabels: 423 k8s-app: kube-state-metrics 424 strategy: 425 rollingUpdate: 426 maxSurge: 25% 427 maxUnavailable: 25% 428 type: RollingUpdate 429 template: 430 metadata: 431 labels: 432 k8s-app: kube-state-metrics 433 spec: 434 containers: 435 - image: quay.io/coreos/kube-state-metrics:v1.9.7 436 imagePullPolicy: Always 437 name: kube-state-metrics 438 ports: 439 - name: http-metrics 440 containerPort: 8080 441 readinessProbe: 442 httpGet: 443 path: /healthz 444 port: 8080 445 initialDelaySeconds: 5 446 timeoutSeconds: 5 447 dnsPolicy: ClusterFirst 448 449 nodeSelector: 450 node.kubernetes.io/instance-type: controlpanel 451 452 serviceAccountName: kube-state-metrics 453 --- 454 # Source: volcano/templates/grafana.yaml 455 apiVersion: v1 456 kind: ConfigMap 457 metadata: 458 name: grafana-datasources 459 namespace: volcano-monitoring 460 data: 461 prometheus.yaml: |- 462 { 463 "apiVersion": 1, 464 "datasources": [ 465 { 466 "access":"proxy", 467 "editable": true, 468 "isDefault": true, 469 "name": "prometheus", 470 "orgId": 1, 471 "type": "prometheus", 472 "url": "http://prometheus-service.volcano-monitoring.svc:8080", 473 "version": 1 474 } 475 ] 476 } 477 --- 478 # Source: volcano/templates/grafana.yaml 479 apiVersion: v1 480 kind: ConfigMap 481 metadata: 482 name: grafana-volcano-dashboard-config 483 namespace: volcano-monitoring 484 data: 485 dashboard.yaml: |- 486 apiVersion: 1 487 providers: 488 - name: dashboards 489 type: file 490 updateIntervalSeconds: 30 491 options: 492 path: /var/lib/grafana/dashboards 493 foldersFromFilesStructure: true 494 --- 495 # Source: volcano/templates/grafana.yaml 496 apiVersion: v1 497 kind: ConfigMap 498 metadata: 499 name: grafana-volcano-dashboard 500 namespace: volcano-monitoring 501 data: 502 volcano-globcal-overview-dashboard.json: |- 503 {"annotations":{"list":[{"builtIn":1,"datasource":"prometheus","enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & Alerts","type":"dashboard"}]},"editable":true,"gnetId":null,"graphTooltip":0,"id":2,"links":[],"panels":[{"datasource":null,"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":0,"y":0},"id":20,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"count(max_over_time(kube_pod_container_status_running{job=\"kube-state-metrics\"}[1h]) != 0)","interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"TPH –Schedule Task In 1 Hour","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":3,"y":0},"id":21,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_node_info{job=\"kube-state-metrics\"})","interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Node","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":6,"y":0},"id":23,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"kube_node_status_capacity{resource=\"nvidia_com_gpu\",job=\"kube-state-metrics\"}","interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano GPU","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"bytes"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":9,"y":0},"id":24,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_node_status_capacity{job=\"kube-state-metrics\", resource=\"memory\"})","interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Memory","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":12,"y":0},"id":22,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_node_status_capacity{job=\"kube-state-metrics\", resource=\"cpu\"})","interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano CPU","type":"stat"},{"cards":{"cardPadding":null,"cardRound":null},"color":{"cardColor":"#b4ff00","colorScale":"sqrt","colorScheme":"interpolateOranges","exponent":0.5,"mode":"spectrum"},"dataFormat":"timeseries","datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":50}]},"unit":"none"},"overrides":[]},"gridPos":{"h":8,"w":16,"x":0,"y":5},"heatmap":{},"hideZeroBuckets":false,"highlightCards":true,"id":18,"legend":{"show":false},"pluginVersion":"7.3.4","reverseYBuckets":false,"targets":[{"expr":"increase(volcano_e2e_job_scheduling_latency_milliseconds_bucket[1h])","format":"heatmap","instant":false,"interval":"","legendFormat":"{{le}} ms","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Legency Heatmap","tooltip":{"show":true,"showHistogram":false},"transformations":[],"type":"heatmap","xAxis":{"show":true},"xBucketNumber":null,"xBucketSize":null,"yAxis":{"decimals":null,"format":"ms","logBase":2,"max":"500000","min":null,"show":true,"splitFactor":null},"yBucketBound":"auto","yBucketNumber":null,"yBucketSize":null},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":50}]},"unit":"ms"},"overrides":[]},"gridPos":{"h":7,"w":16,"x":0,"y":13},"id":26,"options":{"displayMode":"lcd","orientation":"horizontal","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"showUnfilled":true},"pluginVersion":"7.3.4","targets":[{"expr":"avg(volcano_e2e_job_scheduling_duration{}) by (queue)","interval":"","legendFormat":"{{queue}}","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Job Scheduling Avg Duration By Queue In 24H","type":"bargauge"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"ms"},"overrides":[{"matcher":{"id":"byName","options":"Value"},"properties":[{"id":"custom.displayMode","value":"lcd-gauge"},{"id":"unit","value":"ms"}]},{"matcher":{"id":"byName","options":"job_namespace"},"properties":[{"id":"custom.width","value":279}]}]},"gridPos":{"h":7,"w":16,"x":0,"y":20},"id":27,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Value"}]},"pluginVersion":"7.3.4","targets":[{"expr":"avg(volcano_e2e_job_scheduling_duration{}) by (job_namespace)","format":"table","instant":true,"interval":"","legendFormat":"Namespace: {{job_namespace}}","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Job Avg Scheduling Duration By Namespace In 24H","transformations":[{"id":"organize","options":{"excludeByName":{"Time":true},"indexByName":{},"renameByName":{}}}],"type":"table"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[{"matcher":{"id":"byName","options":"Value"},"properties":[{"id":"custom.displayMode","value":"lcd-gauge"},{"id":"unit","value":"bytes"}]}]},"gridPos":{"h":8,"w":16,"x":0,"y":27},"id":29,"options":{"showHeader":true},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_pod_volcano_container_resource_requests{resource=\"memory\", unit=\"byte\",job=\"kube-state-metrics\",queue!=\"\"}) by (queue)","format":"table","instant":true,"interval":"","legendFormat":"{{queue}}","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Resource Usage Sort By Queue In 24H","transformations":[{"id":"organize","options":{"excludeByName":{"Time":true},"indexByName":{},"renameByName":{}}}],"type":"table"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"bytes"},"overrides":[{"matcher":{"id":"byName","options":"Value"},"properties":[{"id":"custom.displayMode","value":"lcd-gauge"}]}]},"gridPos":{"h":8,"w":16,"x":0,"y":35},"id":30,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Value"}]},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_pod_volcano_container_resource_requests{resource=\"memory\", unit=\"byte\",job=\"kube-state-metrics\"}) by (volcano_namespace)","format":"table","instant":true,"interval":"","legendFormat":"Namespace : {{volcano_namespace}}","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Resource Usage Sort By Namespace In 24H","transformations":[{"id":"organize","options":{"excludeByName":{"Time":true},"indexByName":{},"renameByName":{}}}],"type":"table"},{"datasource":null,"fieldConfig":{"defaults":{"color":{"mode":"thresholds"},"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[{"matcher":{"id":"byName","options":"Value"},"properties":[{"id":"custom.width","value":651},{"id":"custom.displayMode","value":"lcd-gauge"},{"id":"unit","value":"ms"},{"id":"thresholds","value":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]}}]},{"matcher":{"id":"byName","options":"job_name"},"properties":[{"id":"custom.width","value":361}]},{"matcher":{"id":"byName","options":"Volcano Job"},"properties":[{"id":"custom.width","value":228}]}]},"gridPos":{"h":13,"w":16,"x":0,"y":43},"id":16,"options":{"frameIndex":1,"showHeader":true,"sortBy":[{"desc":true,"displayName":"Value"}]},"pluginVersion":"7.3.4","targets":[{"expr":"increase(volcano_e2e_job_scheduling_duration{}[24h]) != 0","format":"table","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Job Running Legency","transformations":[{"id":"organize","options":{"excludeByName":{"Time":true,"__name__":true,"instance":true,"job":true,"kubernetes_name":true,"kubernetes_namespace":true},"indexByName":{},"renameByName":{"Time":"","job_name":"Volcano Job"}}}],"type":"table"},{"collapsed":false,"datasource":null,"gridPos":{"h":1,"w":24,"x":0,"y":56},"id":13,"panels":[],"title":"Volcano Fairness","type":"row"},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":7,"w":16,"x":0,"y":57},"hiddenSeries":false,"id":14,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"links":[],"nullPointMode":"null","options":{"alertThreshold":true},"paceLength":10,"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"stddev(volcano_e2e_job_scheduling_duration)/avg(volcano_e2e_job_scheduling_duration)","format":"time_series","intervalFactor":1,"legendFormat":"CV (Job Duration)","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Job Duration Coefficient Of Variation","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"transparent":true,"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percentunit","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}},{"collapsed":false,"datasource":null,"gridPos":{"h":1,"w":24,"x":0,"y":64},"id":11,"panels":[],"title":"Volcano Effectiveness","type":"row"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["#299c46","rgba(237, 129, 40, 0.89)","#d44a3a"],"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"format":"percentunit","gauge":{"maxValue":1,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"gridPos":{"h":8,"w":5,"x":0,"y":65},"id":2,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum(\n(sum(kube_pod_container_resource_requests{resource=\"cpu\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{}) \nby (pod,namespace)))/\nsum(kube_node_status_allocatable{resource=\"cpu\", unit=\"core\"})","format":"time_series","instant":false,"interval":"","intervalFactor":1,"legendFormat":"","refId":"A"}],"thresholds":"0.7,0.9","timeFrom":null,"timeShift":null,"title":"Volcano Cluster Average CPU Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["#299c46","rgba(237, 129, 40, 0.89)","#d44a3a"],"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"format":"percentunit","gauge":{"maxValue":1,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"gridPos":{"h":8,"w":5,"x":5,"y":65},"id":3,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum((sum(kube_pod_container_resource_requests{resource=\"memory\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{}) by (pod,namespace)))/sum(kube_node_status_allocatable{resource=\"memory\", unit=\"byte\"})","format":"time_series","instant":false,"interval":"","intervalFactor":1,"legendFormat":"","refId":"A"}],"thresholds":"0.7,0.9","timeFrom":null,"timeShift":null,"title":"Volcano Cluster Average Memory Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["#299c46","rgba(237, 129, 40, 0.89)","#d44a3a"],"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"format":"percentunit","gauge":{"maxValue":1,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"gridPos":{"h":8,"w":5,"x":10,"y":65},"id":4,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum((sum(kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{}) by (pod,namespace)))/sum(kube_node_status_capacity{resource=\"nvidia_com_gpu\"})","format":"time_series","instant":false,"interval":"","intervalFactor":1,"legendFormat":"","refId":"A"}],"thresholds":"0.7,0.9","timeFrom":null,"timeShift":null,"title":"Volcano Cluster Average GPU Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":7,"w":16,"x":0,"y":73},"hiddenSeries":false,"id":6,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"links":[],"nullPointMode":"null","options":{"alertThreshold":true},"paceLength":10,"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"stddev(sum by (node) (kube_pod_container_resource_requests{resource=\"cpu\"}))/avg(sum by (node) (kube_pod_container_resource_requests{resource=\"cpu\"}))","format":"time_series","intervalFactor":1,"legendFormat":"CV (CPU)","refId":"A"},{"expr":"stddev(sum by (node) (kube_pod_container_resource_requests{resource=\"memory\"}))/avg(sum by (node) (kube_pod_container_resource_requests{resource=\"memory\"}))","format":"time_series","intervalFactor":1,"legendFormat":"CV (Memory)","refId":"B"},{"expr":"stddev(sum by (node) (kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}))/avg(sum by (node) (kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}))","format":"time_series","intervalFactor":1,"legendFormat":"CV (Nvidia GPU)","refId":"C"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Node Resource Coefficient Of Variation","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"transparent":true,"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percentunit","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}}],"refresh":false,"schemaVersion":26,"style":"dark","tags":[],"templating":{"list":[]},"time":{"from":"now-12h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"","title":"Volcano Global Overview Dashboard","uid":"nYn30KvMzf","version":19} 504 volcano-queue-overview-dashboard.json: |- 505 {"annotations":{"list":[{"builtIn":1,"datasource":"prometheus","enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & Alerts","type":"dashboard"}]},"editable":true,"gnetId":null,"graphTooltip":0,"id":4,"iteration":1607928216980,"links":[],"panels":[{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":0,"y":0},"id":6,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}==1)","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running Job","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":3,"y":0},"id":16,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"count(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}==0)","instant":false,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Finished Job","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":6,"y":0},"id":17,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"count((max_over_time(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}[10m]) != 0) and kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"} == 0)","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Last 10m Finished Job","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"short"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":9,"y":0},"id":7,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\",queue=\"$queue\"}) by (pod,namespace)) * on(pod) (max(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}) \nby (pod,namespace))) ","instant":true,"interval":"","legendFormat":"volcano_job","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running CPU","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"short"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":12,"y":0},"id":8,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"gpu\",job=\"kube-state-metrics\",queue=\"$queue\"}) by (pod,namespace)) * on(pod) (max(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}) \nby (pod,namespace))) ","instant":true,"interval":"","legendFormat":"volcano_job","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running GPU","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"bytes"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":15,"y":0},"id":2,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\",queue=\"$queue\"}) by (pod,namespace)) * on(pod) (max(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}) \nby (pod,namespace))) ","instant":true,"interval":"","legendFormat":"volcano_job","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running Memory","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[{"matcher":{"id":"byName","options":"Time"},"properties":[{"id":"custom.width","value":195}]},{"matcher":{"id":"byName","options":"__name__"},"properties":[{"id":"custom.width","value":267}]},{"matcher":{"id":"byName","options":"Value"},"properties":[{"id":"custom.displayMode","value":"lcd-gauge"},{"id":"unit","value":"ms"}]}]},"gridPos":{"h":24,"w":12,"x":0,"y":5},"id":14,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Value"}]},"pluginVersion":"7.3.4","targets":[{"expr":"increase(volcano_e2e_job_scheduling_duration{queue=\"$queue\"}[24h]) != 0 ","format":"table","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Queue Running Job Legency","transformations":[{"id":"organize","options":{"excludeByName":{"Time":true,"__name__":true,"instance":true,"job":true,"kubernetes_name":true,"kubernetes_namespace":true},"indexByName":{},"renameByName":{}}}],"type":"table"},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":12,"y":5},"hiddenSeries":false,"id":12,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\",queue=\"$queue\"}) by (pod,namespace)) * on(pod) (max(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}) \nby (pod,namespace))) ","interval":"","legendFormat":"CPU Cores","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Queue Running CPU","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{},"unit":"bytes"},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":12,"y":13},"hiddenSeries":false,"id":10,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\",queue=\"$queue\"}) by (pod,namespace)) * on(pod) (max(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}) \nby (pod,namespace))) ","interval":"","legendFormat":"","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Queue Running Memory ","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":12,"y":21},"hiddenSeries":false,"id":11,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"gpu\",job=\"kube-state-metrics\",volcano_namespace=\"$namespace\"}) by (pod,namespace)) * on(pod) (max(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}) \nby (pod,namespace))) ","interval":"","legendFormat":"GPU Cards","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Queue Running GPU","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}}],"schemaVersion":26,"style":"dark","tags":[],"templating":{"list":[{"allValue":null,"current":{"selected":false,"text":"default","value":"default"},"datasource":"prometheus","definition":"label_values(volcano_queue_share,queue_name)","error":null,"hide":0,"includeAll":false,"label":null,"multi":false,"name":"queue","options":[],"query":"label_values(volcano_queue_share,queue_name)","refresh":1,"regex":"","skipUrlSync":false,"sort":0,"tagValuesQuery":"","tags":[],"tagsQuery":"","type":"query","useTags":false}]},"time":{"from":"now-6h","to":"now"},"timepicker":{},"timezone":"","title":"Volcano Queue View","uid":"sAtQfo1Mk","version":8} 506 volcano-namespace-overview-dashboard.json: |- 507 {"annotations":{"list":[{"builtIn":1,"datasource":"prometheus","enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & Alerts","type":"dashboard"}]},"editable":true,"gnetId":null,"graphTooltip":0,"id":3,"iteration":1607928231899,"links":[],"panels":[{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":0,"y":0},"id":6,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_pod_container_status_running{job=\"kube-state-metrics\",namespace=\"$namespace\"}==1)","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running Job","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":3,"y":0},"id":16,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"count(kube_pod_container_status_running{job=\"kube-state-metrics\",namespace=\"$namespace\"}==0)","instant":false,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Finished Job Total","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":6,"y":0},"id":17,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"count((max_over_time(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",namespace=\"$namespace\"}[10m]) != 0) and kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",namespace=\"$namespace\"} == 0)","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Last 10m Finished Job","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"short"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":9,"y":0},"id":7,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\",volcano_namespace=\"$namespace\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{job=\"kube-state-metrics\"}) \nby (pod,namespace))) ","instant":true,"interval":"","legendFormat":"volcano_job","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running CPU","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"short"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":12,"y":0},"id":8,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"gpu\",job=\"kube-state-metrics\",volcano_namespace=\"$namespace\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{job=\"kube-state-metrics\"}) \nby (pod,namespace))) ","instant":true,"interval":"","legendFormat":"volcano_job","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running GPU","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"bytes"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":15,"y":0},"id":2,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\",volcano_namespace=\"$namespace\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{job=\"kube-state-metrics\"}) \nby (pod,namespace))) ","instant":true,"interval":"","legendFormat":"volcano_job","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running Memory","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[{"matcher":{"id":"byName","options":"Time"},"properties":[{"id":"custom.width","value":195}]},{"matcher":{"id":"byName","options":"__name__"},"properties":[{"id":"custom.width","value":267}]},{"matcher":{"id":"byName","options":"Value"},"properties":[{"id":"custom.displayMode","value":"lcd-gauge"},{"id":"unit","value":"ms"}]}]},"gridPos":{"h":24,"w":12,"x":0,"y":5},"id":14,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Value"}]},"pluginVersion":"7.3.4","targets":[{"expr":"increase(volcano_e2e_job_scheduling_duration{job_namespace=\"$namespace\"}[24h]) != 0 ","format":"table","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Namespace Running Job Legency In 24H","transformations":[{"id":"organize","options":{"excludeByName":{"Time":true,"__name__":true,"instance":true,"job":true,"kubernetes_name":true,"kubernetes_namespace":true},"indexByName":{},"renameByName":{}}}],"type":"table"},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":12,"y":5},"hiddenSeries":false,"id":12,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\",volcano_namespace=\"$namespace\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{job=\"kube-state-metrics\"}) \nby (pod,namespace))) ","interval":"","legendFormat":"CPU Cores","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Namespace Running CPU","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{},"unit":"bytes"},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":12,"y":13},"hiddenSeries":false,"id":10,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\",volcano_namespace=\"$namespace\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{job=\"kube-state-metrics\"}) \nby (pod,namespace))) ","interval":"","legendFormat":"Memory Bytes","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Namespace Running Memory ","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":12,"y":21},"hiddenSeries":false,"id":11,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"gpu\",job=\"kube-state-metrics\",volcano_namespace=\"$namespace\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{job=\"kube-state-metrics\"}) \nby (pod,namespace))) ","interval":"","legendFormat":"GPU Cards","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Namespace Running GPU","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}}],"schemaVersion":26,"style":"dark","tags":[],"templating":{"list":[{"allValue":null,"current":{"selected":false,"text":"yu7gvcjd","value":"yu7gvcjd"},"datasource":"prometheus","definition":"label_values(kube_namespace_labels, namespace)","error":null,"hide":0,"includeAll":false,"label":null,"multi":false,"name":"namespace","options":[],"query":"label_values(kube_namespace_labels, namespace)","refresh":1,"regex":"","skipUrlSync":false,"sort":0,"tagValuesQuery":"","tags":[],"tagsQuery":"","type":"query","useTags":false}]},"time":{"from":"now-6h","to":"now"},"timepicker":{},"timezone":"","title":"Volcano Namespace View","uid":"TWuLSpJMk","version":14} 508 --- 509 # Source: volcano/templates/grafana.yaml 510 apiVersion: v1 511 kind: Service 512 metadata: 513 name: grafana 514 namespace: volcano-monitoring 515 annotations: 516 prometheus.io/scrape: 'true' 517 prometheus.io/port: '3000' 518 spec: 519 selector: 520 app: grafana 521 type: NodePort 522 ports: 523 - port: 3000 524 targetPort: 3000 525 nodePort: 30004 526 --- 527 # Source: volcano/templates/grafana.yaml 528 apiVersion: apps/v1 529 kind: Deployment 530 metadata: 531 name: grafana 532 namespace: volcano-monitoring 533 spec: 534 replicas: 1 535 selector: 536 matchLabels: 537 app: grafana 538 template: 539 metadata: 540 name: grafana 541 labels: 542 app: grafana 543 spec: 544 containers: 545 - name: grafana 546 image: grafana/grafana:latest 547 readinessProbe: 548 httpGet: 549 path: /api/health 550 port: 3000 551 livenessProbe: 552 httpGet: 553 path: /api/health 554 port: 3000 555 initialDelaySeconds: 10 556 periodSeconds: 10 557 ports: 558 - name: grafana 559 containerPort: 3000 560 resources: 561 limits: 562 memory: "2Gi" 563 cpu: "1000m" 564 requests: 565 memory: "1Gi" 566 cpu: "500m" 567 volumeMounts: 568 - mountPath: /var/lib/grafana 569 name: grafana-storage 570 - mountPath: /etc/grafana/provisioning/datasources 571 name: grafana-datasources 572 readOnly: false 573 - mountPath: /var/lib/grafana/dashboards 574 name: grafana-volcano-dashboard 575 readOnly: false 576 - mountPath: /etc/grafana/provisioning/dashboards 577 name: grafana-volcano-dashboard-config 578 readOnly: true 579 volumes: 580 - name: grafana-storage 581 emptyDir: {} 582 - name: grafana-volcano-dashboard 583 configMap: 584 defaultMode: 420 585 name: grafana-volcano-dashboard 586 - name: grafana-datasources 587 configMap: 588 defaultMode: 420 589 name: grafana-datasources 590 - name: grafana-volcano-dashboard-config 591 configMap: 592 defaultMode: 420 593 name: grafana-volcano-dashboard-config