volcano.sh/volcano@v1.9.0/installer/volcano-monitoring-v1.9.0.yaml (about)

     1  ---
     2  # Source: volcano/templates/prometheus.yaml
     3  apiVersion: v1
     4  kind: ConfigMap  # Prometheus server config; prometheus-deployment mounts it at /etc/prometheus/
     5  metadata:
     6    name: prometheus-server-conf
     7    labels:
     8      name: prometheus-server-conf
     9    namespace: volcano-monitoring
    10  data:  # NOTE(review): prometheus.yml targets alertmanager.monitoring.svc, not the volcano-monitoring namespace - confirm
    11    prometheus.rules: |-  # demo alert: fires once summed container memory stays above 1 byte for 1m
    12      groups:
    13      - name: devopscube demo alert
    14        rules:
    15        - alert: High Pod Memory
    16          expr: sum(container_memory_usage_bytes) > 1
    17          for: 1m
    18          labels:
    19            severity: slack
    20          annotations:
    21            summary: High Memory Usage
    22    prometheus.yml: |-  # main config; the "true" keep-regexes are quoted so they always stay strings
    23      global:
    24        scrape_interval: 5s
    25        evaluation_interval: 5s
    26      rule_files:
    27        - /etc/prometheus/prometheus.rules
    28      alerting:
    29        alertmanagers:
    30        - scheme: http
    31          static_configs:
    32          - targets:
    33            - "alertmanager.monitoring.svc:9093"
    34
    35      scrape_configs:
    36        - job_name: 'kubernetes-apiservers'
    37
    38          kubernetes_sd_configs:
    39          - role: endpoints
    40          scheme: https
    41
    42          tls_config:
    43            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    44          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    45
    46          relabel_configs:
    47          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
    48            action: keep
    49            regex: default;kubernetes;https
    50
    51        - job_name: 'kubernetes-nodes'
    52
    53          scheme: https
    54
    55          tls_config:
    56            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    57          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    58
    59          kubernetes_sd_configs:
    60          - role: node
    61
    62          relabel_configs:
    63          - action: labelmap
    64            regex: __meta_kubernetes_node_label_(.+)
    65          - target_label: __address__
    66            replacement: kubernetes.default.svc:443
    67          - source_labels: [__meta_kubernetes_node_name]
    68            regex: (.+)
    69            target_label: __metrics_path__
    70            replacement: /api/v1/nodes/${1}/proxy/metrics
    71
    72
    73        - job_name: 'kubernetes-pods'
    74
    75          kubernetes_sd_configs:
    76          - role: pod
    77
    78          relabel_configs:
    79          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
    80            action: keep
    81            regex: "true"
    82          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
    83            action: replace
    84            target_label: __metrics_path__
    85            regex: (.+)
    86          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
    87            action: replace
    88            regex: ([^:]+)(?::\d+)?;(\d+)
    89            replacement: $1:$2
    90            target_label: __address__
    91          - action: labelmap
    92            regex: __meta_kubernetes_pod_label_(.+)
    93          - source_labels: [__meta_kubernetes_namespace]
    94            action: replace
    95            target_label: kubernetes_namespace
    96          - source_labels: [__meta_kubernetes_pod_name]
    97            action: replace
    98            target_label: kubernetes_pod_name
    99
   100        - job_name: 'kube-state-metrics'
   101          static_configs:
   102            - targets: ['kube-state-metrics.volcano-monitoring.svc.cluster.local:8080']
   103
   104        - job_name: 'kubernetes-cadvisor'
   105
   106          scheme: https
   107
   108          tls_config:
   109            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
   110          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
   111
   112          kubernetes_sd_configs:
   113          - role: node
   114
   115          relabel_configs:
   116          - action: labelmap
   117            regex: __meta_kubernetes_node_label_(.+)
   118          - target_label: __address__
   119            replacement: kubernetes.default.svc:443
   120          - source_labels: [__meta_kubernetes_node_name]
   121            regex: (.+)
   122            target_label: __metrics_path__
   123            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
   124
   125        - job_name: 'kubernetes-service-endpoints'
   126
   127          kubernetes_sd_configs:
   128          - role: endpoints
   129
   130          relabel_configs:
   131          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
   132            action: keep
   133            regex: "true"
   134          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
   135            action: replace
   136            target_label: __scheme__
   137            regex: (https?)
   138          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
   139            action: replace
   140            target_label: __metrics_path__
   141            regex: (.+)
   142          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
   143            action: replace
   144            target_label: __address__
   145            regex: ([^:]+)(?::\d+)?;(\d+)
   146            replacement: $1:$2
   147          - action: labelmap
   148            regex: __meta_kubernetes_service_label_(.+)
   149          - source_labels: [__meta_kubernetes_namespace]
   150            action: replace
   151            target_label: kubernetes_namespace
   152          - source_labels: [__meta_kubernetes_service_name]
   153            action: replace
   154            target_label: kubernetes_name
   155  ---
   156  # Source: volcano/templates/prometheus.yaml
   157  apiVersion: rbac.authorization.k8s.io/v1
   158  kind: ClusterRole  # read-only access Prometheus needs for service discovery and scraping
   159  metadata:
   160    name: prometheus-volcano
   161  rules:
   162  - apiGroups: [""]  # core API group
   163    resources:
   164    - nodes
   165    - nodes/proxy  # node and cadvisor jobs scrape through /api/v1/nodes/<node>/proxy/...
   166    - services
   167    - endpoints
   168    - pods
   169    verbs: ["get", "list", "watch"]
   170  - apiGroups:
   171    - extensions  # kept for clusters older than 1.22
   172    - networking.k8s.io  # Ingress is served from this group; extensions/v1beta1 was removed in 1.22
   173    resources: ["ingresses"]
   174    verbs: ["get", "list", "watch"]
   175  - nonResourceURLs: ["/metrics"]
   176    verbs: ["get"]
   177  ---
   178  # Source: volcano/templates/prometheus.yaml
   179  apiVersion: rbac.authorization.k8s.io/v1
   180  kind: ClusterRoleBinding  # grants the prometheus-volcano ClusterRole cluster-wide
   181  metadata:
   182    name: prometheus-volcano
   183  subjects:
   184  - kind: ServiceAccount
   185    namespace: volcano-monitoring
   186    name: default  # prometheus-deployment sets no serviceAccountName, so its pod runs as this account
   187  roleRef:
   188    kind: ClusterRole
   189    name: prometheus-volcano
   190    apiGroup: rbac.authorization.k8s.io
   191  ---
   192  # Source: volcano/templates/prometheus.yaml
   193  apiVersion: v1
   194  kind: Service  # exposes the Prometheus server inside the cluster and on a fixed node port
   195  metadata:
   196    namespace: volcano-monitoring
   197    name: prometheus-service
   198    annotations:  # read by the 'kubernetes-service-endpoints' scrape job
   199      prometheus.io/scrape: "true"
   200      prometheus.io/port: "9090"
   201
   202  spec:
   203    type: NodePort
   204    selector:
   205      app: prometheus-server  # pod label set by prometheus-deployment
   206    ports:
   207      - nodePort: 30003  # port opened on every node
   208        port: 8080  # in-cluster service port (the Grafana datasource URL uses it)
   209        targetPort: 9090  # containerPort of the prometheus container
   210  ---
   211  # Source: volcano/templates/prometheus.yaml
   212  apiVersion: apps/v1
   213  kind: Deployment  # single-replica Prometheus server
   214  metadata:
   215    namespace: volcano-monitoring
   216    name: prometheus-deployment
   217    labels:
   218      app: prometheus-server
   219  spec:
   220    replicas: 1
   221    selector:
   222      matchLabels:
   223        app: prometheus-server
   224    template:
   225      metadata:
   226        labels:
   227          app: prometheus-server  # must match spec.selector and the prometheus-service selector
   228      spec:
   229        volumes:
   230          - name: prometheus-config-volume
   231            configMap:
   232              name: prometheus-server-conf  # supplies prometheus.yml and prometheus.rules
   233              defaultMode: 420  # decimal form of octal 0644
   234
   235          - name: prometheus-storage-volume
   236            emptyDir: {}  # TSDB data lives only as long as the pod
   237        containers:
   238          - name: prometheus
   239            image: prom/prometheus  # NOTE(review): no tag, resolves to :latest - consider pinning
   240            ports:
   241              - containerPort: 9090
   242            args:
   243              - "--config.file=/etc/prometheus/prometheus.yml"
   244              - "--storage.tsdb.path=/prometheus/"
   245            volumeMounts:
   246              - mountPath: /etc/prometheus/
   247                name: prometheus-config-volume
   248              - mountPath: /prometheus/
   249                name: prometheus-storage-volume
   250  ---
   251  # Source: volcano/templates/kubestatemetrics.yaml
   252  apiVersion: v1
   253  kind: ServiceAccount  # identity the kube-state-metrics Deployment runs under
   254  metadata:
   255    name: kube-state-metrics
   256    namespace: volcano-monitoring
   257    labels:
   258      app.kubernetes.io/name: kube-state-metrics
   259  ---
   260  # Source: volcano/templates/kubestatemetrics.yaml
   261  apiVersion: rbac.authorization.k8s.io/v1
   262  kind: ClusterRole  # list/watch access to the object kinds kube-state-metrics reads
   263  metadata:
   264    name: kube-state-metrics
   265    labels:
   266      app.kubernetes.io/name: kube-state-metrics
   267  rules:
   268  - apiGroups:
   269    - ""  # core API group
   270    resources:
   271    - configmaps
   272    - endpoints
   273    - limitranges
   274    - namespaces
   275    - nodes
   276    - persistentvolumeclaims
   277    - persistentvolumes
   278    - pods
   279    - replicationcontrollers
   280    - resourcequotas
   281    - secrets
   282    - services
   283    verbs:
   284    - list
   285    - watch
   286  - apiGroups:
   287    - extensions  # legacy group for workloads and ingresses
   288    resources:
   289    - daemonsets
   290    - deployments
   291    - ingresses
   292    - replicasets
   293    verbs:
   294    - list
   295    - watch
   296  - apiGroups:
   297    - apps
   298    resources:
   299    - daemonsets
   300    - deployments
   301    - replicasets
   302    - statefulsets
   303    verbs:
   304    - list
   305    - watch
   306  - apiGroups:
   307    - batch
   308    resources:
   309    - cronjobs
   310    - jobs
   311    verbs:
   312    - list
   313    - watch
   314  - apiGroups:
   315    - autoscaling
   316    resources:
   317    - horizontalpodautoscalers
   318    verbs:
   319    - list
   320    - watch
   321  - apiGroups:
   322    - authentication.k8s.io
   323    resources:
   324    - tokenreviews
   325    verbs:
   326    - create  # create only; nothing is listed or watched in this group
   327  - apiGroups:
   328    - authorization.k8s.io
   329    resources:
   330    - subjectaccessreviews
   331    verbs:
   332    - create  # create only; nothing is listed or watched in this group
   333  - apiGroups:
   334    - policy
   335    resources:
   336    - poddisruptionbudgets
   337    verbs:
   338    - list
   339    - watch
   340  - apiGroups:
   341    - certificates.k8s.io
   342    resources:
   343    - certificatesigningrequests
   344    verbs:
   345    - list
   346    - watch
   347  - apiGroups:
   348    - storage.k8s.io
   349    resources:
   350    - storageclasses
   351    - volumeattachments
   352    verbs:
   353    - list
   354    - watch
   355  - apiGroups:
   356    - admissionregistration.k8s.io
   357    resources:
   358    - mutatingwebhookconfigurations
   359    - validatingwebhookconfigurations
   360    verbs:
   361    - list
   362    - watch
   363  - apiGroups:
   364    - networking.k8s.io
   365    resources:
   366    - networkpolicies
   367    verbs:
   368    - list
   369    - watch
   370  ---
   371  # Source: volcano/templates/kubestatemetrics.yaml
   372  apiVersion: rbac.authorization.k8s.io/v1
   373  kind: ClusterRoleBinding  # binds the kube-state-metrics ClusterRole to its ServiceAccount
   374  metadata:
   375    name: kube-state-metrics
   376    labels:
   377      app.kubernetes.io/name: kube-state-metrics
   378  subjects:
   379  - kind: ServiceAccount
   380    namespace: volcano-monitoring
   381    name: kube-state-metrics
   382  roleRef:
   383    kind: ClusterRole
   384    name: kube-state-metrics
   385    apiGroup: rbac.authorization.k8s.io
   386  ---
   387  # Source: volcano/templates/kubestatemetrics.yaml
   388  apiVersion: v1
   389  kind: Service  # in-cluster endpoint for kube-state-metrics; the static 'kube-state-metrics' scrape job targets port 8080
   390  metadata:
   391    labels:
   392      app.kubernetes.io/name: kube-state-metrics
   393    name: kube-state-metrics
   394    namespace: volcano-monitoring
   395    annotations:  # also matched by the annotation-driven 'kubernetes-service-endpoints' job
   396      prometheus.io/path: /metrics
   397      prometheus.io/port: "8080"
   398      prometheus.io/scrape: "true"
   399  spec:
   400    ports:
   401    - name: http-metrics
   402      port: 8080
   403      targetPort: http-metrics  # named containerPort (8080) declared by the Deployment
   404    - name: telemetry
   405      port: 8081
   406      targetPort: 8081  # numeric: the Deployment declares no port named "telemetry"; assumes the image serves telemetry on 8081 - confirm
   407    selector:
   408      k8s-app: kube-state-metrics
   409  ---
   410  # Source: volcano/templates/kubestatemetrics.yaml
   411  apiVersion: apps/v1
   412  kind: Deployment  # single-replica kube-state-metrics exporter
   413  metadata:
   414    namespace: volcano-monitoring
   415    name: kube-state-metrics
   416    labels:
   417      k8s-app: kube-state-metrics
   418  spec:
   419    replicas: 1
   420    progressDeadlineSeconds: 600
   421    strategy:
   422      type: RollingUpdate
   423      rollingUpdate:
   424        maxSurge: 25%
   425        maxUnavailable: 25%
   426    selector:
   427      matchLabels:
   428        k8s-app: kube-state-metrics
   429    template:
   430      metadata:
   431        labels:
   432          k8s-app: kube-state-metrics  # must match spec.selector and the Service selector
   433      spec:
   434        serviceAccountName: kube-state-metrics  # account bound to the kube-state-metrics ClusterRole
   435        dnsPolicy: ClusterFirst
   436
   437        nodeSelector:  # NOTE(review): pod stays Pending unless a node carries this exact label value - confirm
   438          node.kubernetes.io/instance-type: controlpanel
   439
   440        containers:
   441          - name: kube-state-metrics
   442            image: quay.io/coreos/kube-state-metrics:v1.9.7
   443            imagePullPolicy: Always
   444            ports:
   445              - containerPort: 8080
   446                name: http-metrics  # referenced by name from the Service's targetPort
   447            readinessProbe:
   448              initialDelaySeconds: 5
   449              timeoutSeconds: 5
   450              httpGet:
   451                port: 8080
   452                path: /healthz
   453  ---
   454  # Source: volcano/templates/grafana.yaml
   455  apiVersion: v1
   456  kind: ConfigMap  # Grafana datasource definition, stored as JSON text
   457  metadata:
   458    name: grafana-datasources
   459    namespace: volcano-monitoring
   460  data:  # one default datasource "prometheus", reached via prometheus-service on its service port 8080
   461    prometheus.yaml: |-
   462      {
   463          "apiVersion": 1,
   464          "datasources": [
   465              {
   466                 "access":"proxy",
   467                 "editable": true,
   468                 "isDefault": true,
   469                 "name": "prometheus",
   470                 "orgId": 1,
   471                 "type": "prometheus",
   472                 "url": "http://prometheus-service.volcano-monitoring.svc:8080",
   473                 "version": 1
   474              }
   475          ]
   476      }
   477  ---
   478  # Source: volcano/templates/grafana.yaml
   479  apiVersion: v1
   480  kind: ConfigMap  # Grafana dashboard provider definition
   481  metadata:
   482    name: grafana-volcano-dashboard-config
   483    namespace: volcano-monitoring
   484  data:  # file provider: reads dashboards from /var/lib/grafana/dashboards with a 30s update interval
   485    dashboard.yaml: |-
   486      apiVersion: 1
   487      providers:
   488      - name: dashboards
   489        type: file
   490        updateIntervalSeconds: 30
   491        options:
   492          path: /var/lib/grafana/dashboards 
   493          foldersFromFilesStructure: true
   494  ---
   495  # Source: volcano/templates/grafana.yaml
   496  apiVersion: v1
   497  kind: ConfigMap
   498  metadata:
   499    name: grafana-volcano-dashboard
   500    namespace: volcano-monitoring
   501  data:
   502    volcano-globcal-overview-dashboard.json: |-
   503      {"annotations":{"list":[{"builtIn":1,"datasource":"prometheus","enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & Alerts","type":"dashboard"}]},"editable":true,"gnetId":null,"graphTooltip":0,"id":2,"links":[],"panels":[{"datasource":null,"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":0,"y":0},"id":20,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"count(max_over_time(kube_pod_container_status_running{job=\"kube-state-metrics\"}[1h]) != 0)","interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"TPH –Schedule Task In 1 Hour","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":3,"y":0},"id":21,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_node_info{job=\"kube-state-metrics\"})","interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano 
Node","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":6,"y":0},"id":23,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"kube_node_status_capacity{resource=\"nvidia_com_gpu\",job=\"kube-state-metrics\"}","interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano GPU","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"bytes"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":9,"y":0},"id":24,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_node_status_capacity{job=\"kube-state-metrics\", resource=\"memory\"})","interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Memory","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":12,"y":0},"id":22,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_node_status_capacity{job=\"kube-state-metrics\", resource=\"cpu\"})","interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano 
CPU","type":"stat"},{"cards":{"cardPadding":null,"cardRound":null},"color":{"cardColor":"#b4ff00","colorScale":"sqrt","colorScheme":"interpolateOranges","exponent":0.5,"mode":"spectrum"},"dataFormat":"timeseries","datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":50}]},"unit":"none"},"overrides":[]},"gridPos":{"h":8,"w":16,"x":0,"y":5},"heatmap":{},"hideZeroBuckets":false,"highlightCards":true,"id":18,"legend":{"show":false},"pluginVersion":"7.3.4","reverseYBuckets":false,"targets":[{"expr":"increase(volcano_e2e_job_scheduling_latency_milliseconds_bucket[1h])","format":"heatmap","instant":false,"interval":"","legendFormat":"{{le}} ms","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Legency Heatmap","tooltip":{"show":true,"showHistogram":false},"transformations":[],"type":"heatmap","xAxis":{"show":true},"xBucketNumber":null,"xBucketSize":null,"yAxis":{"decimals":null,"format":"ms","logBase":2,"max":"500000","min":null,"show":true,"splitFactor":null},"yBucketBound":"auto","yBucketNumber":null,"yBucketSize":null},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":50}]},"unit":"ms"},"overrides":[]},"gridPos":{"h":7,"w":16,"x":0,"y":13},"id":26,"options":{"displayMode":"lcd","orientation":"horizontal","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"showUnfilled":true},"pluginVersion":"7.3.4","targets":[{"expr":"avg(volcano_e2e_job_scheduling_duration{}) by (queue)","interval":"","legendFormat":"{{queue}}","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Job Scheduling Avg Duration By Queue In 
24H","type":"bargauge"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"ms"},"overrides":[{"matcher":{"id":"byName","options":"Value"},"properties":[{"id":"custom.displayMode","value":"lcd-gauge"},{"id":"unit","value":"ms"}]},{"matcher":{"id":"byName","options":"job_namespace"},"properties":[{"id":"custom.width","value":279}]}]},"gridPos":{"h":7,"w":16,"x":0,"y":20},"id":27,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Value"}]},"pluginVersion":"7.3.4","targets":[{"expr":"avg(volcano_e2e_job_scheduling_duration{}) by (job_namespace)","format":"table","instant":true,"interval":"","legendFormat":"Namespace: {{job_namespace}}","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Job Avg Scheduling Duration By Namespace In 24H","transformations":[{"id":"organize","options":{"excludeByName":{"Time":true},"indexByName":{},"renameByName":{}}}],"type":"table"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[{"matcher":{"id":"byName","options":"Value"},"properties":[{"id":"custom.displayMode","value":"lcd-gauge"},{"id":"unit","value":"bytes"}]}]},"gridPos":{"h":8,"w":16,"x":0,"y":27},"id":29,"options":{"showHeader":true},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_pod_volcano_container_resource_requests{resource=\"memory\", unit=\"byte\",job=\"kube-state-metrics\",queue!=\"\"}) by (queue)","format":"table","instant":true,"interval":"","legendFormat":"{{queue}}","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Resource Usage Sort By Queue In 
24H","transformations":[{"id":"organize","options":{"excludeByName":{"Time":true},"indexByName":{},"renameByName":{}}}],"type":"table"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"bytes"},"overrides":[{"matcher":{"id":"byName","options":"Value"},"properties":[{"id":"custom.displayMode","value":"lcd-gauge"}]}]},"gridPos":{"h":8,"w":16,"x":0,"y":35},"id":30,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Value"}]},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_pod_volcano_container_resource_requests{resource=\"memory\", unit=\"byte\",job=\"kube-state-metrics\"}) by (volcano_namespace)","format":"table","instant":true,"interval":"","legendFormat":"Namespace : {{volcano_namespace}}","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Resource Usage Sort By Namespace In 24H","transformations":[{"id":"organize","options":{"excludeByName":{"Time":true},"indexByName":{},"renameByName":{}}}],"type":"table"},{"datasource":null,"fieldConfig":{"defaults":{"color":{"mode":"thresholds"},"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[{"matcher":{"id":"byName","options":"Value"},"properties":[{"id":"custom.width","value":651},{"id":"custom.displayMode","value":"lcd-gauge"},{"id":"unit","value":"ms"},{"id":"thresholds","value":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]}}]},{"matcher":{"id":"byName","options":"job_name"},"properties":[{"id":"custom.width","value":361}]},{"matcher":{"id":"byName","options":"Volcano 
Job"},"properties":[{"id":"custom.width","value":228}]}]},"gridPos":{"h":13,"w":16,"x":0,"y":43},"id":16,"options":{"frameIndex":1,"showHeader":true,"sortBy":[{"desc":true,"displayName":"Value"}]},"pluginVersion":"7.3.4","targets":[{"expr":"increase(volcano_e2e_job_scheduling_duration{}[24h]) != 0","format":"table","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Volcano Job Running Legency","transformations":[{"id":"organize","options":{"excludeByName":{"Time":true,"__name__":true,"instance":true,"job":true,"kubernetes_name":true,"kubernetes_namespace":true},"indexByName":{},"renameByName":{"Time":"","job_name":"Volcano Job"}}}],"type":"table"},{"collapsed":false,"datasource":null,"gridPos":{"h":1,"w":24,"x":0,"y":56},"id":13,"panels":[],"title":"Volcano Fairness","type":"row"},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":7,"w":16,"x":0,"y":57},"hiddenSeries":false,"id":14,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"links":[],"nullPointMode":"null","options":{"alertThreshold":true},"paceLength":10,"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"stddev(volcano_e2e_job_scheduling_duration)/avg(volcano_e2e_job_scheduling_duration)","format":"time_series","intervalFactor":1,"legendFormat":"CV (Job Duration)","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Job Duration Coefficient Of 
Variation","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"transparent":true,"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percentunit","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}},{"collapsed":false,"datasource":null,"gridPos":{"h":1,"w":24,"x":0,"y":64},"id":11,"panels":[],"title":"Volcano Effectiveness","type":"row"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["#299c46","rgba(237, 129, 40, 0.89)","#d44a3a"],"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"format":"percentunit","gauge":{"maxValue":1,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"gridPos":{"h":8,"w":5,"x":0,"y":65},"id":2,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum(\n(sum(kube_pod_container_resource_requests{resource=\"cpu\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{}) \nby (pod,namespace)))/\nsum(kube_node_status_allocatable{resource=\"cpu\", unit=\"core\"})","format":"time_series","instant":false,"interval":"","intervalFactor":1,"legendFormat":"","refId":"A"}],"thresholds":"0.7,0.9","timeFrom":null,"timeShift":null,"title":"Volcano Cluster Average CPU 
Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["#299c46","rgba(237, 129, 40, 0.89)","#d44a3a"],"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"format":"percentunit","gauge":{"maxValue":1,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"gridPos":{"h":8,"w":5,"x":5,"y":65},"id":3,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum((sum(kube_pod_container_resource_requests{resource=\"memory\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{}) by (pod,namespace)))/sum(kube_node_status_allocatable{resource=\"memory\", unit=\"byte\"})","format":"time_series","instant":false,"interval":"","intervalFactor":1,"legendFormat":"","refId":"A"}],"thresholds":"0.7,0.9","timeFrom":null,"timeShift":null,"title":"Volcano Cluster Average Memory Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["#299c46","rgba(237, 129, 40, 0.89)","#d44a3a"],"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"format":"percentunit","gauge":{"maxValue":1,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"gridPos":{"h":8,"w":5,"x":10,"y":65},"id":4,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to 
text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum((sum(kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{}) by (pod,namespace)))/sum(kube_node_status_capacity{resource=\"nvidia_com_gpu\"})","format":"time_series","instant":false,"interval":"","intervalFactor":1,"legendFormat":"","refId":"A"}],"thresholds":"0.7,0.9","timeFrom":null,"timeShift":null,"title":"Volcano Cluster Average GPU Usage","transparent":true,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":7,"w":16,"x":0,"y":73},"hiddenSeries":false,"id":6,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"links":[],"nullPointMode":"null","options":{"alertThreshold":true},"paceLength":10,"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"stddev(sum by (node) (kube_pod_container_resource_requests{resource=\"cpu\"}))/avg(sum by (node) (kube_pod_container_resource_requests{resource=\"cpu\"}))","format":"time_series","intervalFactor":1,"legendFormat":"CV (CPU)","refId":"A"},{"expr":"stddev(sum by (node) (kube_pod_container_resource_requests{resource=\"memory\"}))/avg(sum by (node) 
(kube_pod_container_resource_requests{resource=\"memory\"}))","format":"time_series","intervalFactor":1,"legendFormat":"CV (Memory)","refId":"B"},{"expr":"stddev(sum by (node) (kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}))/avg(sum by (node) (kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}))","format":"time_series","intervalFactor":1,"legendFormat":"CV (Nvidia GPU)","refId":"C"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Node Resource Coefficient Of Variation","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"transparent":true,"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percentunit","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}}],"refresh":false,"schemaVersion":26,"style":"dark","tags":[],"templating":{"list":[]},"time":{"from":"now-12h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"","title":"Volcano Global Overview Dashboard","uid":"nYn30KvMzf","version":19}
   504    volcano-queue-overview-dashboard.json: |-  # Grafana dashboard "Volcano Queue View" (uid sAtQfo1Mk); every panel query is scoped by the $queue template variable
   505      {"annotations":{"list":[{"builtIn":1,"datasource":"prometheus","enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & Alerts","type":"dashboard"}]},"editable":true,"gnetId":null,"graphTooltip":0,"id":4,"iteration":1607928216980,"links":[],"panels":[{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":0,"y":0},"id":6,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}==1)","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running Job","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":3,"y":0},"id":16,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"count(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}==0)","instant":false,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Finished 
Job","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":6,"y":0},"id":17,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"count((max_over_time(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}[10m]) != 0) and kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"} == 0)","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Last 10m Finished Job","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"short"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":9,"y":0},"id":7,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\",queue=\"$queue\"}) by (pod,namespace)) * on(pod) (max(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}) \nby (pod,namespace))) ","instant":true,"interval":"","legendFormat":"volcano_job","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running 
CPU","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"short"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":12,"y":0},"id":8,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"gpu\",job=\"kube-state-metrics\",queue=\"$queue\"}) by (pod,namespace)) * on(pod) (max(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}) \nby (pod,namespace))) ","instant":true,"interval":"","legendFormat":"volcano_job","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running GPU","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"bytes"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":15,"y":0},"id":2,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\",queue=\"$queue\"}) by (pod,namespace)) * on(pod) (max(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}) \nby (pod,namespace))) ","instant":true,"interval":"","legendFormat":"volcano_job","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running 
Memory","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"percentage","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[{"matcher":{"id":"byName","options":"Time"},"properties":[{"id":"custom.width","value":195}]},{"matcher":{"id":"byName","options":"__name__"},"properties":[{"id":"custom.width","value":267}]},{"matcher":{"id":"byName","options":"Value"},"properties":[{"id":"custom.displayMode","value":"lcd-gauge"},{"id":"unit","value":"ms"}]}]},"gridPos":{"h":24,"w":12,"x":0,"y":5},"id":14,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Value"}]},"pluginVersion":"7.3.4","targets":[{"expr":"increase(volcano_e2e_job_scheduling_duration{queue=\"$queue\"}[24h]) != 0  ","format":"table","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Queue Running Job Latency","transformations":[{"id":"organize","options":{"excludeByName":{"Time":true,"__name__":true,"instance":true,"job":true,"kubernetes_name":true,"kubernetes_namespace":true},"indexByName":{},"renameByName":{}}}],"type":"table"},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":12,"y":5},"hiddenSeries":false,"id":12,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\",queue=\"$queue\"}) by (pod,namespace)) * on(pod) 
(max(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}) \nby (pod,namespace))) ","interval":"","legendFormat":"CPU Cores","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Queue Running CPU","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{},"unit":"bytes"},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":12,"y":13},"hiddenSeries":false,"id":10,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\",queue=\"$queue\"}) by (pod,namespace)) * on(pod) (max(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}) \nby (pod,namespace))) ","interval":"","legendFormat":"","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Queue Running Memory 
","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":12,"y":21},"hiddenSeries":false,"id":11,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"gpu\",job=\"kube-state-metrics\",queue=\"$queue\"}) by (pod,namespace)) * on(pod) (max(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",queue=\"$queue\"}) \nby (pod,namespace))) ","interval":"","legendFormat":"GPU Cards","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Queue Running 
GPU","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}}],"schemaVersion":26,"style":"dark","tags":[],"templating":{"list":[{"allValue":null,"current":{"selected":false,"text":"default","value":"default"},"datasource":"prometheus","definition":"label_values(volcano_queue_share,queue_name)","error":null,"hide":0,"includeAll":false,"label":null,"multi":false,"name":"queue","options":[],"query":"label_values(volcano_queue_share,queue_name)","refresh":1,"regex":"","skipUrlSync":false,"sort":0,"tagValuesQuery":"","tags":[],"tagsQuery":"","type":"query","useTags":false}]},"time":{"from":"now-6h","to":"now"},"timepicker":{},"timezone":"","title":"Volcano Queue View","uid":"sAtQfo1Mk","version":8}
   506    volcano-namespace-overview-dashboard.json: |-  # Grafana per-namespace dashboard (uid TWuLSpJMk); every panel query is scoped by the $namespace template variable
   507      {"annotations":{"list":[{"builtIn":1,"datasource":"prometheus","enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & Alerts","type":"dashboard"}]},"editable":true,"gnetId":null,"graphTooltip":0,"id":3,"iteration":1607928231899,"links":[],"panels":[{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":0,"y":0},"id":6,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(kube_pod_container_status_running{job=\"kube-state-metrics\",namespace=\"$namespace\"}==1)","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running Job","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":3,"y":0},"id":16,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"count(kube_pod_container_status_running{job=\"kube-state-metrics\",namespace=\"$namespace\"}==0)","instant":false,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Finished Job 
Total","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":3,"x":6,"y":0},"id":17,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"count((max_over_time(kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",namespace=\"$namespace\"}[10m]) != 0) and kube_pod_volcano_container_status_running{job=\"kube-state-metrics\",namespace=\"$namespace\"} == 0)","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Last 10m Finished Job","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"short"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":9,"y":0},"id":7,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\",volcano_namespace=\"$namespace\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{job=\"kube-state-metrics\"}) \nby (pod,namespace))) ","instant":true,"interval":"","legendFormat":"volcano_job","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running 
CPU","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"short"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":12,"y":0},"id":8,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"gpu\",job=\"kube-state-metrics\",volcano_namespace=\"$namespace\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{job=\"kube-state-metrics\"}) \nby (pod,namespace))) ","instant":true,"interval":"","legendFormat":"volcano_job","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running GPU","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"bytes"},"overrides":[]},"gridPos":{"h":5,"w":3,"x":15,"y":0},"id":2,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"textMode":"auto"},"pluginVersion":"7.3.4","targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\",volcano_namespace=\"$namespace\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{job=\"kube-state-metrics\"}) \nby (pod,namespace))) ","instant":true,"interval":"","legendFormat":"volcano_job","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Running 
Memory","type":"stat"},{"datasource":null,"fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[{"matcher":{"id":"byName","options":"Time"},"properties":[{"id":"custom.width","value":195}]},{"matcher":{"id":"byName","options":"__name__"},"properties":[{"id":"custom.width","value":267}]},{"matcher":{"id":"byName","options":"Value"},"properties":[{"id":"custom.displayMode","value":"lcd-gauge"},{"id":"unit","value":"ms"}]}]},"gridPos":{"h":24,"w":12,"x":0,"y":5},"id":14,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Value"}]},"pluginVersion":"7.3.4","targets":[{"expr":"increase(volcano_e2e_job_scheduling_duration{job_namespace=\"$namespace\"}[24h]) != 0 ","format":"table","instant":true,"interval":"","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Namespace Running Job Latency In 24H","transformations":[{"id":"organize","options":{"excludeByName":{"Time":true,"__name__":true,"instance":true,"job":true,"kubernetes_name":true,"kubernetes_namespace":true},"indexByName":{},"renameByName":{}}}],"type":"table"},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":12,"y":5},"hiddenSeries":false,"id":12,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\",volcano_namespace=\"$namespace\"}) by (pod,namespace)) * on(pod) 
(max(kube_pod_container_status_running{job=\"kube-state-metrics\"}) \nby (pod,namespace))) ","interval":"","legendFormat":"CPU Cores","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Namespace Running CPU","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{},"unit":"bytes"},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":12,"y":13},"hiddenSeries":false,"id":10,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\",volcano_namespace=\"$namespace\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{job=\"kube-state-metrics\"}) \nby (pod,namespace))) ","interval":"","legendFormat":"Memory Bytes","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Namespace Running Memory 
","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":null,"fieldConfig":{"defaults":{"custom":{}},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":12,"y":21},"hiddenSeries":false,"id":11,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"7.3.4","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(\n(sum(kube_pod_volcano_container_resource_requests{resource=\"gpu\",job=\"kube-state-metrics\",volcano_namespace=\"$namespace\"}) by (pod,namespace)) * on(pod) (max(kube_pod_container_status_running{job=\"kube-state-metrics\"}) \nby (pod,namespace))) ","interval":"","legendFormat":"GPU Cards","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Namespace Running GPU","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}}],"schemaVersion":26,"style":"dark","tags":[],"templating":{"list":[{"allValue":null,"current":{"selected":false,"text":"yu7gvcjd","value":"yu7gvcjd"},"datasource":"prometheus","definition":"label_values(kube_namespace_labels, 
namespace)","error":null,"hide":0,"includeAll":false,"label":null,"multi":false,"name":"namespace","options":[],"query":"label_values(kube_namespace_labels, namespace)","refresh":1,"regex":"","skipUrlSync":false,"sort":0,"tagValuesQuery":"","tags":[],"tagsQuery":"","type":"query","useTags":false}]},"time":{"from":"now-6h","to":"now"},"timepicker":{},"timezone":"","title":"Volcano Namespace  View","uid":"TWuLSpJMk","version":14}
   508  ---
   509  # Source: volcano/templates/grafana.yaml
   510  apiVersion: v1
   511  kind: Service  # NodePort entry point for the Grafana pods
   512  metadata:
   513    name: grafana
   514    namespace: volcano-monitoring
   515    annotations:  # NOTE(review): presumably read by annotation-based Prometheus discovery; confirm against prometheus.yml
   516      prometheus.io/port: '3000'
   517      prometheus.io/scrape: 'true'
   518  spec:
   519    type: NodePort
   520    selector:
   521      app: grafana  # traffic goes to pods carrying this label
   522    ports:
   523      - nodePort: 30004  # fixed port exposed on each node
   524        port: 3000  # Service port inside the cluster
   525        targetPort: 3000  # port on the selected pods
   526  ---
   527  # Source: volcano/templates/grafana.yaml
   528  apiVersion: apps/v1
   529  kind: Deployment  # single-replica Grafana; datasources and dashboards are mounted from ConfigMaps
   530  metadata:
   531    name: grafana
   532    namespace: volcano-monitoring
   533  spec:
   534    replicas: 1
   535    selector:
   536      matchLabels:
   537        app: grafana
   538    template:
   539      metadata:
   540        name: grafana
   541        labels:
   542          app: grafana  # has to match spec.selector.matchLabels
   543      spec:
   544        containers:
   545          - name: grafana
   546            image: grafana/grafana:latest  # NOTE(review): floating tag; dashboards in this file declare pluginVersion 7.3.4, consider pinning
   547            readinessProbe:
   548              httpGet:
   549                path: /api/health
   550                port: 3000
   551            livenessProbe:
   552              httpGet:
   553                path: /api/health
   554                port: 3000
   555              initialDelaySeconds: 10
   556              periodSeconds: 10
   557            ports:
   558              - name: grafana
   559                containerPort: 3000
   560            resources:
   561              limits:
   562                memory: "2Gi"
   563                cpu: "1000m"
   564              requests:
   565                memory: "1Gi"
   566                cpu: "500m"
   567            volumeMounts:
   568              - mountPath: /var/lib/grafana  # backed by an emptyDir, so not persisted across pod replacement
   569                name: grafana-storage
   570              - mountPath: /etc/grafana/provisioning/datasources
   571                name: grafana-datasources
   572                readOnly: true  # ConfigMap volumes are always mounted read-only; declare it explicitly
   573              - mountPath: /var/lib/grafana/dashboards
   574                name: grafana-volcano-dashboard
   575                readOnly: true  # ConfigMap-backed, same as the other provisioning mounts
   576              - mountPath: /etc/grafana/provisioning/dashboards
   577                name: grafana-volcano-dashboard-config
   578                readOnly: true
   579        volumes:
   580          - name: grafana-storage
   581            emptyDir: {}
   582          - name: grafana-volcano-dashboard
   583            configMap:
   584              defaultMode: 420  # decimal form of octal 0644
   585              name: grafana-volcano-dashboard
   586          - name: grafana-datasources
   587            configMap:
   588              defaultMode: 420  # decimal form of octal 0644
   589              name: grafana-datasources
   590          - name: grafana-volcano-dashboard-config
   591            configMap:
   592              defaultMode: 420  # decimal form of octal 0644
   593              name: grafana-volcano-dashboard-config