github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/python/examples/sdk/cluster-performance.ipynb (about)

     1  {
     2   "cells": [
     3    {
     4     "cell_type": "markdown",
     5     "metadata": {},
     6     "source": [
     7      "## Monitoring Cluster Performance with the Python SDK"
     8     ]
     9    },
    10    {
    11     "cell_type": "code",
    12     "execution_count": null,
    13     "metadata": {},
    14     "outputs": [],
    15     "source": [
    16      "%pip install matplotlib ipython aistore numpy"
    17     ]
    18    },
    19    {
    20     "cell_type": "markdown",
    21     "metadata": {},
    22     "source": [
    23      "### Creating an AIStore Cluster Instance"
    24     ]
    25    },
    26    {
    27     "cell_type": "code",
    28     "execution_count": 6,
    29     "metadata": {},
    30     "outputs": [],
    31     "source": [
    32      "import os\n",
    33      "from aistore.sdk import Client\n",
    34      "\n",
    35      "ais_url = os.environ.get(\"AIS_ENDPOINT\", \"http://localhost:8080\")  # env var overrides the local default\n",
    36      "client = Client(ais_url)  # entry point for accessing the AIS cluster\n",
    37      "cluster = client.cluster()"
    38     ]
    39    },
    40    {
    41     "cell_type": "markdown",
    42     "metadata": {},
    43     "source": [
    44      "### Performance Metrics\n",
    45      "AIStore offers detailed insights into cluster performance through three primary classes: NodeThroughput, NodeLatency, and NodeCounter. Here's a brief overview of each:\n",
    46      "\n",
    47      "- NodeThroughput: Measures data processing rates, focusing on the bandwidth for both data reading (GET) and writing (PUT) operations.\n",
    48      "- NodeLatency: Captures delays in data transfer, offering average sizes for GET and PUT operations and detailing the latency of accessing infrequently used (cold) data.\n",
    49      "- NodeCounter: Tracks operational events and errors, including counts for various operations (e.g., GET, PUT, DELETE) and specific events like cache evictions or version changes."
    50     ]
    51    },
    52    {
    53     "cell_type": "code",
    54     "execution_count": null,
    55     "metadata": {},
    56     "outputs": [],
    57     "source": [
    58      "cluster_performance = cluster.get_performance()  # fetch the cluster's current performance metrics\n",
    59      "cluster_performance.as_dict()  # last expression, so the notebook displays its value"
    60     ]
    61    },
    62    {
    63     "cell_type": "markdown",
    64     "metadata": {},
    65     "source": [
    66      "### Metrics in Human-Readable Format"
    67     ]
    68    },
    69    {
    70     "cell_type": "code",
    71     "execution_count": null,
    72     "metadata": {},
    73     "outputs": [],
    74     "source": [
    75      "throughput_dict = cluster_performance.throughput\n",
    76      "for target_id, node_throughput in throughput_dict.items():\n",
    77      "    # Format inline rather than rebinding the loop variable to a different type\n",
    78      "    print(f\"{target_id}: {node_throughput.as_dict()}\")"
    79     ]
    80    },
    81    {
    82     "cell_type": "markdown",
    83     "metadata": {},
    84     "source": [
    85      "### Example - Plotting throughput"
    86     ]
    87    },
    88    {
    89     "cell_type": "code",
    90     "execution_count": 2,
    91     "metadata": {},
    92     "outputs": [],
    93     "source": [
    94      "import matplotlib.pyplot as plt\n",
    95      "from IPython.display import clear_output\n",
    96      "import time\n",
    97      "import numpy as np\n",
    98      "\n",
    99      "# Bandwidth history in MB/s, one list of samples per target ID\n",
   100      "get_bw_values = {}\n",
   101      "put_bw_values = {}\n",
   102      "\n",
   103      "\n",
   104      "def bw_to_mbps(bw_value):\n",
   105      "    \"\"\"Convert a bandwidth value to MB/s (assumes bytes per second).\"\"\"\n",
   106      "    return bw_value / 1e6\n",
   107      "\n",
   108      "\n",
   109      "def plot(cluster_performance):\n",
   110      "    \"\"\"Record one throughput sample per target and redraw each target's history.\"\"\"\n",
   111      "    target_ids = list(cluster_performance.throughput.keys())\n",
   112      "    if not target_ids:\n",
   113      "        return  # no targets reported; a zero-row subplot grid is invalid\n",
   114      "\n",
   115      "    fig, axs = plt.subplots(\n",
   116      "        len(target_ids), 1, figsize=(10, 5 * len(target_ids)), squeeze=False\n",
   117      "    )\n",
   118      "\n",
   119      "    for ax, target_id in zip(axs[:, 0], target_ids):\n",
   120      "        throughput_data = cluster_performance.throughput[target_id]\n",
   121      "        # Each target keeps its own series so nodes never share samples\n",
   122      "        get_series = get_bw_values.setdefault(target_id, [])\n",
   123      "        put_series = put_bw_values.setdefault(target_id, [])\n",
   124      "        get_series.append(bw_to_mbps(throughput_data.get_bw))\n",
   125      "        put_series.append(bw_to_mbps(throughput_data.put_bw))\n",
   126      "        time_steps = np.arange(len(get_series))  # one step per sample\n",
   127      "\n",
   128      "        ax.plot(time_steps, get_series, label=\"GET Bandwidth (MB/s)\", marker=\"o\")\n",
   129      "        ax.plot(time_steps, put_series, label=\"PUT Bandwidth (MB/s)\", marker=\"x\")\n",
   130      "        ax.set_title(f\"Node {target_id} Throughput Over Time\")\n",
   131      "        ax.set_xlabel(\"Time\")\n",
   132      "        ax.set_ylabel(\"Bandwidth (MB/s)\")\n",
   133      "        ax.legend()\n",
   134      "    plt.tight_layout()\n",
   135      "    plt.show()\n",
   136      "    plt.close(fig)  # release the figure so repeated redraws do not accumulate\n",
   137      "\n",
   138      "\n",
   139      "def plot_live_metrics(cluster, duration, frequency=2):\n",
   140      "    \"\"\"Redraw cluster throughput every `frequency` seconds for about `duration` seconds.\"\"\"\n",
   141      "    if frequency <= 0:\n",
   142      "        raise ValueError(\"frequency must be a positive number of seconds\")\n",
   143      "    passed = 0\n",
   144      "    try:\n",
   145      "        while True:\n",
   146      "            cluster_performance = cluster.get_performance()\n",
   147      "            clear_output(wait=True)  # replace the previous plot in place\n",
   148      "            plot(cluster_performance)\n",
   149      "            plt.close(\"all\")  # unlike clf(), leaves no empty figure behind\n",
   150      "            passed += frequency\n",
   151      "            if passed >= duration:\n",
   152      "                break\n",
   153      "            time.sleep(frequency)  # wait only when another update follows\n",
   154      "    finally:  # reset history, even on interrupt, so the next run starts clean\n",
   155      "        get_bw_values.clear()\n",
   156      "        put_bw_values.clear()"
   157     ]
   158    },
   159    {
   160     "cell_type": "code",
   161     "execution_count": null,
   162     "metadata": {},
   163     "outputs": [],
   164     "source": [
   165      "plot_live_metrics(cluster, duration=60, frequency=10)  # redraw every 10 s for about 60 s"
   166     ]
   167    }
   168   ],
   169   "metadata": {
   170    "kernelspec": {
   171     "display_name": "Python 3 (ipykernel)",
   172     "language": "python",
   173     "name": "python3"
   174    }
   175   },
   176   "nbformat": 4,
   177   "nbformat_minor": 2
   178  }