github.com/m3db/m3@v1.5.0/scripts/docker-integration-tests/aggregator/test.sh (about)

     1  #!/usr/bin/env bash
     2  
     3  set -xe
     4  
     5  source "$M3_PATH"/scripts/docker-integration-tests/common.sh
     6  REVISION=$(git rev-parse HEAD)
     7  COMPOSE_FILE="$M3_PATH"/scripts/docker-integration-tests/aggregator/docker-compose.yml
     8  # quay.io/m3db/prometheus_remote_client_golang @ v0.4.3
     9  PROMREMOTECLI_IMAGE=quay.io/m3db/prometheus_remote_client_golang:v0.4.3
    10  JQ_IMAGE=realguess/jq:1.4@sha256:300c5d9fb1d74154248d155ce182e207cf6630acccbaadd0168e18b15bfaa786
    11  export REVISION
    12  
    13  echo "Pull containers required for test"
    14  docker pull $PROMREMOTECLI_IMAGE
    15  docker pull $JQ_IMAGE
    16  
    17  echo "Run m3dbnode"
    18  docker-compose -f ${COMPOSE_FILE} up -d dbnode01
    19  
    20  # Stop containers on exit
    21  METRIC_EMIT_PID="-1"
    22  function defer {
    23    docker-compose -f ${COMPOSE_FILE} down || echo "unable to shutdown containers" # CI fails to stop all containers sometimes
    24    if [ "$METRIC_EMIT_PID" != "-1" ]; then
    25      echo "Kill metric emit process"
    26      kill $METRIC_EMIT_PID
    27    fi
    28  }
    29  trap defer EXIT
    30  
    31  echo "Setup DB node"
    32  AGG_RESOLUTION=10s AGG_RETENTION=6h setup_single_m3db_node
    33  
    34  echo "Initializing aggregator topology"
    35  curl -vvvsSf -X POST -H "Cluster-Environment-Name: override_test_env" localhost:7201/api/v1/services/m3aggregator/placement/init -d '{
    36      "num_shards": 64,
    37      "replication_factor": 2,
    38      "instances": [
    39          {
    40              "id": "m3aggregator01",
    41              "isolation_group": "availability-zone-a",
    42              "zone": "embedded",
    43              "weight": 100,
    44              "endpoint": "m3aggregator01:6000",
    45              "hostname": "m3aggregator01",
    46              "port": 6000
    47          },
    48          {
    49              "id": "m3aggregator02",
    50              "isolation_group": "availability-zone-b",
    51              "zone": "embedded",
    52              "weight": 100,
    53              "endpoint": "m3aggregator02:6000",
    54              "hostname": "m3aggregator02",
    55              "port": 6000
    56          }
    57      ]
    58  }'
    59  
    60  echo "Initializing m3msg inbound topic for m3aggregator ingestion from m3coordinators"
    61  curl -vvvsSf -X POST -H "Topic-Name: aggregator_ingest" -H "Cluster-Environment-Name: override_test_env" localhost:7201/api/v1/topic/init -d '{
    62      "numberOfShards": 64
    63  }'
    64  
    65  # Do this after placement and topic for m3aggregator is created.
    66  echo "Adding m3aggregator as a consumer to the aggregator ingest topic"
    67  curl -vvvsSf -X POST -H "Topic-Name: aggregator_ingest" -H "Cluster-Environment-Name: override_test_env" localhost:7201/api/v1/topic -d '{
    68    "consumerService": {
    69      "serviceId": {
    70        "name": "m3aggregator",
    71        "environment": "override_test_env",
    72        "zone": "embedded"
    73      },
    74      "consumptionType": "REPLICATED",
    75      "messageTtlNanos": "600000000000"
    76    }
    77  }' # msgs will be discarded after 600000000000ns = 10mins
    78  
    79  echo "Initializing m3coordinator topology"
    80  curl -vvvsSf -X POST localhost:7201/api/v1/services/m3coordinator/placement/init -d '{
    81      "instances": [
    82          {
    83              "id": "m3coordinator01",
    84              "zone": "embedded",
    85              "endpoint": "m3coordinator01:7507",
    86              "hostname": "m3coordinator01",
    87              "port": 7507
    88          }
    89      ]
    90  }'
    91  echo "Done initializing m3coordinator topology"
    92  
    93  echo "Validating m3coordinator topology"
    94  [ "$(curl -sSf localhost:7201/api/v1/services/m3coordinator/placement | jq .placement.instances.m3coordinator01.id)" == '"m3coordinator01"' ]
    95  echo "Done validating topology"
    96  
    97  # Do this after placement for m3coordinator is created.
    98  echo "Initializing m3msg outbound topic for m3coordinator ingestion from m3aggregators"
    99  curl -vvvsSf -X POST -H "Topic-Name: aggregated_metrics" -H "Cluster-Environment-Name: override_test_env" localhost:7201/api/v1/topic/init -d '{
   100      "numberOfShards": 64
   101  }'
   102  
   103  echo "Adding m3coordinator as a consumer to the aggregator publish topic"
   104  curl -vvvsSf -X POST -H "Topic-Name: aggregated_metrics" -H "Cluster-Environment-Name: override_test_env" localhost:7201/api/v1/topic -d '{
   105    "consumerService": {
   106      "serviceId": {
   107        "name": "m3coordinator",
   108        "environment": "default_env",
   109        "zone": "embedded"
   110      },
   111      "consumptionType": "SHARED",
   112      "messageTtlNanos": "600000000000"
   113    }
   114  }' # msgs will be discarded after 600000000000ns = 10mins
   115  
   116  echo "Running m3coordinator container"
   117  echo "> port 7202 is coordinator API"
   118  echo "> port 7203 is coordinator metrics"
   119  echo "> port 7204 is coordinator graphite ingest"
   120  echo "> port 7507 is coordinator m3msg ingest from aggregator ingest"
   121  docker-compose -f ${COMPOSE_FILE} up -d m3coordinator01
   122  COORDINATOR_API="localhost:7202"
   123  
   124  echo "Running m3aggregator containers"
   125  docker-compose -f ${COMPOSE_FILE} up -d m3aggregator01
   126  docker-compose -f ${COMPOSE_FILE} up -d m3aggregator02
   127  
   128  echo "Verifying aggregation with remote aggregators"
   129  
   130  function read_carbon {
   131    target=$1
   132    expected_val=$2
   133    end=$(date +%s)
   134    start=$(($end-1000))
   135    RESPONSE=$(curl -sSfg "http://${COORDINATOR_API}/api/v1/graphite/render?target=$target&from=$start&until=$end")
   136    test "$(echo "$RESPONSE" | jq ".[0].datapoints | .[][0] | select(. != null)" | jq -s last)" = "$expected_val"
   137    return $?
   138  }
   139  
   140  # Send metric values 40 and 44 every second
   141  echo "Sending unaggregated carbon metrics to m3coordinator"
   142  bash -c 'while true; do t=$(date +%s); echo "foo.bar.baz 40 $t" | nc 0.0.0.0 7204; echo "foo.bar.baz 44 $t" | nc 0.0.0.0 7204; sleep 1; done' &
   143  
   144  # Track PID to kill on exit
   145  METRIC_EMIT_PID="$!"
   146  
   147  function test_aggregated_graphite_metric {
   148    # Read back the averaged averaged metric, we configured graphite
   149    # aggregation policy to average each tile and we are emitting
   150    # values 40 and 44 to get an average of 42 each tile
   151    echo "Read back aggregated averaged metric"
   152    ATTEMPTS=100 TIMEOUT=1 MAX_TIMEOUT=4 retry_with_backoff read_carbon foo.bar.* 42
   153  
   154    echo "Finished with carbon metrics"
   155    kill $METRIC_EMIT_PID
   156    export METRIC_EMIT_PID="-1"
   157  }
   158  
   159  function prometheus_remote_write {
   160    local metric_name=$1
   161    local datapoint_timestamp=$2
   162    local datapoint_value=$3
   163    local expect_success=$4
   164    local expect_success_err=$5
   165    local expect_status=$6
   166    local expect_status_err=$7
   167    local label0_name=${label0_name:-label0}
   168    local label0_value=${label0_value:-label0}
   169    local label1_name=${label1_name:-label1}
   170    local label1_value=${label1_value:-label1}
   171    local label2_name=${label2_name:-label2}
   172    local label2_value=${label2_value:-label2}
   173    local metric_type=${metric_type:counter}
   174  
   175    network_name="aggregator"
   176    network=$(docker network ls | fgrep $network_name | tr -s ' ' | cut -f 1 -d ' ' | tail -n 1)
   177    out=$((docker run -it --rm --network $network           \
   178      $PROMREMOTECLI_IMAGE                                  \
   179      -u http://m3coordinator01:7202/api/v1/prom/remote/write \
   180      -h Prometheus-Metric-Type:${metric_type}                        \
   181      -t __name__:${metric_name}                            \
   182      -t ${label0_name}:${label0_value}                     \
   183      -t ${label1_name}:${label1_value}                     \
   184      -t ${label2_name}:${label2_value}                     \
   185      -d ${datapoint_timestamp},${datapoint_value} | grep -v promremotecli_log) || true)
   186    success=$(echo $out | grep -v promremotecli_log | docker run --rm -i $JQ_IMAGE jq .success)
   187    status=$(echo $out | grep -v promremotecli_log | docker run --rm -i $JQ_IMAGE jq .statusCode)
   188    if [[ "$success" != "$expect_success" ]]; then
   189      echo $expect_success_err
   190      return 1
   191    fi
   192    if [[ "$status" != "$expect_status" ]]; then
   193      echo "${expect_status_err}: actual=${status}"
   194      return 1
   195    fi
   196    echo "Returned success=${success}, status=${status} as expected"
   197    return 0
   198  }
   199  
   200  function prometheus_query_native {
   201    local endpoint=${endpoint:-}
   202    local query=${query:-}
   203    local params=${params:-}
   204    local metrics_type=${metrics_type:-}
   205    local metrics_storage_policy=${metrics_storage_policy:-}
   206    local jq_path=${jq_path:-}
   207    local expected_value=${expected_value:-}
   208  
   209    params_prefixed=""
   210    if [[ "$params" != "" ]]; then
   211      params_prefixed='&'"${params}"
   212    fi
   213  
   214    result=$(curl -s                                    \
   215      -H "M3-Metrics-Type: ${metrics_type}"             \
   216      -H "M3-Storage-Policy: ${metrics_storage_policy}" \
   217      "0.0.0.0:7202/api/v1/${endpoint}?query=${query}${params_prefixed}" | jq -r "${jq_path}" | jq -s last)
   218    test "$result" = "$expected_value"
   219    return $?
   220  }
   221  
   222  function dbnode_fetch {
   223    local namespace=${namespace}
   224    local id=${id}
   225    local rangeStart=${rangeStart}
   226    local rangeEnd=${rangeEnd}
   227    local jq_path=${jq_path:-}
   228    local expected_value=${expected_value:-}
   229    result=$(curl -s                                    \
   230      "0.0.0.0:9002/fetch" \
   231      "-d" \
   232      "{\"namespace\": \"${namespace}\", \"id\": \"${id}\", \"rangeStart\": ${rangeStart}, \"rangeEnd\": ${rangeEnd}}" | jq -r "${jq_path}")
   233    test "$result" = "$expected_value"
   234    return $?
   235  }
   236  
   237  function test_aggregated_rollup_rule {
   238    resolution_seconds="10"
   239    now=$(date +"%s")
   240    now_truncate_by=$(( $now % $resolution_seconds ))
   241    now_truncated=$(( $now - $now_truncate_by ))
   242  
   243    echo "Test write with rollup rule"
   244  
   245    # Emit values for endpoint /foo/bar (to ensure right values aggregated)
   246    write_at="$now_truncated"
   247    value="42"
   248    value_rate="22"
   249    value_inc_by=$(( $value_rate * $resolution_seconds ))
   250    for i in $(seq 1 10); do
   251      label0_name="app" label0_value="nginx_edge" \
   252        label1_name="status_code" label1_value="500" \
   253        label2_name="endpoint" label2_value="/foo/bar" \
   254        metric_type="counter" \
   255        prometheus_remote_write \
   256        http_requests $write_at $value \
   257        true "Expected request to succeed" \
   258        200 "Expected request to return status code 200"
   259      write_at=$(( $write_at + $resolution_seconds ))
   260      value=$(( $value + $value_inc_by ))
   261    done
   262  
   263    # Emit values for endpoint /foo/baz (to ensure right values aggregated)
   264    write_at="$now_truncated"
   265    value="84"
   266    value_rate="4"
   267    value_inc_by=$(( $value_rate * $resolution_seconds ))
   268    for i in $(seq 1 10); do
   269      label0_name="app" label0_value="nginx_edge" \
   270        label1_name="status_code" label1_value="500" \
   271        label2_name="endpoint" label2_value="/foo/baz" \
   272        metric_type="gauge" \
   273        prometheus_remote_write \
   274        http_requests $write_at $value \
   275        true "Expected request to succeed" \
   276        200 "Expected request to return status code 200"
   277      write_at=$(( $write_at + $resolution_seconds ))
   278      value=$(( $value + $value_inc_by ))
   279    done
   280  
   281    start=$(( $now - 3600 ))
   282    end=$(( $now + 3600 ))
   283    step="30s"
   284    params_range="start=${start}"'&'"end=${end}"'&'"step=30s"
   285    jq_path=".data.result[0].values | .[][1] | select(. != null)"
   286  
   287    echo "Test query rollup rule"
   288  
   289    # Test by values are rolled up by second, then sum (for endpoint="/foo/bar")
   290    ATTEMPTS=50 TIMEOUT=2 MAX_TIMEOUT=4 \
   291      endpoint=query_range query="http_requests_by_status_code\{endpoint=\"/foo/bar\"\}" \
   292      params="$params_range" \
   293      jq_path="$jq_path" expected_value="22" \
   294      metrics_type="aggregated" metrics_storage_policy="10s:6h" \
   295      retry_with_backoff prometheus_query_native
   296  
   297    # Test by values are rolled up by second, then sum (for endpoint="/foo/bar")
   298    ATTEMPTS=50 TIMEOUT=2 MAX_TIMEOUT=4 \
   299      endpoint=query_range query="http_requests_by_status_code\{endpoint=\"/foo/baz\"\}" \
   300      params="$params_range" \
   301      jq_path="$jq_path" expected_value="4" \
   302      metrics_type="aggregated" metrics_storage_policy="10s:6h" \
   303      retry_with_backoff prometheus_query_native
   304  }
   305  
   306  function test_metric_type_survives_aggregation {
   307    echo "Test metric type should be kept after aggregation"
   308    now=$(date +"%s")
   309    value="42"
   310  
   311    metric_type="counter" \
   312    prometheus_remote_write \
   313    metric_type_test $now $value \
   314    true "Expected request to succeed" \
   315    200 "Expected request to return status code 200"
   316  
   317    start=$(( $now - 3600 ))
   318    end=$(( $now + 3600 ))
   319    jq_path=".datapoints[0].annotation"
   320  
   321    echo "Test query metric type"
   322  
   323    # Test if metric type is stored in aggregated namespace
   324    # "CAEQAQ==" is the protobuf encoded base64 value with the metric type on it
   325    ATTEMPTS=50 TIMEOUT=2 MAX_TIMEOUT=4 \
   326      namespace="agg" \
   327      id='{__name__=\"metric_type_test\",label0=\"label0\",label1=\"label1\",label2=\"label2\"}' \
   328      rangeStart=${start} \
   329      rangeEnd=${end} \
   330      jq_path="$jq_path" expected_value="CAEQAQ==" \
   331      retry_with_backoff dbnode_fetch
   332    
   333    # Additional test to ensure correct value is stored
   334    ATTEMPTS=5 TIMEOUT=2 MAX_TIMEOUT=4 \
   335      namespace="agg" \
   336      id='{__name__=\"metric_type_test\",label0=\"label0\",label1=\"label1\",label2=\"label2\"}' \
   337      rangeStart=${start} \
   338      rangeEnd=${end} \
   339      jq_path=".datapoints[0].value" expected_value="45"
   340  }
   341  
   342  echo "Run tests"
   343  test_aggregated_graphite_metric
   344  test_aggregated_rollup_rule
   345  test_metric_type_survives_aggregation