#!/usr/bin/env bash
#
# Integration test: verifies that metrics written through m3coordinator are
# aggregated by remote m3aggregator instances and land in the aggregated
# m3db namespace, via carbon ingest, Prometheus remote write, rollup rules,
# and metric-type annotation checks.
#
# Requires: M3_PATH env var, docker, docker-compose, curl, jq, nc, git.

set -xe

source "$M3_PATH"/scripts/docker-integration-tests/common.sh
REVISION=$(git rev-parse HEAD)
COMPOSE_FILE="$M3_PATH"/scripts/docker-integration-tests/aggregator/docker-compose.yml
# quay.io/m3db/prometheus_remote_client_golang @ v0.4.3
PROMREMOTECLI_IMAGE=quay.io/m3db/prometheus_remote_client_golang:v0.4.3
JQ_IMAGE=realguess/jq:1.4@sha256:300c5d9fb1d74154248d155ce182e207cf6630acccbaadd0168e18b15bfaa786
export REVISION

echo "Pull containers required for test"
docker pull $PROMREMOTECLI_IMAGE
docker pull $JQ_IMAGE

echo "Run m3dbnode"
docker-compose -f ${COMPOSE_FILE} up -d dbnode01

# Stop containers on exit, and kill the background metric-emit loop if one
# was started (METRIC_EMIT_PID stays "-1" until the loop is launched).
METRIC_EMIT_PID="-1"
function defer {
  docker-compose -f ${COMPOSE_FILE} down || echo "unable to shutdown containers" # CI fails to stop all containers sometimes
  if [ "$METRIC_EMIT_PID" != "-1" ]; then
    echo "Kill metric emit process"
    kill $METRIC_EMIT_PID
  fi
}
trap defer EXIT

echo "Setup DB node"
AGG_RESOLUTION=10s AGG_RETENTION=6h setup_single_m3db_node

echo "Initializing aggregator topology"
curl -vvvsSf -X POST -H "Cluster-Environment-Name: override_test_env" localhost:7201/api/v1/services/m3aggregator/placement/init -d '{
    "num_shards": 64,
    "replication_factor": 2,
    "instances": [
        {
            "id": "m3aggregator01",
            "isolation_group": "availability-zone-a",
            "zone": "embedded",
            "weight": 100,
            "endpoint": "m3aggregator01:6000",
            "hostname": "m3aggregator01",
            "port": 6000
        },
        {
            "id": "m3aggregator02",
            "isolation_group": "availability-zone-b",
            "zone": "embedded",
            "weight": 100,
            "endpoint": "m3aggregator02:6000",
            "hostname": "m3aggregator02",
            "port": 6000
        }
    ]
}'

echo "Initializing m3msg inbound topic for m3aggregator ingestion from m3coordinators"
curl -vvvsSf -X POST -H "Topic-Name: aggregator_ingest" -H "Cluster-Environment-Name: override_test_env" localhost:7201/api/v1/topic/init -d '{
    "numberOfShards": 64
}'

# Do this after placement and topic for m3aggregator is created.
echo "Adding m3aggregator as a consumer to the aggregator ingest topic"
curl -vvvsSf -X POST -H "Topic-Name: aggregator_ingest" -H "Cluster-Environment-Name: override_test_env" localhost:7201/api/v1/topic -d '{
  "consumerService": {
    "serviceId": {
      "name": "m3aggregator",
      "environment": "override_test_env",
      "zone": "embedded"
    },
    "consumptionType": "REPLICATED",
    "messageTtlNanos": "600000000000"
  }
}' # msgs will be discarded after 600000000000ns = 10mins

echo "Initializing m3coordinator topology"
curl -vvvsSf -X POST localhost:7201/api/v1/services/m3coordinator/placement/init -d '{
    "instances": [
        {
            "id": "m3coordinator01",
            "zone": "embedded",
            "endpoint": "m3coordinator01:7507",
            "hostname": "m3coordinator01",
            "port": 7507
        }
    ]
}'
echo "Done initializing m3coordinator topology"

echo "Validating m3coordinator topology"
[ "$(curl -sSf localhost:7201/api/v1/services/m3coordinator/placement | jq .placement.instances.m3coordinator01.id)" == '"m3coordinator01"' ]
echo "Done validating topology"

# Do this after placement for m3coordinator is created.
echo "Initializing m3msg outbound topic for m3coordinator ingestion from m3aggregators"
curl -vvvsSf -X POST -H "Topic-Name: aggregated_metrics" -H "Cluster-Environment-Name: override_test_env" localhost:7201/api/v1/topic/init -d '{
    "numberOfShards": 64
}'

echo "Adding m3coordinator as a consumer to the aggregator publish topic"
curl -vvvsSf -X POST -H "Topic-Name: aggregated_metrics" -H "Cluster-Environment-Name: override_test_env" localhost:7201/api/v1/topic -d '{
  "consumerService": {
    "serviceId": {
      "name": "m3coordinator",
      "environment": "default_env",
      "zone": "embedded"
    },
    "consumptionType": "SHARED",
    "messageTtlNanos": "600000000000"
  }
}' # msgs will be discarded after 600000000000ns = 10mins

echo "Running m3coordinator container"
echo "> port 7202 is coordinator API"
echo "> port 7203 is coordinator metrics"
echo "> port 7204 is coordinator graphite ingest"
echo "> port 7507 is coordinator m3msg ingest from aggregator ingest"
docker-compose -f ${COMPOSE_FILE} up -d m3coordinator01
COORDINATOR_API="localhost:7202"

echo "Running m3aggregator containers"
docker-compose -f ${COMPOSE_FILE} up -d m3aggregator01
docker-compose -f ${COMPOSE_FILE} up -d m3aggregator02

echo "Verifying aggregation with remote aggregators"

# read_carbon <target> <expected_val>
# Queries the graphite render API over the last ~1000s and succeeds iff the
# last non-null datapoint for <target> equals <expected_val>.
function read_carbon {
  target=$1
  expected_val=$2
  end=$(date +%s)
  start=$(($end-1000))
  RESPONSE=$(curl -sSfg "http://${COORDINATOR_API}/api/v1/graphite/render?target=$target&from=$start&until=$end")
  test "$(echo "$RESPONSE" | jq ".[0].datapoints | .[][0] | select(. != null)" | jq -s last)" = "$expected_val"
  return $?
}

# Send metric values 40 and 44 every second
echo "Sending unaggregated carbon metrics to m3coordinator"
bash -c 'while true; do t=$(date +%s); echo "foo.bar.baz 40 $t" | nc 0.0.0.0 7204; echo "foo.bar.baz 44 $t" | nc 0.0.0.0 7204; sleep 1; done' &

# Track PID to kill on exit
METRIC_EMIT_PID="$!"

function test_aggregated_graphite_metric {
  # Read back the averaged averaged metric, we configured graphite
  # aggregation policy to average each tile and we are emitting
  # values 40 and 44 to get an average of 42 each tile
  echo "Read back aggregated averaged metric"
  ATTEMPTS=100 TIMEOUT=1 MAX_TIMEOUT=4 retry_with_backoff read_carbon foo.bar.* 42

  echo "Finished with carbon metrics"
  kill $METRIC_EMIT_PID
  export METRIC_EMIT_PID="-1"
}

# prometheus_remote_write <metric_name> <timestamp> <value> \
#   <expect_success> <expect_success_err> <expect_status> <expect_status_err>
# Writes one datapoint via the Prometheus remote-write client container and
# asserts on the client-reported success flag and HTTP status code.
# Label names/values and the metric type may be overridden through the
# label{0,1,2}_{name,value} and metric_type environment variables.
function prometheus_remote_write {
  local metric_name=$1
  local datapoint_timestamp=$2
  local datapoint_value=$3
  local expect_success=$4
  local expect_success_err=$5
  local expect_status=$6
  local expect_status_err=$7
  local label0_name=${label0_name:-label0}
  local label0_value=${label0_value:-label0}
  local label1_name=${label1_name:-label1}
  local label1_value=${label1_value:-label1}
  local label2_name=${label2_name:-label2}
  local label2_value=${label2_value:-label2}
  # NB: was ${metric_type:counter} (substring expansion), which never applied
  # the default; ":-" is the intended default-if-unset-or-empty expansion.
  local metric_type=${metric_type:-counter}

  network_name="aggregator"
  network=$(docker network ls | fgrep $network_name | tr -s ' ' | cut -f 1 -d ' ' | tail -n 1)
  # Space after $( keeps bash from parsing "$((" as arithmetic expansion.
  out=$( (docker run -it --rm --network $network           \
    $PROMREMOTECLI_IMAGE                                   \
    -u http://m3coordinator01:7202/api/v1/prom/remote/write \
    -h Prometheus-Metric-Type:${metric_type}               \
    -t __name__:${metric_name}                             \
    -t ${label0_name}:${label0_value}                      \
    -t ${label1_name}:${label1_value}                      \
    -t ${label2_name}:${label2_value}                      \
    -d ${datapoint_timestamp},${datapoint_value} | grep -v promremotecli_log) || true)
  success=$(echo $out | grep -v promremotecli_log | docker run --rm -i $JQ_IMAGE jq .success)
  status=$(echo $out | grep -v promremotecli_log | docker run --rm -i $JQ_IMAGE jq .statusCode)
  if [[ "$success" != "$expect_success" ]]; then
    echo $expect_success_err
    return 1
  fi
  if [[ "$status" != "$expect_status" ]]; then
    echo "${expect_status_err}: actual=${status}"
    return 1
  fi
  echo "Returned success=${success}, status=${status} as expected"
  return 0
}

# prometheus_query_native
# Runs a PromQL query against the coordinator and succeeds iff the last
# value extracted by $jq_path equals $expected_value. All inputs are passed
# via environment variables (endpoint, query, params, metrics_type,
# metrics_storage_policy, jq_path, expected_value) so it composes with
# retry_with_backoff.
function prometheus_query_native {
  local endpoint=${endpoint:-}
  local query=${query:-}
  local params=${params:-}
  local metrics_type=${metrics_type:-}
  local metrics_storage_policy=${metrics_storage_policy:-}
  local jq_path=${jq_path:-}
  local expected_value=${expected_value:-}

  params_prefixed=""
  if [[ "$params" != "" ]]; then
    params_prefixed='&'"${params}"
  fi

  result=$(curl -s                                    \
    -H "M3-Metrics-Type: ${metrics_type}"             \
    -H "M3-Storage-Policy: ${metrics_storage_policy}" \
    "0.0.0.0:7202/api/v1/${endpoint}?query=${query}${params_prefixed}" | jq -r "${jq_path}" | jq -s last)
  test "$result" = "$expected_value"
  return $?
}

# dbnode_fetch
# Fetches a series directly from the dbnode fetch endpoint and succeeds iff
# the value extracted by $jq_path equals $expected_value. Inputs are passed
# via environment variables (namespace, id, rangeStart, rangeEnd, jq_path,
# expected_value) so it composes with retry_with_backoff.
function dbnode_fetch {
  local namespace=${namespace}
  local id=${id}
  local rangeStart=${rangeStart}
  local rangeEnd=${rangeEnd}
  local jq_path=${jq_path:-}
  local expected_value=${expected_value:-}
  result=$(curl -s             \
    "0.0.0.0:9002/fetch"       \
    "-d"                       \
    "{\"namespace\": \"${namespace}\", \"id\": \"${id}\", \"rangeStart\": ${rangeStart}, \"rangeEnd\": ${rangeEnd}}" | jq -r "${jq_path}")
  test "$result" = "$expected_value"
  return $?
}

function test_aggregated_rollup_rule {
  resolution_seconds="10"
  now=$(date +"%s")
  now_truncate_by=$(( $now % $resolution_seconds ))
  now_truncated=$(( $now - $now_truncate_by ))

  echo "Test write with rollup rule"

  # Emit values for endpoint /foo/bar (to ensure right values aggregated)
  write_at="$now_truncated"
  value="42"
  value_rate="22"
  value_inc_by=$(( $value_rate * $resolution_seconds ))
  for i in $(seq 1 10); do
    label0_name="app" label0_value="nginx_edge" \
      label1_name="status_code" label1_value="500" \
      label2_name="endpoint" label2_value="/foo/bar" \
      metric_type="counter" \
      prometheus_remote_write \
      http_requests $write_at $value \
      true "Expected request to succeed" \
      200 "Expected request to return status code 200"
    write_at=$(( $write_at + $resolution_seconds ))
    value=$(( $value + $value_inc_by ))
  done

  # Emit values for endpoint /foo/baz (to ensure right values aggregated)
  write_at="$now_truncated"
  value="84"
  value_rate="4"
  value_inc_by=$(( $value_rate * $resolution_seconds ))
  for i in $(seq 1 10); do
    label0_name="app" label0_value="nginx_edge" \
      label1_name="status_code" label1_value="500" \
      label2_name="endpoint" label2_value="/foo/baz" \
      metric_type="gauge" \
      prometheus_remote_write \
      http_requests $write_at $value \
      true "Expected request to succeed" \
      200 "Expected request to return status code 200"
    write_at=$(( $write_at + $resolution_seconds ))
    value=$(( $value + $value_inc_by ))
  done

  start=$(( $now - 3600 ))
  end=$(( $now + 3600 ))
  step="30s"
  params_range="start=${start}"'&'"end=${end}"'&'"step=30s"
  jq_path=".data.result[0].values | .[][1] | select(. != null)"

  echo "Test query rollup rule"

  # Test by values are rolled up by second, then sum (for endpoint="/foo/bar")
  ATTEMPTS=50 TIMEOUT=2 MAX_TIMEOUT=4 \
    endpoint=query_range query="http_requests_by_status_code\{endpoint=\"/foo/bar\"\}" \
    params="$params_range" \
    jq_path="$jq_path" expected_value="22" \
    metrics_type="aggregated" metrics_storage_policy="10s:6h" \
    retry_with_backoff prometheus_query_native

  # Test by values are rolled up by second, then sum (for endpoint="/foo/baz")
  ATTEMPTS=50 TIMEOUT=2 MAX_TIMEOUT=4 \
    endpoint=query_range query="http_requests_by_status_code\{endpoint=\"/foo/baz\"\}" \
    params="$params_range" \
    jq_path="$jq_path" expected_value="4" \
    metrics_type="aggregated" metrics_storage_policy="10s:6h" \
    retry_with_backoff prometheus_query_native
}

function test_metric_type_survives_aggregation {
  echo "Test metric type should be kept after aggregation"
  now=$(date +"%s")
  value="42"

  metric_type="counter" \
    prometheus_remote_write \
    metric_type_test $now $value \
    true "Expected request to succeed" \
    200 "Expected request to return status code 200"

  start=$(( $now - 3600 ))
  end=$(( $now + 3600 ))
  jq_path=".datapoints[0].annotation"

  echo "Test query metric type"

  # Test if metric type is stored in aggregated namespace
  # "CAEQAQ==" is the protobuf encoded base64 value with the metric type on it
  ATTEMPTS=50 TIMEOUT=2 MAX_TIMEOUT=4 \
    namespace="agg" \
    id='{__name__=\"metric_type_test\",label0=\"label0\",label1=\"label1\",label2=\"label2\"}' \
    rangeStart=${start} \
    rangeEnd=${end} \
    jq_path="$jq_path" expected_value="CAEQAQ==" \
    retry_with_backoff dbnode_fetch

  # Additional test to ensure correct value is stored.
  # NB: this previously never ran (the retry_with_backoff invocation was
  # missing, leaving only bare variable assignments) and expected 45 even
  # though the single datapoint written above has value 42.
  ATTEMPTS=5 TIMEOUT=2 MAX_TIMEOUT=4 \
    namespace="agg" \
    id='{__name__=\"metric_type_test\",label0=\"label0\",label1=\"label1\",label2=\"label2\"}' \
    rangeStart=${start} \
    rangeEnd=${end} \
    jq_path=".datapoints[0].value" expected_value="42" \
    retry_with_backoff dbnode_fetch
}

echo "Run tests"
test_aggregated_graphite_metric
test_aggregated_rollup_rule
test_metric_type_survives_aggregation