github.com/m3db/m3@v1.5.0/src/aggregator/integration/multi_server_forwarding_pipeline_test.go

// +build integration

// Copyright (c) 2018 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package integration

import (
	"math"
	"sort"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/m3db/m3/src/aggregator/aggregation"
	aggclient "github.com/m3db/m3/src/aggregator/client"
	"github.com/m3db/m3/src/cluster/kv"
	memcluster "github.com/m3db/m3/src/cluster/mem"
	"github.com/m3db/m3/src/cluster/placement"
	maggregation "github.com/m3db/m3/src/metrics/aggregation"
	"github.com/m3db/m3/src/metrics/metadata"
	"github.com/m3db/m3/src/metrics/metric"
	"github.com/m3db/m3/src/metrics/metric/aggregated"
	"github.com/m3db/m3/src/metrics/pipeline"
	"github.com/m3db/m3/src/metrics/pipeline/applied"
	"github.com/m3db/m3/src/metrics/policy"
	"github.com/m3db/m3/src/metrics/transformation"
	"github.com/m3db/m3/src/x/instrument"
	xtest "github.com/m3db/m3/src/x/test"
	xtime "github.com/m3db/m3/src/x/time"

	"github.com/google/go-cmp/cmp"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"
)

func TestMultiServerForwardingPipelineKeepNaNAggregatedValues(t *testing.T) {
	testMultiServerForwardingPipeline(t, false)
}

func TestMultiServerForwardingPipelineDiscardNaNAggregatedValues(t *testing.T) {
	testMultiServerForwardingPipeline(t, true)
}

func testMultiServerForwardingPipeline(t *testing.T, discardNaNAggregatedValues bool) {
	if testing.Short() {
		t.SkipNow()
	}

	aggregatorClientType, err := getAggregatorClientTypeFromEnv()
	require.NoError(t, err)

	// Clock setup.
	clock := newTestClock(time.Now().Truncate(time.Hour))

	// Placement setup.
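	// Four instances are spread across two shard sets, two instances per
	// shard set. Combined with the replica factor of 2 set on the placement
	// below, each shard set ends up with one leader and one follower.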
	var (
		numTotalShards = 1024
		placementKey   = "/placement"
	)
	multiServerSetup := []struct {
		rawTCPAddr     string
		httpAddr       string
		m3MsgAddr      string
		instanceConfig placementInstanceConfig
	}{
		{
			rawTCPAddr: "localhost:6000",
			httpAddr:   "localhost:16000",
			m3MsgAddr:  "localhost:26000",
			instanceConfig: placementInstanceConfig{
				shardSetID:          1,
				shardStartInclusive: 0,
				shardEndExclusive:   512,
			},
		},
		{
			rawTCPAddr: "localhost:6001",
			httpAddr:   "localhost:16001",
			m3MsgAddr:  "localhost:26001",
			instanceConfig: placementInstanceConfig{
				shardSetID:          1,
				shardStartInclusive: 0,
				shardEndExclusive:   512,
			},
		},
		{
			rawTCPAddr: "localhost:6002",
			httpAddr:   "localhost:16002",
			m3MsgAddr:  "localhost:26002",
			instanceConfig: placementInstanceConfig{
				shardSetID:          2,
				shardStartInclusive: 512,
				shardEndExclusive:   1024,
			},
		},
		{
			rawTCPAddr: "localhost:6003",
			httpAddr:   "localhost:16003",
			m3MsgAddr:  "localhost:26003",
			instanceConfig: placementInstanceConfig{
				shardSetID:          2,
				shardStartInclusive: 512,
				shardEndExclusive:   1024,
			},
		},
	}

	for i, mss := range multiServerSetup {
		multiServerSetup[i].instanceConfig.instanceID = mss.rawTCPAddr
		if aggregatorClientType == aggclient.M3MsgAggregatorClient {
			multiServerSetup[i].instanceConfig.instanceID = mss.m3MsgAddr
		}
	}

	clusterClient := memcluster.New(kv.NewOverrideOptions())
	instances := make([]placement.Instance, 0, len(multiServerSetup))
	for _, mss := range multiServerSetup {
		instance := mss.instanceConfig.newPlacementInstance()
		instances = append(instances, instance)
	}
	initPlacement := newPlacement(numTotalShards, instances).SetReplicaFactor(2)
	setPlacement(t, placementKey, clusterClient, initPlacement)
	topicService, err := initializeTopic(defaultTopicName, clusterClient, numTotalShards)
	require.NoError(t, err)

	// Election cluster setup.
	electionCluster := newTestCluster(t)

	// Sharding function maps all metrics to shard 0 except for the rollup metric,
	// which gets mapped to the last shard.
	pipelineRollupID := "pipelineRollup"
	shardFn := func(id []byte, numShards uint32) uint32 {
		if pipelineRollupID == string(id) {
			return numShards - 1
		}
		return 0
	}

	// Admin client connection options setup.
	connectionOpts := aggclient.NewConnectionOptions().
		SetInitReconnectThreshold(1).
		SetMaxReconnectThreshold(1).
		SetMaxReconnectDuration(2 * time.Second).
		SetWriteTimeout(time.Second)
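
	// Note: given the sharding function above and the shard ranges in the
	// placement, every raw metric hashes to shard 0 (shard set 1, servers 0
	// and 1), while the rollup metric lands on shard 1023 (shard set 2,
	// servers 2 and 3). The leader of shard set 2 is therefore the
	// destination server whose results are validated at the end of the test.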

	// Create servers.
	servers := make(testServerSetups, 0, len(multiServerSetup))
	for _, mss := range multiServerSetup {
		instrumentOpts := instrument.NewOptions()
		logger := instrumentOpts.Logger().With(
			zap.String("serverAddr", mss.rawTCPAddr),
		)
		instrumentOpts = instrumentOpts.SetLogger(logger)
		serverOpts := newTestServerOptions(t).
			SetClockOptions(clock.Options()).
			SetInstrumentOptions(instrumentOpts).
			SetElectionCluster(electionCluster).
			SetRawTCPAddr(mss.rawTCPAddr).
			SetHTTPAddr(mss.httpAddr).
			SetM3MsgAddr(mss.m3MsgAddr).
			SetInstanceID(mss.instanceConfig.instanceID).
			SetClusterClient(clusterClient).
			SetTopicService(topicService).
			SetTopicName(defaultTopicName).
			SetShardFn(shardFn).
			SetShardSetID(mss.instanceConfig.shardSetID).
			SetClientConnectionOptions(connectionOpts).
			SetDiscardNaNAggregatedValues(discardNaNAggregatedValues)
		server := newTestServerSetup(t, serverOpts)
		servers = append(servers, server)
	}

	// Start the servers.
	log := xtest.NewLogger(t)
	log.Info("test forwarding pipeline")
	for i, server := range servers {
		require.NoError(t, server.startServer())
		log.Sugar().Infof("server %d is now up", i)
	}

	// Create a client for writing to the servers.
	client := servers.newClient(t)
	require.NoError(t, client.connect())

	// Wait for two leaders to come up.
	var (
		leaders    = make(map[int]struct{})
		leaderCh   = make(chan int, len(servers)/2)
		numLeaders int32
		wg         sync.WaitGroup
	)
	wg.Add(len(servers) / 2)
	for i, server := range servers {
		i, server := i, server
		go func() {
			if err := server.waitUntilLeader(); err == nil {
				res := int(atomic.AddInt32(&numLeaders, 1))
				if res <= len(servers)/2 {
					leaderCh <- i
					wg.Done()
				}
			}
		}()
	}
	wg.Wait()
	close(leaderCh)

	for i := range leaderCh {
		leaders[i] = struct{}{}
		log.Sugar().Infof("server %d has become the leader", i)
	}
	log.Sugar().Infof("%d servers have become leaders", len(leaders))
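
	// The test dataset: a pair of gauges written once per second for 12
	// seconds. Each gauge is piped through a PerSecond transformation and
	// then rolled up (Sum) into a single forwarded metric, aggregated under
	// both a 2s:1h and a 4s:24h storage policy.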
	var (
		idPrefix        = "foo"
		numIDs          = 2
		start           = clock.Now()
		stop            = start.Add(12 * time.Second)
		interval        = time.Second
		storagePolicies = policy.StoragePolicies{
			policy.NewStoragePolicy(2*time.Second, xtime.Second, time.Hour),
			policy.NewStoragePolicy(4*time.Second, xtime.Second, 24*time.Hour),
		}
	)

	ids := generateTestIDs(idPrefix, numIDs)
	stagedMetadatas := metadata.StagedMetadatas{
		{
			CutoverNanos: 0,
			Tombstoned:   false,
			Metadata: metadata.Metadata{
				Pipelines: []metadata.PipelineMetadata{
					{
						AggregationID:   maggregation.DefaultID,
						StoragePolicies: storagePolicies,
						Pipeline: applied.NewPipeline([]applied.OpUnion{
							{
								Type:           pipeline.TransformationOpType,
								Transformation: pipeline.TransformationOp{Type: transformation.PerSecond},
							},
							{
								Type: pipeline.RollupOpType,
								Rollup: applied.RollupOp{
									ID:            []byte(pipelineRollupID),
									AggregationID: maggregation.MustCompressTypes(maggregation.Sum),
								},
							},
						}),
					},
				},
			},
		},
	}
	metricTypeFn := constantMetricTypeFnFactory(metric.GaugeType)
	valueGenOpts := valueGenOpts{
		untimed: untimedValueGenOpts{
			gaugeValueGenFn: func(intervalIdx, idIdx int) float64 {
				// Each gauge has two datapoints within the same aggregation window.
				// The first value is 0.0 and should be ignored; the second value is
				// used to compute the `PerSecond` value, which should come out to 1
				// and is then forwarded to the next aggregation server.
				if intervalIdx%2 == 0 {
					return 0.0
				}
				return float64(intervalIdx + idIdx)
			},
		},
	}
	metadataFn := func(idx int) metadataUnion {
		return metadataUnion{
			mType:           stagedMetadatasType,
			stagedMetadatas: stagedMetadatas,
		}
	}
	dataset := mustGenerateTestDataset(t, datasetGenOpts{
		start:        start,
		stop:         stop,
		interval:     interval,
		ids:          ids,
		category:     untimedMetric,
		typeFn:       metricTypeFn,
		valueGenOpts: valueGenOpts,
		metadataFn:   metadataFn,
	})

	for _, data := range dataset {
		clock.SetNow(data.timestamp)

		for _, mm := range data.metricWithMetadatas {
			require.NoError(t, client.writeUntimedMetricWithMetadatas(mm.metric.untimed, mm.metadata.stagedMetadatas))
		}
		require.NoError(t, client.flush())

		// Give the servers some time to process the incoming packets.
		time.Sleep(time.Second)
	}

	// Move time forward using the larger resolution and wait for flushing to
	// happen at the originating server (where the raw metrics are aggregated).
	originatingServerFlushTime := stop.Add(2 * storagePolicies[1].Resolution().Window)
	for currTime := stop; !currTime.After(originatingServerFlushTime); currTime = currTime.Add(time.Second) {
		clock.SetNow(currTime)
		time.Sleep(time.Second)
	}

	// Move time forward using the larger resolution again and wait for flushing
	// to happen at the destination server (where the rollup metrics are aggregated).
	destinationServerFlushTime := originatingServerFlushTime.Add(2 * storagePolicies[1].Resolution().Window)
	for currTime := originatingServerFlushTime; !currTime.After(destinationServerFlushTime); currTime = currTime.Add(time.Second) {
		clock.SetNow(currTime)
		time.Sleep(time.Second)
	}

	// Remove all the topic consumers before closing clients and servers. This
	// lets the connections between servers be closed while the servers are
	// still running. Otherwise, during server shutdown, the yet-to-be-closed
	// servers would repeatedly try to reconnect to recently closed ones,
	// resulting in longer shutdown times.
	require.NoError(t, removeAllTopicConsumers(topicService, defaultTopicName))

	// Stop the client.
	require.NoError(t, client.close())

	// Stop the servers.
	for i, server := range servers {
		require.NoError(t, server.stopServer())
		log.Sugar().Infof("server %d is now down", i)
	}

	// Validate results.
	var destinationServer *testServerSetup
	if _, exists := leaders[2]; exists {
		destinationServer = servers[2]
	} else if _, exists = leaders[3]; exists {
		destinationServer = servers[3]
	} else {
		require.Fail(t, "either server 2 or server 3 must be a leader")
	}

	aggregatorOpts := destinationServer.aggregatorOpts
	expectedMetricKeyList := []metricKey{
		{
			category:      forwardedMetric,
			typ:           metric.GaugeType,
			id:            pipelineRollupID,
			storagePolicy: storagePolicies[0],
		},
		{
			category:      forwardedMetric,
			typ:           metric.GaugeType,
			id:            pipelineRollupID,
			storagePolicy: storagePolicies[1],
		},
	}
	// Expected values for the 2s:1h and 4s:24h storage policies, respectively.
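	// The first value under each policy is NaN: the PerSecond transformation
	// has no prior datapoint in the very first window, so its first emission
	// is NaN. When discardNaNAggregatedValues is set, those NaNs are skipped
	// in the loop below and excluded from the expected output.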
	expectedValuesByTimeList := []valuesByTime{
		make(valuesByTime),
		make(valuesByTime),
	}
	expectedValuesList := [][]float64{
		{
			math.NaN(),
			float64(numIDs),
			float64(numIDs),
			float64(numIDs),
			float64(numIDs),
			float64(numIDs),
		},
		{
			math.NaN(),
			float64(numIDs),
			float64(numIDs),
		},
	}
	for spIdx := 0; spIdx < len(storagePolicies); spIdx++ {
		storagePolicy := storagePolicies[spIdx]
		for i := 0; i < len(expectedValuesList[spIdx]); i++ {
			if discardNaNAggregatedValues && math.IsNaN(expectedValuesList[spIdx][i]) {
				continue
			}
			currTime := start.Add(time.Duration(i+1) * storagePolicy.Resolution().Window)
			instrumentOpts := aggregatorOpts.InstrumentOptions()
			agg := aggregation.NewGauge(aggregation.NewOptions(instrumentOpts))
			expectedAnnotation := generateAnnotation(metric.GaugeType, numIDs-1)
			agg.Update(time.Now(), expectedValuesList[spIdx][i], expectedAnnotation)
			expectedValuesByTimeList[spIdx][currTime.UnixNano()] = agg
		}
	}

	var expectedResultsFlattened []aggregated.MetricWithStoragePolicy
	for i := 0; i < len(storagePolicies); i++ {
		expectedDatapointsByID := datapointsByID{
			expectedMetricKeyList[i]: expectedValuesByTimeList[i],
		}
		expectedBuckets := []aggregationBucket{
			{
				key: aggregationKey{
					aggregationID: maggregation.MustCompressTypes(maggregation.Sum),
					storagePolicy: storagePolicies[i],
				},
				data: expectedDatapointsByID,
			},
		}
		expectedResults, err := computeExpectedAggregationOutput(
			destinationServerFlushTime,
			expectedBuckets,
			aggregatorOpts,
		)
		require.NoError(t, err)
		expectedResultsFlattened = append(expectedResultsFlattened, expectedResults...)
	}
	sort.Sort(byTimeIDPolicyAscending(expectedResultsFlattened))
	actual := destinationServer.sortedResults()
	require.True(t, cmp.Equal(expectedResultsFlattened, actual, testCmpOpts...))
}
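
// The tests above are gated behind the `integration` build tag. A typical
// local invocation (a sketch; the aggregator client type is additionally
// selected via an environment variable read by getAggregatorClientTypeFromEnv):
//
//	go test -tags integration -run TestMultiServerForwardingPipeline ./src/aggregator/integration/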