github.com/m3db/m3@v1.5.0/src/aggregator/integration/multi_server_resend_test.go

// +build integration

// Copyright (c) 2018 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package integration

import (
	"math"
	"sort"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"

	"github.com/m3db/m3/src/aggregator/aggregation"
	aggclient "github.com/m3db/m3/src/aggregator/client"
	"github.com/m3db/m3/src/cluster/kv"
	memcluster "github.com/m3db/m3/src/cluster/mem"
	"github.com/m3db/m3/src/cluster/placement"
	maggregation "github.com/m3db/m3/src/metrics/aggregation"
	"github.com/m3db/m3/src/metrics/metadata"
	"github.com/m3db/m3/src/metrics/metric"
	"github.com/m3db/m3/src/metrics/metric/aggregated"
	"github.com/m3db/m3/src/metrics/pipeline"
	"github.com/m3db/m3/src/metrics/pipeline/applied"
	"github.com/m3db/m3/src/metrics/policy"
	"github.com/m3db/m3/src/metrics/transformation"
	"github.com/m3db/m3/src/x/instrument"
	xtest "github.com/m3db/m3/src/x/test"
	xtime "github.com/m3db/m3/src/x/time"
)

//nolint:dupl
func TestMultiServerResendAggregatedValues(t *testing.T) {
	if testing.Short() {
		t.SkipNow()
	}

	aggregatorClientType, err := getAggregatorClientTypeFromEnv()
	require.NoError(t, err)

	// Clock setup.
	clock := newTestClock(time.Now().Truncate(time.Hour))

	// Placement setup.
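	// Four instances form two shard sets over 1024 total shards with a replica
	// factor of 2: servers 0 and 1 own shards [0, 512) and servers 2 and 3 own
	// shards [512, 1024). The sharding function defined below routes raw metrics
	// to shard 0 (shard set 1) and the rollup metric to the last shard (shard set 2).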
	var (
		numTotalShards = 1024
		placementKey   = "/placement"
	)
	multiServerSetup := []struct {
		rawTCPAddr     string
		httpAddr       string
		m3MsgAddr      string
		instanceConfig placementInstanceConfig
	}{
		{
			rawTCPAddr: "localhost:6000",
			httpAddr:   "localhost:16000",
			m3MsgAddr:  "localhost:26000",
			instanceConfig: placementInstanceConfig{
				instanceID:          "localhost:6000",
				shardSetID:          1,
				shardStartInclusive: 0,
				shardEndExclusive:   512,
			},
		},
		{
			rawTCPAddr: "localhost:6001",
			httpAddr:   "localhost:16001",
			m3MsgAddr:  "localhost:26001",
			instanceConfig: placementInstanceConfig{
				instanceID:          "localhost:6001",
				shardSetID:          1,
				shardStartInclusive: 0,
				shardEndExclusive:   512,
			},
		},
		{
			rawTCPAddr: "localhost:6002",
			httpAddr:   "localhost:16002",
			m3MsgAddr:  "localhost:26002",
			instanceConfig: placementInstanceConfig{
				instanceID:          "localhost:6002",
				shardSetID:          2,
				shardStartInclusive: 512,
				shardEndExclusive:   1024,
			},
		},
		{
			rawTCPAddr: "localhost:6003",
			httpAddr:   "localhost:16003",
			m3MsgAddr:  "localhost:26003",
			instanceConfig: placementInstanceConfig{
				instanceID:          "localhost:6003",
				shardSetID:          2,
				shardStartInclusive: 512,
				shardEndExclusive:   1024,
			},
		},
	}

	for i, mss := range multiServerSetup {
		multiServerSetup[i].instanceConfig.instanceID = mss.rawTCPAddr
		if aggregatorClientType == aggclient.M3MsgAggregatorClient {
			multiServerSetup[i].instanceConfig.instanceID = mss.m3MsgAddr
		}
	}

	clusterClient := memcluster.New(kv.NewOverrideOptions())
	instances := make([]placement.Instance, 0, len(multiServerSetup))
	for _, mss := range multiServerSetup {
		instance := mss.instanceConfig.newPlacementInstance()
		instances = append(instances, instance)
	}
	initPlacement := newPlacement(numTotalShards, instances).SetReplicaFactor(2)
	setPlacement(t, placementKey, clusterClient, initPlacement)
	topicService, err := initializeTopic(defaultTopicName, clusterClient, numTotalShards)
	require.NoError(t, err)

	// Election cluster setup.
	electionCluster := newTestCluster(t)

	// Sharding function maps all metrics to shard 0 except for the rollup metric,
	// which gets mapped to the last shard.
	pipelineRollupID := "pipelineRollup"
	shardFn := func(id []byte, numShards uint32) uint32 {
		if pipelineRollupID == string(id) {
			return numShards - 1
		}
		return 0
	}

	// Admin client connection options setup.
	connectionOpts := aggclient.NewConnectionOptions().
		SetInitReconnectThreshold(1).
		SetMaxReconnectThreshold(1).
		SetMaxReconnectDuration(2 * time.Second).
		SetWriteTimeout(time.Second)

	// Create servers.
	servers := make(testServerSetups, 0, len(multiServerSetup))
	for _, mss := range multiServerSetup {
		instrumentOpts := instrument.NewOptions()
		logger := instrumentOpts.Logger().With(
			zap.String("serverAddr", mss.rawTCPAddr),
		)
		instrumentOpts = instrumentOpts.SetLogger(logger)
		serverOpts := newTestServerOptions(t).
			SetBufferForPastTimedMetric(time.Minute).
			SetClockOptions(clock.Options()).
			SetInstrumentOptions(instrumentOpts).
			SetElectionCluster(electionCluster).
			SetHTTPAddr(mss.httpAddr).
			SetRawTCPAddr(mss.rawTCPAddr).
			SetM3MsgAddr(mss.m3MsgAddr).
			SetInstanceID(mss.instanceConfig.instanceID).
			SetClusterClient(clusterClient).
			SetTopicService(topicService).
			SetTopicName(defaultTopicName).
			SetShardFn(shardFn).
			SetShardSetID(mss.instanceConfig.shardSetID).
			SetClientConnectionOptions(connectionOpts).
			SetDiscardNaNAggregatedValues(false)
		server := newTestServerSetup(t, serverOpts)
		servers = append(servers, server)
	}

	// Start the servers.
	log := xtest.NewLogger(t)
	log.Info("test forwarding pipeline")
	for i, server := range servers {
		require.NoError(t, server.startServer())
		log.Sugar().Infof("server %d is now up", i)
	}

	// Create a client for writing to the servers.
	client := servers.newClient(t)
	require.NoError(t, client.connect())

	// Wait for two leaders to come up.
	var (
		leaders    = make(map[int]struct{})
		leaderCh   = make(chan int, len(servers)/2)
		numLeaders int32
		wg         sync.WaitGroup
	)
	wg.Add(len(servers) / 2)
	for i, server := range servers {
		i, server := i, server
		go func() {
			if err := server.waitUntilLeader(); err == nil {
				res := int(atomic.AddInt32(&numLeaders, 1))
				if res <= len(servers)/2 {
					leaderCh <- i
					wg.Done()
				}
			}
		}()
	}
	wg.Wait()
	close(leaderCh)

	for i := range leaderCh {
		leaders[i] = struct{}{}
		log.Sugar().Infof("server %d has become the leader", i)
	}
	log.Sugar().Infof("%d servers have become leaders", len(leaders))

	var (
		idPrefix        = "foo"
		numIDs          = 2
		start           = clock.Now()
		stop            = start.Add(12 * time.Second)
		interval        = time.Second
		storagePolicies = policy.StoragePolicies{
			policy.NewStoragePolicy(2*time.Second, xtime.Second, time.Hour),
			policy.NewStoragePolicy(4*time.Second, xtime.Second, 24*time.Hour),
		}
	)

	ids := generateTestIDs(idPrefix, numIDs)
	stagedMetadatas := metadata.StagedMetadatas{
		{
			CutoverNanos: 0,
			Tombstoned:   false,
			Metadata: metadata.Metadata{
				Pipelines: []metadata.PipelineMetadata{
					{
						AggregationID:   maggregation.DefaultID,
						StoragePolicies: storagePolicies,
						ResendEnabled:   true,
						Pipeline: applied.NewPipeline([]applied.OpUnion{
							{
								Type:           pipeline.TransformationOpType,
								Transformation: pipeline.TransformationOp{Type: transformation.Increase},
							},
							{
								Type: pipeline.RollupOpType,
								Rollup: applied.RollupOp{
									ID:            []byte(pipelineRollupID),
									AggregationID: maggregation.MustCompressTypes(maggregation.Sum),
								},
							},
							{
								Type:           pipeline.TransformationOpType,
								Transformation: pipeline.TransformationOp{Type: transformation.Reset},
							},
						}),
					},
				},
			},
		},
	}
	metricTypeFn := constantMetricTypeFnFactory(metric.GaugeType)
	genOpts := valueGenOpts{
		untimed: untimedValueGenOpts{
			gaugeValueGenFn: func(intervalIdx, idIdx int) float64 {
				// Each gauge will have two datapoints within the same aggregation window.
				// The first value is 0.0 and should be ignored; the second value is used
				// for computing the `Increase` value, resulting in an `Increase` value of 2
				// that is then forwarded to the next aggregation server.
				if intervalIdx%2 == 0 {
					return 0.0
				}
				return float64(intervalIdx + 1)
			},
		},
	}
	metadataFn := func(idx int) metadataUnion {
		return metadataUnion{
			mType:           stagedMetadatasType,
			stagedMetadatas: stagedMetadatas,
		}
	}
	// 2 metrics (foo0 and foo1).
	// 12 datapoints per metric.
	// 1 datapoint every second, for an interval of 12 seconds.
	// Alternates between 0 and a value; 2 datapoints per aggregation window (2s).
	// Values per metric: (0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12).
	dataset := mustGenerateTestDataset(t, datasetGenOpts{
		start:        start,
		stop:         stop,
		interval:     interval,
		ids:          ids,
		category:     untimedMetric,
		typeFn:       metricTypeFn,
		valueGenOpts: genOpts,
		metadataFn:   metadataFn,
	})

	for i, data := range dataset {
		if i == 3 {
			// Send this datapoint later.
			continue
		}

		for _, mm := range data.metricWithMetadatas {
			require.NoError(t, client.writeUntimedMetricWithMetadatas(mm.metric.untimed, mm.metadata.stagedMetadatas))
		}
		require.NoError(t, client.flush())

		// Give the servers some time to process the incoming packets.
		time.Sleep(time.Second)
	}

	// Move time forward using the larger resolution and wait for flushing to happen
	// at the originating server (where the raw metrics are aggregated).
	orgServerflushTime := stop.Add(2 * storagePolicies[1].Resolution().Window)
	for currTime := stop; !currTime.After(orgServerflushTime); currTime = currTime.Add(time.Second) {
		clock.SetNow(currTime)
		time.Sleep(time.Second)
	}

	// Move time forward using the larger resolution again and wait for flushing to
	// happen at the destination server (where the rollup metrics are aggregated).
	dstServerflushTime := orgServerflushTime.Add(2 * storagePolicies[1].Resolution().Window)
	for currTime := orgServerflushTime; !currTime.After(dstServerflushTime); currTime = currTime.Add(time.Second) {
		clock.SetNow(currTime)
		time.Sleep(time.Second)
	}

	// Send the held-back datapoint late.
	data := dataset[3]
	for _, mm := range data.metricWithMetadatas {
		require.NoError(t, client.writeUntimedMetricWithMetadatas(mm.metric.untimed, mm.metadata.stagedMetadatas))
	}
	require.NoError(t, client.flush())

	// Give the servers some time to process the incoming packets.
	time.Sleep(time.Second)

	// Flush the late raw metrics.
	flushTime := dstServerflushTime.Add(2 * storagePolicies[1].Resolution().Window)
	clock.SetNow(flushTime)
	time.Sleep(time.Second)

	// Flush the late aggregated metrics.
	flushTime = flushTime.Add(2 * storagePolicies[1].Resolution().Window)
	clock.SetNow(flushTime)
	time.Sleep(time.Second)

	// Remove all the topic consumers before closing clients and servers. This allows
	// closing the connections between servers while they are still running. Otherwise,
	// during server shutdown, the yet-to-be-closed servers would repeatedly try to
	// reconnect to recently closed ones, which results in longer shutdown times.
	require.NoError(t, removeAllTopicConsumers(topicService, defaultTopicName))

	// Stop the client.
	require.NoError(t, client.close())

	// Stop the servers.
	for i, server := range servers {
		require.NoError(t, server.stopServer())
		log.Sugar().Infof("server %d is now down", i)
	}

	// Validate results.
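	// The rollup metric hashes to the last shard, which is owned by shard set 2
	// (servers 2 and 3), so the forwarded rollup aggregations live on whichever of
	// those two instances won the leader election.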
	var destinationServer *testServerSetup
	if _, exists := leaders[2]; exists {
		destinationServer = servers[2]
	} else if _, exists = leaders[3]; exists {
		destinationServer = servers[3]
	} else {
		require.Fail(t, "one of server 2 or server 3 must be the leader")
	}

	aggregatorOpts := destinationServer.aggregatorOpts
	expectedMetricKeyList := []metricKey{
		{
			category:      forwardedMetric,
			typ:           metric.GaugeType,
			id:            pipelineRollupID,
			storagePolicy: storagePolicies[0],
		},
		{
			category:      forwardedMetric,
			typ:           metric.GaugeType,
			id:            pipelineRollupID,
			storagePolicy: storagePolicies[1],
		},
	}
	// Expected results, one entry per storage policy.
	expectedValuesByTimeList := []valuesByTime{
		make(valuesByTime),
		make(valuesByTime),
	}
	// Expected values per storage policy.
	expectedValuesList := [][]float64{
		{
			4,
			4,
			4,
			4,
			4,
			4,
		},
		{
			8,
			8,
			8,
		},
	}
	for spIdx := 0; spIdx < len(storagePolicies); spIdx++ {
		storagePolicy := storagePolicies[spIdx]
		for i := 0; i < len(expectedValuesList[spIdx]); i++ {
			if math.IsNaN(expectedValuesList[spIdx][i]) {
				continue
			}
			currTime := start.Add(time.Duration(i+1) * storagePolicy.Resolution().Window)
			instrumentOpts := aggregatorOpts.InstrumentOptions()
			agg := aggregation.NewGauge(aggregation.NewOptions(instrumentOpts))
			expectedAnnotation := generateAnnotation(metric.GaugeType, numIDs-1)
			agg.Update(time.Now(), expectedValuesList[spIdx][i], expectedAnnotation)
			expectedValuesByTimeList[spIdx][currTime.UnixNano()] = agg
			zero := aggregation.NewGauge(aggregation.NewOptions(instrumentOpts))
			zero.Update(time.Now(), 0.0, expectedAnnotation)
			resetTime := currTime.UnixNano() + int64(storagePolicy.Resolution().Window/2)
			expectedValuesByTimeList[spIdx][resetTime] = zero
		}
	}

	var expectedResultsFlattened []aggregated.MetricWithStoragePolicy
	for i := 0; i < len(storagePolicies); i++ {
		expectedDatapointsByID := datapointsByID{
			expectedMetricKeyList[i]: expectedValuesByTimeList[i],
		}
		expectedBuckets := []aggregationBucket{
			{
				key: aggregationKey{
					aggregationID: maggregation.MustCompressTypes(maggregation.Sum),
					storagePolicy: storagePolicies[i],
				},
				data: expectedDatapointsByID,
			},
		}
		expectedResults, err := computeExpectedAggregationOutput(
			dstServerflushTime,
			expectedBuckets,
			aggregatorOpts,
		)
		require.NoError(t, err)
		expectedResultsFlattened = append(expectedResultsFlattened, expectedResults...)
	}
	sort.Sort(byTimeIDPolicyAscending(expectedResultsFlattened))
	actual := destinationServer.sortedResults()
	if !cmp.Equal(expectedResultsFlattened, actual, testCmpOpts...) {
		require.Fail(t, "results differ", cmp.Diff(expectedResultsFlattened, actual, testCmpOpts...))
	}
}