github.com/m3db/m3@v1.5.0/src/aggregator/integration/multi_server_resend_test.go

     1  // +build integration
     2  
     3  // Copyright (c) 2018 Uber Technologies, Inc.
     4  //
     5  // Permission is hereby granted, free of charge, to any person obtaining a copy
     6  // of this software and associated documentation files (the "Software"), to deal
     7  // in the Software without restriction, including without limitation the rights
     8  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     9  // copies of the Software, and to permit persons to whom the Software is
    10  // furnished to do so, subject to the following conditions:
    11  //
    12  // The above copyright notice and this permission notice shall be included in
    13  // all copies or substantial portions of the Software.
    14  //
    15  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    16  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    17  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    18  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    19  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    20  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    21  // THE SOFTWARE.
    22  
    23  package integration
    24  
    25  import (
    26  	"math"
    27  	"sort"
    28  	"sync"
    29  	"sync/atomic"
    30  	"testing"
    31  	"time"
    32  
    33  	"github.com/google/go-cmp/cmp"
    34  	"github.com/stretchr/testify/require"
    35  	"go.uber.org/zap"
    36  
    37  	"github.com/m3db/m3/src/aggregator/aggregation"
    38  	aggclient "github.com/m3db/m3/src/aggregator/client"
    39  	"github.com/m3db/m3/src/cluster/kv"
    40  	memcluster "github.com/m3db/m3/src/cluster/mem"
    41  	"github.com/m3db/m3/src/cluster/placement"
    42  	maggregation "github.com/m3db/m3/src/metrics/aggregation"
    43  	"github.com/m3db/m3/src/metrics/metadata"
    44  	"github.com/m3db/m3/src/metrics/metric"
    45  	"github.com/m3db/m3/src/metrics/metric/aggregated"
    46  	"github.com/m3db/m3/src/metrics/pipeline"
    47  	"github.com/m3db/m3/src/metrics/pipeline/applied"
    48  	"github.com/m3db/m3/src/metrics/policy"
    49  	"github.com/m3db/m3/src/metrics/transformation"
    50  	"github.com/m3db/m3/src/x/instrument"
    51  	xtest "github.com/m3db/m3/src/x/test"
    52  	xtime "github.com/m3db/m3/src/x/time"
    53  )
    54  
    55  //nolint:dupl
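        // TestMultiServerResendAggregatedValues runs four aggregator servers (two shard sets with a
        // replica factor of 2), writes untimed gauges through an Increase -> Rollup(Sum) -> Reset
        // pipeline with ResendEnabled, holds back one datapoint until after the initial flushes, and
        // then validates the rollup results flushed by the destination leader.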
    56  func TestMultiServerResendAggregatedValues(t *testing.T) {
    57  	if testing.Short() {
    58  		t.SkipNow()
    59  	}
    60  
    61  	aggregatorClientType, err := getAggregatorClientTypeFromEnv()
    62  	require.NoError(t, err)
    63  
    64  	// Clock setup.
    65  	clock := newTestClock(time.Now().Truncate(time.Hour))
    66  
    67  	// Placement setup.
    68  	var (
    69  		numTotalShards = 1024
    70  		placementKey   = "/placement"
    71  	)
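        	// Four instances across two shard sets: the first pair replicates shards [0, 512) and
        	// the second pair replicates shards [512, 1024). With a replica factor of 2, each shard
        	// set ends up with one leader and one follower.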
    72  	multiServerSetup := []struct {
    73  		rawTCPAddr     string
    74  		httpAddr       string
    75  		m3MsgAddr      string
    76  		instanceConfig placementInstanceConfig
    77  	}{
    78  		{
    79  			rawTCPAddr: "localhost:6000",
    80  			httpAddr:   "localhost:16000",
    81  			m3MsgAddr:  "localhost:26000",
    82  			instanceConfig: placementInstanceConfig{
    83  				instanceID:          "localhost:6000",
    84  				shardSetID:          1,
    85  				shardStartInclusive: 0,
    86  				shardEndExclusive:   512,
    87  			},
    88  		},
    89  		{
    90  			rawTCPAddr: "localhost:6001",
    91  			httpAddr:   "localhost:16001",
    92  			m3MsgAddr:  "localhost:26001",
    93  			instanceConfig: placementInstanceConfig{
    94  				instanceID:          "localhost:6001",
    95  				shardSetID:          1,
    96  				shardStartInclusive: 0,
    97  				shardEndExclusive:   512,
    98  			},
    99  		},
   100  		{
   101  			rawTCPAddr: "localhost:6002",
   102  			httpAddr:   "localhost:16002",
   103  			m3MsgAddr:  "localhost:26002",
   104  			instanceConfig: placementInstanceConfig{
   105  				instanceID:          "localhost:6002",
   106  				shardSetID:          2,
   107  				shardStartInclusive: 512,
   108  				shardEndExclusive:   1024,
   109  			},
   110  		},
   111  		{
   112  			rawTCPAddr: "localhost:6003",
   113  			httpAddr:   "localhost:16003",
   114  			m3MsgAddr:  "localhost:26003",
   115  			instanceConfig: placementInstanceConfig{
   116  				instanceID:          "localhost:6003",
   117  				shardSetID:          2,
   118  				shardStartInclusive: 512,
   119  				shardEndExclusive:   1024,
   120  			},
   121  		},
   122  	}
   123  
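        	// Key each placement instance by the address matching the aggregator client type under
        	// test: the raw TCP address by default, or the m3msg address for the M3Msg client.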
   124  	for i, mss := range multiServerSetup {
   125  		multiServerSetup[i].instanceConfig.instanceID = mss.rawTCPAddr
   126  		if aggregatorClientType == aggclient.M3MsgAggregatorClient {
   127  			multiServerSetup[i].instanceConfig.instanceID = mss.m3MsgAddr
   128  		}
   129  	}
   130  
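        	// All servers share an in-memory cluster client that holds the placement and the m3msg
        	// topic used to forward metrics between shard sets.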
   131  	clusterClient := memcluster.New(kv.NewOverrideOptions())
   132  	instances := make([]placement.Instance, 0, len(multiServerSetup))
   133  	for _, mss := range multiServerSetup {
   134  		instance := mss.instanceConfig.newPlacementInstance()
   135  		instances = append(instances, instance)
   136  	}
   137  	initPlacement := newPlacement(numTotalShards, instances).SetReplicaFactor(2)
   138  	setPlacement(t, placementKey, clusterClient, initPlacement)
   139  	topicService, err := initializeTopic(defaultTopicName, clusterClient, numTotalShards)
   140  	require.NoError(t, err)
   141  
   142  	// Election cluster setup.
   143  	electionCluster := newTestCluster(t)
   144  
   145  	// Sharding function maps all metrics to shard 0 except for the rollup metric,
   146  	// which gets mapped to the last shard.
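        	// Shard 0 belongs to shard set 1 (servers 0 and 1) and the last shard belongs to shard
        	// set 2 (servers 2 and 3), so raw metrics are aggregated on the first pair and the
        	// rollup is forwarded to and aggregated on the second pair.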
   147  	pipelineRollupID := "pipelineRollup"
   148  	shardFn := func(id []byte, numShards uint32) uint32 {
   149  		if pipelineRollupID == string(id) {
   150  			return numShards - 1
   151  		}
   152  		return 0
   153  	}
   154  
   155  	// Admin client connection options setup.
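        	// Low reconnect thresholds and a short write timeout keep connection failures from
        	// stalling the test for long.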
   156  	connectionOpts := aggclient.NewConnectionOptions().
   157  		SetInitReconnectThreshold(1).
   158  		SetMaxReconnectThreshold(1).
   159  		SetMaxReconnectDuration(2 * time.Second).
   160  		SetWriteTimeout(time.Second)
   161  
   162  	// Create servers.
   163  	servers := make(testServerSetups, 0, len(multiServerSetup))
   164  	for _, mss := range multiServerSetup {
   165  		instrumentOpts := instrument.NewOptions()
   166  		logger := instrumentOpts.Logger().With(
   167  			zap.String("serverAddr", mss.rawTCPAddr),
   168  		)
   169  		instrumentOpts = instrumentOpts.SetLogger(logger)
   170  		serverOpts := newTestServerOptions(t).
   171  			SetBufferForPastTimedMetric(time.Minute).
   172  			SetClockOptions(clock.Options()).
   173  			SetInstrumentOptions(instrumentOpts).
   174  			SetElectionCluster(electionCluster).
   175  			SetHTTPAddr(mss.httpAddr).
   176  			SetRawTCPAddr(mss.rawTCPAddr).
   177  			SetM3MsgAddr(mss.m3MsgAddr).
   178  			SetInstanceID(mss.instanceConfig.instanceID).
   179  			SetClusterClient(clusterClient).
   180  			SetTopicService(topicService).
   181  			SetTopicName(defaultTopicName).
   182  			SetShardFn(shardFn).
   183  			SetShardSetID(mss.instanceConfig.shardSetID).
   184  			SetClientConnectionOptions(connectionOpts).
   185  			SetDiscardNaNAggregatedValues(false)
   186  		server := newTestServerSetup(t, serverOpts)
   187  		servers = append(servers, server)
   188  	}
   189  
   190  	// Start the servers.
   191  	log := xtest.NewLogger(t)
   192  	log.Info("test multi-server resend of aggregated values")
   193  	for i, server := range servers {
   194  		require.NoError(t, server.startServer())
   195  		log.Sugar().Infof("server %d is now up", i)
   196  	}
   197  
   198  	// Create clients for writing to the servers.
   199  	client := servers.newClient(t)
   200  	require.NoError(t, client.connect())
   201  
   202  	// Wait for two leaders to come up, one per shard set.
   203  	var (
   204  		leaders    = make(map[int]struct{})
   205  		leaderCh   = make(chan int, len(servers)/2)
   206  		numLeaders int32
   207  		wg         sync.WaitGroup
   208  	)
   209  	wg.Add(len(servers) / 2)
   210  	for i, server := range servers {
   211  		i, server := i, server
   212  		go func() {
   213  			if err := server.waitUntilLeader(); err == nil {
   214  				res := int(atomic.AddInt32(&numLeaders, 1))
   215  				if res <= len(servers)/2 {
   216  					leaderCh <- i
   217  					wg.Done()
   218  				}
   219  			}
   220  		}()
   221  	}
   222  	wg.Wait()
   223  	close(leaderCh)
   224  
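        	// Record which server in each shard set won its election; the leader of the second
        	// shard set (server 2 or 3) is the one whose flushed rollup results are validated at
        	// the end of the test.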
   225  	for i := range leaderCh {
   226  		leaders[i] = struct{}{}
   227  		log.Sugar().Infof("server %d has become the leader", i)
   228  	}
   229  	log.Sugar().Infof("%d servers have become leaders", len(leaders))
   230  
   231  	var (
   232  		idPrefix        = "foo"
   233  		numIDs          = 2
   234  		start           = clock.Now()
   235  		stop            = start.Add(12 * time.Second)
   236  		interval        = time.Second
   237  		storagePolicies = policy.StoragePolicies{
   238  			policy.NewStoragePolicy(2*time.Second, xtime.Second, time.Hour),
   239  			policy.NewStoragePolicy(4*time.Second, xtime.Second, 24*time.Hour),
   240  		}
   241  	)
   242  
   243  	ids := generateTestIDs(idPrefix, numIDs)
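        	// All metrics share a single pipeline: Increase -> Rollup(Sum) into pipelineRollupID ->
        	// Reset, applied under both storage policies. ResendEnabled lets the late datapoint sent
        	// below update already-flushed windows so the corrected values are re-forwarded.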
   244  	stagedMetadatas := metadata.StagedMetadatas{
   245  		{
   246  			CutoverNanos: 0,
   247  			Tombstoned:   false,
   248  			Metadata: metadata.Metadata{
   249  				Pipelines: []metadata.PipelineMetadata{
   250  					{
   251  						AggregationID:   maggregation.DefaultID,
   252  						StoragePolicies: storagePolicies,
   253  						ResendEnabled:   true,
   254  						Pipeline: applied.NewPipeline([]applied.OpUnion{
   255  							{
   256  								Type:           pipeline.TransformationOpType,
   257  								Transformation: pipeline.TransformationOp{Type: transformation.Increase},
   258  							},
   259  							{
   260  								Type: pipeline.RollupOpType,
   261  								Rollup: applied.RollupOp{
   262  									ID:            []byte(pipelineRollupID),
   263  									AggregationID: maggregation.MustCompressTypes(maggregation.Sum),
   264  								},
   265  							},
   266  							{
   267  								Type:           pipeline.TransformationOpType,
   268  								Transformation: pipeline.TransformationOp{Type: transformation.Reset},
   269  							},
   270  						}),
   271  					},
   272  				},
   273  			},
   274  		},
   275  	}
   276  	metricTypeFn := constantMetricTypeFnFactory(metric.GaugeType)
   277  	genOpts := valueGenOpts{
   278  		untimed: untimedValueGenOpts{
   279  			gaugeValueGenFn: func(intervalIdx, idIdx int) float64 {
   280  			// Each gauge has two datapoints within the same aggregation window.
   281  			// The first value is 0.0 and is effectively ignored; the second value is
   282  			// used to compute the `Increase`, which comes out to 2 per window and is
   283  			// then forwarded to the next aggregation server.
   284  				if intervalIdx%2 == 0 {
   285  					return 0.0
   286  				}
   287  				return float64(intervalIdx + 1)
   288  			},
   289  		},
   290  	}
   291  	metadataFn := func(idx int) metadataUnion {
   292  		return metadataUnion{
   293  			mType:           stagedMetadatasType,
   294  			stagedMetadatas: stagedMetadatas,
   295  		}
   296  	}
   297  	// 2 metrics (foo0 and foo1).
   298  	// 12 datapoints per metric.
   299  	// 1 datapoint every second, over a 12-second interval.
   300  	// Values alternate between 0 and a non-zero value, i.e. 2 datapoints per 2s aggregation window.
   301  	// Values per metric: (0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12).
   302  	dataset := mustGenerateTestDataset(t, datasetGenOpts{
   303  		start:        start,
   304  		stop:         stop,
   305  		interval:     interval,
   306  		ids:          ids,
   307  		category:     untimedMetric,
   308  		typeFn:       metricTypeFn,
   309  		valueGenOpts: genOpts,
   310  		metadataFn:   metadataFn,
   311  	})
   312  
   313  	for i, data := range dataset {
   314  		if i == 3 {
   315  			// Skip for now; this datapoint is sent late further below to exercise the resend path.
   316  			continue
   317  		}
   318  
   319  		for _, mm := range data.metricWithMetadatas {
   320  			require.NoError(t, client.writeUntimedMetricWithMetadatas(mm.metric.untimed, mm.metadata.stagedMetadatas))
   321  		}
   322  		require.NoError(t, client.flush())
   323  
   324  		// Give server some time to process the incoming packets.
   325  		time.Sleep(time.Second)
   326  	}
   327  
   328  	// Move time forward using the larger resolution and wait for flushing to happen
   329  	// at the originating server (where the raw metrics are aggregated).
   330  	orgServerflushTime := stop.Add(2 * storagePolicies[1].Resolution().Window)
   331  	for currTime := stop; !currTime.After(orgServerflushTime); currTime = currTime.Add(time.Second) {
   332  		clock.SetNow(currTime)
   333  		time.Sleep(time.Second)
   334  	}
   335  
   336  	// Move time forward using the larger resolution again and wait for flushing to
   337  	// happen at the destination server (where the rollup metrics are aggregated).
   338  	dstServerflushTime := orgServerflushTime.Add(2 * storagePolicies[1].Resolution().Window)
   339  	for currTime := orgServerflushTime; !currTime.After(dstServerflushTime); currTime = currTime.Add(time.Second) {
   340  		clock.SetNow(currTime)
   341  		time.Sleep(time.Second)
   342  	}
   343  
   344  	// Send the held-back datapoint (index 3) late, after the initial flushes.
   345  	data := dataset[3]
   346  	for _, mm := range data.metricWithMetadatas {
   347  		require.NoError(t, client.writeUntimedMetricWithMetadatas(mm.metric.untimed, mm.metadata.stagedMetadatas))
   348  	}
   349  	require.NoError(t, client.flush())
   350  
   351  	// Give server some time to process the incoming packets.
   352  	time.Sleep(time.Second)
   353  
   354  	// Flush the late raw metrics
   355  	flushTime := dstServerflushTime.Add(2 * storagePolicies[1].Resolution().Window)
   356  	clock.SetNow(flushTime)
   357  	time.Sleep(time.Second)
   358  
   359  	// Flush the late aggregated metrics
   360  	flushTime = flushTime.Add(2 * storagePolicies[1].Resolution().Window)
   361  	clock.SetNow(flushTime)
   362  	time.Sleep(time.Second)
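        	// At this point the late datapoint has been folded into the previously flushed windows
        	// on the originating servers, and the updated rollup has been re-forwarded and
        	// re-flushed on the destination servers, which is the resend behavior under test.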
   363  
   364  	// Remove all the topic consumers before closing clients and servers. This allows the
   365  	// connections between servers to be closed while they are still running. Otherwise, during
   366  	// server shutdown, the yet-to-be-closed servers would repeatedly try to reconnect to
   367  	// recently closed ones, which results in longer shutdown times.
   368  	require.NoError(t, removeAllTopicConsumers(topicService, defaultTopicName))
   369  
   370  	// Stop the client.
   371  	require.NoError(t, client.close())
   372  
   373  	// Stop the servers.
   374  	for i, server := range servers {
   375  		require.NoError(t, server.stopServer())
   376  		log.Sugar().Infof("server %d is now down", i)
   377  	}
   378  
   379  	// Validate results.
   380  	var destinationServer *testServerSetup
   381  	if _, exists := leaders[2]; exists {
   382  		destinationServer = servers[2]
   383  	} else if _, exists = leaders[3]; exists {
   384  		destinationServer = servers[3]
   385  	} else {
   386  		require.Fail(t, "either server 2 or server 3 must be a leader")
   387  	}
   388  
   389  	aggregatorOpts := destinationServer.aggregatorOpts
   390  	expectedMetricKeyList := []metricKey{
   391  		{
   392  			category:      forwardedMetric,
   393  			typ:           metric.GaugeType,
   394  			id:            pipelineRollupID,
   395  			storagePolicy: storagePolicies[0],
   396  		},
   397  		{
   398  			category:      forwardedMetric,
   399  			typ:           metric.GaugeType,
   400  			id:            pipelineRollupID,
   401  			storagePolicy: storagePolicies[1],
   402  		},
   403  	}
   404  	// Expected values by time, one map per storage policy.
   405  	expectedValuesByTimeList := []valuesByTime{
   406  		make(valuesByTime),
   407  		make(valuesByTime),
   408  	}
   409  	// Expected values per storage policy.
   410  	expectedValuesList := [][]float64{
   411  		{
   412  			4,
   413  			4,
   414  			4,
   415  			4,
   416  			4,
   417  			4,
   418  		},
   419  		{
   420  			8,
   421  			8,
   422  			8,
   423  		},
   424  	}
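        	// Each 2s window sees an Increase of 2 per metric, and the two metrics roll up via Sum
        	// to 4; each 4s window sees an Increase of 4 per metric, rolling up to 8. The trailing
        	// Reset op is modeled below as a zero value half a resolution window after each value.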
   425  	for spIdx := 0; spIdx < len(storagePolicies); spIdx++ {
   426  		storagePolicy := storagePolicies[spIdx]
   427  		for i := 0; i < len(expectedValuesList[spIdx]); i++ {
   428  			if math.IsNaN(expectedValuesList[spIdx][i]) {
   429  				continue
   430  			}
   431  			currTime := start.Add(time.Duration(i+1) * storagePolicy.Resolution().Window)
   432  			instrumentOpts := aggregatorOpts.InstrumentOptions()
   433  			agg := aggregation.NewGauge(aggregation.NewOptions(instrumentOpts))
   434  			expectedAnnotation := generateAnnotation(metric.GaugeType, numIDs-1)
   435  			agg.Update(time.Now(), expectedValuesList[spIdx][i], expectedAnnotation)
   436  			expectedValuesByTimeList[spIdx][currTime.UnixNano()] = agg
   437  			zero := aggregation.NewGauge(aggregation.NewOptions(instrumentOpts))
   438  			zero.Update(time.Now(), 0.0, expectedAnnotation)
   439  			resetTime := currTime.UnixNano() + int64(storagePolicy.Resolution().Window/2)
   440  			expectedValuesByTimeList[spIdx][resetTime] = zero
   441  		}
   442  	}
   443  
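        	// Compute the expected aggregation output per storage policy, then flatten and sort it
        	// for comparison against the destination server's flushed results.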
   444  	var expectedResultsFlattened []aggregated.MetricWithStoragePolicy
   445  	for i := 0; i < len(storagePolicies); i++ {
   446  		expectedDatapointsByID := datapointsByID{
   447  			expectedMetricKeyList[i]: expectedValuesByTimeList[i],
   448  		}
   449  		expectedBuckets := []aggregationBucket{
   450  			{
   451  				key: aggregationKey{
   452  					aggregationID: maggregation.MustCompressTypes(maggregation.Sum),
   453  					storagePolicy: storagePolicies[i],
   454  				},
   455  				data: expectedDatapointsByID,
   456  			},
   457  		}
   458  		expectedResults, err := computeExpectedAggregationOutput(
   459  			dstServerflushTime,
   460  			expectedBuckets,
   461  			aggregatorOpts,
   462  		)
   463  		require.NoError(t, err)
   464  		expectedResultsFlattened = append(expectedResultsFlattened, expectedResults...)
   465  	}
   466  	sort.Sort(byTimeIDPolicyAscending(expectedResultsFlattened))
   467  	actual := destinationServer.sortedResults()
   468  	if !cmp.Equal(expectedResultsFlattened, actual, testCmpOpts...) {
   469  		require.Fail(t, "results differ", cmp.Diff(expectedResultsFlattened, actual, testCmpOpts...))
   470  	}
   471  }