github.com/m3db/m3@v1.5.0/src/aggregator/integration/multi_server_forwarding_pipeline_test.go

// +build integration

// Copyright (c) 2018 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package integration

import (
	"math"
	"sort"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/m3db/m3/src/aggregator/aggregation"
	aggclient "github.com/m3db/m3/src/aggregator/client"
	"github.com/m3db/m3/src/cluster/kv"
	memcluster "github.com/m3db/m3/src/cluster/mem"
	"github.com/m3db/m3/src/cluster/placement"
	maggregation "github.com/m3db/m3/src/metrics/aggregation"
	"github.com/m3db/m3/src/metrics/metadata"
	"github.com/m3db/m3/src/metrics/metric"
	"github.com/m3db/m3/src/metrics/metric/aggregated"
	"github.com/m3db/m3/src/metrics/pipeline"
	"github.com/m3db/m3/src/metrics/pipeline/applied"
	"github.com/m3db/m3/src/metrics/policy"
	"github.com/m3db/m3/src/metrics/transformation"
	"github.com/m3db/m3/src/x/instrument"
	xtest "github.com/m3db/m3/src/x/test"
	xtime "github.com/m3db/m3/src/x/time"

	"github.com/google/go-cmp/cmp"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"
)

func TestMultiServerForwardingPipelineKeepNaNAggregatedValues(t *testing.T) {
	testMultiServerForwardingPipeline(t, false)
}

func TestMultiServerForwardingPipelineDiscardNaNAggregatedValues(t *testing.T) {
	testMultiServerForwardingPipeline(t, true)
}

func testMultiServerForwardingPipeline(t *testing.T, discardNaNAggregatedValues bool) {
	if testing.Short() {
		t.SkipNow()
	}

	aggregatorClientType, err := getAggregatorClientTypeFromEnv()
	require.NoError(t, err)

	// Clock setup.
	clock := newTestClock(time.Now().Truncate(time.Hour))

	// Placement setup.
	var (
		numTotalShards = 1024
		placementKey   = "/placement"
	)
	multiServerSetup := []struct {
		rawTCPAddr     string
		httpAddr       string
		m3MsgAddr      string
		instanceConfig placementInstanceConfig
	}{
		{
			rawTCPAddr: "localhost:6000",
			httpAddr:   "localhost:16000",
			m3MsgAddr:  "localhost:26000",
			instanceConfig: placementInstanceConfig{
				shardSetID:          1,
				shardStartInclusive: 0,
				shardEndExclusive:   512,
			},
		},
		{
			rawTCPAddr: "localhost:6001",
			httpAddr:   "localhost:16001",
			m3MsgAddr:  "localhost:26001",
			instanceConfig: placementInstanceConfig{
				shardSetID:          1,
				shardStartInclusive: 0,
				shardEndExclusive:   512,
			},
		},
		{
			rawTCPAddr: "localhost:6002",
			httpAddr:   "localhost:16002",
			m3MsgAddr:  "localhost:26002",
			instanceConfig: placementInstanceConfig{
				shardSetID:          2,
				shardStartInclusive: 512,
				shardEndExclusive:   1024,
			},
		},
		{
			rawTCPAddr: "localhost:6003",
			httpAddr:   "localhost:16003",
			m3MsgAddr:  "localhost:26003",
			instanceConfig: placementInstanceConfig{
				shardSetID:          2,
				shardStartInclusive: 512,
				shardEndExclusive:   1024,
			},
		},
	}

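	// Use each instance's raw TCP address as its placement instance ID, or its
	// m3msg address when the m3msg aggregator client is being tested.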
	for i, mss := range multiServerSetup {
		multiServerSetup[i].instanceConfig.instanceID = mss.rawTCPAddr
		if aggregatorClientType == aggclient.M3MsgAggregatorClient {
			multiServerSetup[i].instanceConfig.instanceID = mss.m3MsgAddr
		}
	}

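	// Store the placement (replica factor 2, so each shard set is served by a
	// leader/follower pair) in an in-memory cluster KV store and initialize the
	// topic shared by the servers.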
	clusterClient := memcluster.New(kv.NewOverrideOptions())
	instances := make([]placement.Instance, 0, len(multiServerSetup))
	for _, mss := range multiServerSetup {
		instance := mss.instanceConfig.newPlacementInstance()
		instances = append(instances, instance)
	}
	initPlacement := newPlacement(numTotalShards, instances).SetReplicaFactor(2)
	setPlacement(t, placementKey, clusterClient, initPlacement)
	topicService, err := initializeTopic(defaultTopicName, clusterClient, numTotalShards)
	require.NoError(t, err)

	// Election cluster setup.
	electionCluster := newTestCluster(t)

	// Sharding function maps all metrics to shard 0 except for the rollup metric,
	// which gets mapped to the last shard.
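	// Given the placement above, shard 0 is owned by shard set 1 (servers 0 and 1)
	// while the last shard is owned by shard set 2 (servers 2 and 3), so the rollup
	// metric is forwarded across servers rather than aggregated locally.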
	pipelineRollupID := "pipelineRollup"
	shardFn := func(id []byte, numShards uint32) uint32 {
		if pipelineRollupID == string(id) {
			return numShards - 1
		}
		return 0
	}

	// Admin client connection options setup.
	connectionOpts := aggclient.NewConnectionOptions().
		SetInitReconnectThreshold(1).
		SetMaxReconnectThreshold(1).
		SetMaxReconnectDuration(2 * time.Second).
		SetWriteTimeout(time.Second)

	// Create servers.
	servers := make(testServerSetups, 0, len(multiServerSetup))
	for _, mss := range multiServerSetup {
		instrumentOpts := instrument.NewOptions()
		logger := instrumentOpts.Logger().With(
			zap.String("serverAddr", mss.rawTCPAddr),
		)
		instrumentOpts = instrumentOpts.SetLogger(logger)
		serverOpts := newTestServerOptions(t).
			SetClockOptions(clock.Options()).
			SetInstrumentOptions(instrumentOpts).
			SetElectionCluster(electionCluster).
			SetRawTCPAddr(mss.rawTCPAddr).
			SetHTTPAddr(mss.httpAddr).
			SetM3MsgAddr(mss.m3MsgAddr).
			SetInstanceID(mss.instanceConfig.instanceID).
			SetClusterClient(clusterClient).
			SetTopicService(topicService).
			SetTopicName(defaultTopicName).
			SetShardFn(shardFn).
			SetShardSetID(mss.instanceConfig.shardSetID).
			SetClientConnectionOptions(connectionOpts).
			SetDiscardNaNAggregatedValues(discardNaNAggregatedValues)
		server := newTestServerSetup(t, serverOpts)
		servers = append(servers, server)
	}

	// Start the servers.
	log := xtest.NewLogger(t)
	log.Info("test forwarding pipeline")
	for i, server := range servers {
		require.NoError(t, server.startServer())
		log.Sugar().Infof("server %d is now up", i)
	}

	// Create a client for writing to the servers.
	client := servers.newClient(t)
	require.NoError(t, client.connect())

	// Wait for two leaders to come up.
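	// With two shard sets and a replica factor of two, one instance per shard set
	// is expected to win its leader election.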
	var (
		leaders    = make(map[int]struct{})
		leaderCh   = make(chan int, len(servers)/2)
		numLeaders int32
		wg         sync.WaitGroup
	)
	wg.Add(len(servers) / 2)
	for i, server := range servers {
		i, server := i, server
		go func() {
			if err := server.waitUntilLeader(); err == nil {
				res := int(atomic.AddInt32(&numLeaders, 1))
				if res <= len(servers)/2 {
					leaderCh <- i
					wg.Done()
				}
			}
		}()
	}
	wg.Wait()
	close(leaderCh)

	for i := range leaderCh {
		leaders[i] = struct{}{}
		log.Sugar().Infof("server %d has become the leader", i)
	}
	log.Sugar().Infof("%d servers have become leaders", len(leaders))

	var (
		idPrefix        = "foo"
		numIDs          = 2
		start           = clock.Now()
		stop            = start.Add(12 * time.Second)
		interval        = time.Second
		storagePolicies = policy.StoragePolicies{
			policy.NewStoragePolicy(2*time.Second, xtime.Second, time.Hour),
			policy.NewStoragePolicy(4*time.Second, xtime.Second, 24*time.Hour),
		}
	)

	ids := generateTestIDs(idPrefix, numIDs)
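	// Each untimed gauge is aggregated at both storage policies, run through a
	// PerSecond transformation, and then rolled up (with a Sum aggregation) into a
	// single forwarded metric identified by pipelineRollupID.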
	stagedMetadatas := metadata.StagedMetadatas{
		{
			CutoverNanos: 0,
			Tombstoned:   false,
			Metadata: metadata.Metadata{
				Pipelines: []metadata.PipelineMetadata{
					{
						AggregationID:   maggregation.DefaultID,
						StoragePolicies: storagePolicies,
						Pipeline: applied.NewPipeline([]applied.OpUnion{
							{
								Type:           pipeline.TransformationOpType,
								Transformation: pipeline.TransformationOp{Type: transformation.PerSecond},
							},
							{
								Type: pipeline.RollupOpType,
								Rollup: applied.RollupOp{
									ID:            []byte(pipelineRollupID),
									AggregationID: maggregation.MustCompressTypes(maggregation.Sum),
								},
							},
						}),
					},
				},
			},
		},
	}
	metricTypeFn := constantMetricTypeFnFactory(metric.GaugeType)
	valueGenOpts := valueGenOpts{
		untimed: untimedValueGenOpts{
			gaugeValueGenFn: func(intervalIdx, idIdx int) float64 {
				// Each gauge has two datapoints within the same aggregation window.
				// The first value is 0.0 and should be ignored; the second is used to
				// compute the `PerSecond` value, which should come out to 1 and is
				// then forwarded to the next aggregation server.
				if intervalIdx%2 == 0 {
					return 0.0
				}
				return float64(intervalIdx + idIdx)
			},
		},
	}
	metadataFn := func(idx int) metadataUnion {
		return metadataUnion{
			mType:           stagedMetadatasType,
			stagedMetadatas: stagedMetadatas,
		}
	}
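	// Generate one untimed gauge datapoint per ID per second over the 12-second
	// window, each carrying the staged metadatas defined above.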
	dataset := mustGenerateTestDataset(t, datasetGenOpts{
		start:        start,
		stop:         stop,
		interval:     interval,
		ids:          ids,
		category:     untimedMetric,
		typeFn:       metricTypeFn,
		valueGenOpts: valueGenOpts,
		metadataFn:   metadataFn,
	})

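	// Write the dataset, advancing the test clock to each datapoint's timestamp
	// before flushing the client.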
	for _, data := range dataset {
		clock.SetNow(data.timestamp)

		for _, mm := range data.metricWithMetadatas {
			require.NoError(t, client.writeUntimedMetricWithMetadatas(mm.metric.untimed, mm.metadata.stagedMetadatas))
		}
		require.NoError(t, client.flush())

		// Give the servers some time to process the incoming packets.
		time.Sleep(time.Second)
	}

	// Move time forward using the larger resolution and wait for flushing to happen
	// at the originating server (where the raw metrics are aggregated).
	originatingServerFlushTime := stop.Add(2 * storagePolicies[1].Resolution().Window)
	for currTime := stop; !currTime.After(originatingServerFlushTime); currTime = currTime.Add(time.Second) {
		clock.SetNow(currTime)
		time.Sleep(time.Second)
	}

	// Move time forward using the larger resolution again and wait for flushing to
	// happen at the destination server (where the rollup metrics are aggregated).
	destinationServerFlushTime := originatingServerFlushTime.Add(2 * storagePolicies[1].Resolution().Window)
	for currTime := originatingServerFlushTime; !currTime.After(destinationServerFlushTime); currTime = currTime.Add(time.Second) {
		clock.SetNow(currTime)
		time.Sleep(time.Second)
	}

	// Remove all the topic consumers before closing clients and servers. This allows the
	// connections between servers to be closed while the servers are still running. Otherwise,
	// during shutdown, the yet-to-be-closed servers would repeatedly try to reconnect to the
	// recently closed ones, which results in longer shutdown times.
	require.NoError(t, removeAllTopicConsumers(topicService, defaultTopicName))

	// Stop the client.
	require.NoError(t, client.close())

	// Stop the servers.
	for i, server := range servers {
		require.NoError(t, server.stopServer())
		log.Sugar().Infof("server %d is now down", i)
	}

	// Validate results.
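	// The rollup metric maps to the last shard, which belongs to shard set 2, so
	// its aggregations should end up on whichever of servers 2 and 3 became leader.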
	var destinationServer *testServerSetup
	if _, exists := leaders[2]; exists {
		destinationServer = servers[2]
	} else if _, exists = leaders[3]; exists {
		destinationServer = servers[3]
	} else {
		require.Fail(t, "either server 2 or server 3 must be a leader")
	}

	aggregatorOpts := destinationServer.aggregatorOpts
	expectedMetricKeyList := []metricKey{
		{
			category:      forwardedMetric,
			typ:           metric.GaugeType,
			id:            pipelineRollupID,
			storagePolicy: storagePolicies[0],
		},
		{
			category:      forwardedMetric,
			typ:           metric.GaugeType,
			id:            pipelineRollupID,
			storagePolicy: storagePolicies[1],
		},
	}
	// Expected results, one list per storage policy (2s:1h and 4s:24h).
	expectedValuesByTimeList := []valuesByTime{
		make(valuesByTime),
		make(valuesByTime),
	}
	expectedValuesList := [][]float64{
		{
			math.NaN(),
			float64(numIDs),
			float64(numIDs),
			float64(numIDs),
			float64(numIDs),
			float64(numIDs),
		},
		{
			math.NaN(),
			float64(numIDs),
			float64(numIDs),
		},
	}
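	// The first window for each policy is expected to be NaN because PerSecond has
	// no earlier value to compute a rate from; each subsequent window sums the
	// per-ID PerSecond values (1 each) into numIDs. The NaN windows are skipped
	// below when the servers are configured to discard NaN aggregated values.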
	for spIdx := 0; spIdx < len(storagePolicies); spIdx++ {
		storagePolicy := storagePolicies[spIdx]
		for i := 0; i < len(expectedValuesList[spIdx]); i++ {
			if discardNaNAggregatedValues && math.IsNaN(expectedValuesList[spIdx][i]) {
				continue
			}
			currTime := start.Add(time.Duration(i+1) * storagePolicy.Resolution().Window)
			instrumentOpts := aggregatorOpts.InstrumentOptions()
			agg := aggregation.NewGauge(aggregation.NewOptions(instrumentOpts))
			expectedAnnotation := generateAnnotation(metric.GaugeType, numIDs-1)
			agg.Update(time.Now(), expectedValuesList[spIdx][i], expectedAnnotation)
			expectedValuesByTimeList[spIdx][currTime.UnixNano()] = agg
		}
	}

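	// Compute the expected aggregation output per storage policy and flatten it
	// for comparison against the destination server's sorted results.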
	var expectedResultsFlattened []aggregated.MetricWithStoragePolicy
	for i := 0; i < len(storagePolicies); i++ {
		expectedDatapointsByID := datapointsByID{
			expectedMetricKeyList[i]: expectedValuesByTimeList[i],
		}
		expectedBuckets := []aggregationBucket{
			{
				key: aggregationKey{
					aggregationID: maggregation.MustCompressTypes(maggregation.Sum),
					storagePolicy: storagePolicies[i],
				},
				data: expectedDatapointsByID,
			},
		}
		expectedResults, err := computeExpectedAggregationOutput(
			destinationServerFlushTime,
			expectedBuckets,
			aggregatorOpts,
		)
		require.NoError(t, err)
		expectedResultsFlattened = append(expectedResultsFlattened, expectedResults...)
	}
	sort.Sort(byTimeIDPolicyAscending(expectedResultsFlattened))
	actual := destinationServer.sortedResults()
	require.True(t, cmp.Equal(expectedResultsFlattened, actual, testCmpOpts...))
}