github.com/grafana/pyroscope@v1.18.0/pkg/scheduler/queue/user_queues_test.go (about)

     1  // SPDX-License-Identifier: AGPL-3.0-only
     2  // Provenance-includes-location: https://github.com/cortexproject/cortex/blob/master/pkg/scheduler/queue/user_queues_test.go
     3  // Provenance-includes-license: Apache-2.0
     4  // Provenance-includes-copyright: The Cortex Authors.
     5  
     6  package queue
     7  
     8  import (
     9  	"fmt"
    10  	"math"
    11  	"math/rand"
    12  	"slices"
    13  	"sort"
    14  	"testing"
    15  	"time"
    16  
    17  	"github.com/stretchr/testify/assert"
    18  	"github.com/stretchr/testify/require"
    19  )
    20  
    21  func TestQueues(t *testing.T) {
    22  	uq := newUserQueues(0, 0)
    23  	assert.NotNil(t, uq)
    24  	assert.NoError(t, isConsistent(uq))
    25  
    26  	uq.addQuerierConnection("querier-1")
    27  	uq.addQuerierConnection("querier-2")
    28  
    29  	q, u, lastUserIndex := uq.getNextQueueForQuerier(-1, "querier-1")
    30  	assert.Nil(t, q)
    31  	assert.Equal(t, "", u)
    32  
    33  	// Add queues: [one]
    34  	qOne := getOrAdd(t, uq, "one", 0)
    35  	lastUserIndex = confirmOrderForQuerier(t, uq, "querier-1", lastUserIndex, qOne, qOne)
    36  
    37  	// [one two]
    38  	qTwo := getOrAdd(t, uq, "two", 0)
    39  	assert.NotEqual(t, qOne, qTwo)
    40  
    41  	lastUserIndex = confirmOrderForQuerier(t, uq, "querier-1", lastUserIndex, qTwo, qOne, qTwo, qOne)
    42  	confirmOrderForQuerier(t, uq, "querier-2", -1, qOne, qTwo, qOne)
    43  
    44  	// [one two three]
    45  	// confirm fifo by adding a third queue and iterating to it
    46  	qThree := getOrAdd(t, uq, "three", 0)
    47  
    48  	lastUserIndex = confirmOrderForQuerier(t, uq, "querier-1", lastUserIndex, qTwo, qThree, qOne)
    49  
    50  	// Remove one: ["" two three]
    51  	uq.deleteQueue("one")
    52  	assert.NoError(t, isConsistent(uq))
    53  
    54  	lastUserIndex = confirmOrderForQuerier(t, uq, "querier-1", lastUserIndex, qTwo, qThree, qTwo)
    55  
    56  	// "four" is added at the beginning of the list: [four two three]
    57  	qFour := getOrAdd(t, uq, "four", 0)
    58  
    59  	lastUserIndex = confirmOrderForQuerier(t, uq, "querier-1", lastUserIndex, qThree, qFour, qTwo, qThree)
    60  
    61  	// Remove two: [four "" three]
    62  	uq.deleteQueue("two")
    63  	assert.NoError(t, isConsistent(uq))
    64  
    65  	lastUserIndex = confirmOrderForQuerier(t, uq, "querier-1", lastUserIndex, qFour, qThree, qFour)
    66  
    67  	// Remove three: [four]
    68  	uq.deleteQueue("three")
    69  	assert.NoError(t, isConsistent(uq))
    70  
    71  	// Remove four: []
    72  	uq.deleteQueue("four")
    73  	assert.NoError(t, isConsistent(uq))
    74  
    75  	q, _, _ = uq.getNextQueueForQuerier(lastUserIndex, "querier-1")
    76  	assert.Nil(t, q)
    77  }
    78  
    79  func TestQueuesOnTerminatingQuerier(t *testing.T) {
    80  	uq := newUserQueues(0, 0)
    81  	assert.NotNil(t, uq)
    82  	assert.NoError(t, isConsistent(uq))
    83  
    84  	uq.addQuerierConnection("querier-1")
    85  	uq.addQuerierConnection("querier-2")
    86  
    87  	// Add queues: [one, two]
    88  	qOne := getOrAdd(t, uq, "one", 0)
    89  	qTwo := getOrAdd(t, uq, "two", 0)
    90  	confirmOrderForQuerier(t, uq, "querier-1", -1, qOne, qTwo, qOne, qTwo)
    91  	confirmOrderForQuerier(t, uq, "querier-2", -1, qOne, qTwo, qOne, qTwo)
    92  
    93  	// After notify shutdown for querier-2, it's expected to own no queue.
    94  	uq.notifyQuerierShutdown("querier-2")
    95  	q, u, _ := uq.getNextQueueForQuerier(-1, "querier-2")
    96  	assert.Nil(t, q)
    97  	assert.Equal(t, "", u)
    98  
    99  	// However, querier-1 still get queues because it's still running.
   100  	confirmOrderForQuerier(t, uq, "querier-1", -1, qOne, qTwo, qOne, qTwo)
   101  
   102  	// After disconnecting querier-2, it's expected to own no queue.
   103  	uq.removeQuerier("querier-2")
   104  	q, u, _ = uq.getNextQueueForQuerier(-1, "querier-2")
   105  	assert.Nil(t, q)
   106  	assert.Equal(t, "", u)
   107  }
   108  
   109  func TestQueuesWithQueriers(t *testing.T) {
   110  	uq := newUserQueues(0, 0)
   111  	assert.NotNil(t, uq)
   112  	assert.NoError(t, isConsistent(uq))
   113  
   114  	queriers := 30
   115  	users := 1000
   116  	maxQueriersPerUser := 5
   117  
   118  	// Add some queriers.
   119  	for ix := 0; ix < queriers; ix++ {
   120  		qid := fmt.Sprintf("querier-%d", ix)
   121  		uq.addQuerierConnection(qid)
   122  
   123  		// No querier has any queues yet.
   124  		q, u, _ := uq.getNextQueueForQuerier(-1, qid)
   125  		assert.Nil(t, q)
   126  		assert.Equal(t, "", u)
   127  	}
   128  
   129  	assert.NoError(t, isConsistent(uq))
   130  
   131  	// Add user queues.
   132  	for u := 0; u < users; u++ {
   133  		uid := fmt.Sprintf("user-%d", u)
   134  		getOrAdd(t, uq, uid, maxQueriersPerUser)
   135  
   136  		// Verify it has maxQueriersPerUser queriers assigned now.
   137  		qs := uq.userQueues[uid].queriers
   138  		assert.Equal(t, maxQueriersPerUser, len(qs))
   139  	}
   140  
   141  	// After adding all users, verify results. For each querier, find out how many different users it handles,
   142  	// and compute mean and stdDev.
   143  	queriersMap := make(map[string]int)
   144  
   145  	for q := 0; q < queriers; q++ {
   146  		qid := fmt.Sprintf("querier-%d", q)
   147  
   148  		lastUserIndex := -1
   149  		for {
   150  			_, _, newIx := uq.getNextQueueForQuerier(lastUserIndex, qid)
   151  			if newIx < lastUserIndex {
   152  				break
   153  			}
   154  			lastUserIndex = newIx
   155  			queriersMap[qid]++
   156  		}
   157  	}
   158  
   159  	mean := float64(0)
   160  	for _, c := range queriersMap {
   161  		mean += float64(c)
   162  	}
   163  	mean = mean / float64(len(queriersMap))
   164  
   165  	stdDev := float64(0)
   166  	for _, c := range queriersMap {
   167  		d := float64(c) - mean
   168  		stdDev += (d * d)
   169  	}
   170  	stdDev = math.Sqrt(stdDev / float64(len(queriersMap)))
   171  	t.Log("mean:", mean, "stddev:", stdDev)
   172  
   173  	assert.InDelta(t, users*maxQueriersPerUser/queriers, mean, 1)
   174  	assert.InDelta(t, stdDev, 0, mean*0.2)
   175  }
   176  
   177  func TestQueuesConsistency(t *testing.T) {
   178  	tests := map[string]struct {
   179  		forgetDelay time.Duration
   180  	}{
   181  		"without forget delay": {},
   182  		"with forget delay":    {forgetDelay: time.Minute},
   183  	}
   184  
   185  	for testName, testData := range tests {
   186  		t.Run(testName, func(t *testing.T) {
   187  			uq := newUserQueues(0, testData.forgetDelay)
   188  			assert.NotNil(t, uq)
   189  			assert.NoError(t, isConsistent(uq))
   190  
   191  			r := rand.New(rand.NewSource(time.Now().Unix()))
   192  
   193  			lastUserIndexes := map[string]int{}
   194  
   195  			conns := map[string]int{}
   196  
   197  			for i := 0; i < 10000; i++ {
   198  				switch r.Int() % 6 {
   199  				case 0:
   200  					assert.NotNil(t, uq.getOrAddQueue(generateTenant(r), 3))
   201  				case 1:
   202  					qid := generateQuerier(r)
   203  					_, _, luid := uq.getNextQueueForQuerier(lastUserIndexes[qid], qid)
   204  					lastUserIndexes[qid] = luid
   205  				case 2:
   206  					uq.deleteQueue(generateTenant(r))
   207  				case 3:
   208  					q := generateQuerier(r)
   209  					uq.addQuerierConnection(q)
   210  					conns[q]++
   211  				case 4:
   212  					q := generateQuerier(r)
   213  					if conns[q] > 0 {
   214  						uq.removeQuerierConnection(q, time.Now())
   215  						conns[q]--
   216  					}
   217  				case 5:
   218  					q := generateQuerier(r)
   219  					uq.notifyQuerierShutdown(q)
   220  				}
   221  
   222  				assert.NoErrorf(t, isConsistent(uq), "last action %d", i)
   223  			}
   224  		})
   225  	}
   226  }
   227  
   228  func TestQueues_ForgetDelay(t *testing.T) {
   229  	const (
   230  		forgetDelay        = time.Minute
   231  		maxQueriersPerUser = 1
   232  		numUsers           = 100
   233  	)
   234  
   235  	now := time.Now()
   236  	uq := newUserQueues(0, forgetDelay)
   237  	assert.NotNil(t, uq)
   238  	assert.NoError(t, isConsistent(uq))
   239  
   240  	// 3 queriers open 2 connections each.
   241  	for i := 1; i <= 3; i++ {
   242  		uq.addQuerierConnection(fmt.Sprintf("querier-%d", i))
   243  		uq.addQuerierConnection(fmt.Sprintf("querier-%d", i))
   244  	}
   245  
   246  	// Add user queues.
   247  	for i := 0; i < numUsers; i++ {
   248  		userID := fmt.Sprintf("user-%d", i)
   249  		getOrAdd(t, uq, userID, maxQueriersPerUser)
   250  	}
   251  
   252  	// We expect querier-1 to have some users.
   253  	querier1Users := getUsersByQuerier(uq, "querier-1")
   254  	require.NotEmpty(t, querier1Users)
   255  
   256  	// Gracefully shutdown querier-1.
   257  	uq.removeQuerierConnection("querier-1", now.Add(20*time.Second))
   258  	uq.removeQuerierConnection("querier-1", now.Add(21*time.Second))
   259  	uq.notifyQuerierShutdown("querier-1")
   260  
   261  	// We expect querier-1 has been removed.
   262  	assert.NotContains(t, uq.queriers, "querier-1")
   263  	assert.NoError(t, isConsistent(uq))
   264  
   265  	// We expect querier-1 users have been shuffled to other queriers.
   266  	for _, userID := range querier1Users {
   267  		assert.Contains(t, append(getUsersByQuerier(uq, "querier-2"), getUsersByQuerier(uq, "querier-3")...), userID)
   268  	}
   269  
   270  	// Querier-1 reconnects.
   271  	uq.addQuerierConnection("querier-1")
   272  	uq.addQuerierConnection("querier-1")
   273  
   274  	// We expect the initial querier-1 users have got back to querier-1.
   275  	for _, userID := range querier1Users {
   276  		assert.Contains(t, getUsersByQuerier(uq, "querier-1"), userID)
   277  		assert.NotContains(t, getUsersByQuerier(uq, "querier-2"), userID)
   278  		assert.NotContains(t, getUsersByQuerier(uq, "querier-3"), userID)
   279  	}
   280  
   281  	// Querier-1 abruptly terminates (no shutdown notification received).
   282  	uq.removeQuerierConnection("querier-1", now.Add(40*time.Second))
   283  	uq.removeQuerierConnection("querier-1", now.Add(41*time.Second))
   284  
   285  	// We expect querier-1 has NOT been removed.
   286  	assert.Contains(t, uq.queriers, "querier-1")
   287  	assert.NoError(t, isConsistent(uq))
   288  
   289  	// We expect the querier-1 users have not been shuffled to other queriers.
   290  	for _, userID := range querier1Users {
   291  		assert.Contains(t, getUsersByQuerier(uq, "querier-1"), userID)
   292  		assert.NotContains(t, getUsersByQuerier(uq, "querier-2"), userID)
   293  		assert.NotContains(t, getUsersByQuerier(uq, "querier-3"), userID)
   294  	}
   295  
   296  	// Try to forget disconnected queriers, but querier-1 forget delay hasn't passed yet.
   297  	uq.forgetDisconnectedQueriers(now.Add(90 * time.Second))
   298  
   299  	assert.Contains(t, uq.queriers, "querier-1")
   300  	assert.NoError(t, isConsistent(uq))
   301  
   302  	for _, userID := range querier1Users {
   303  		assert.Contains(t, getUsersByQuerier(uq, "querier-1"), userID)
   304  		assert.NotContains(t, getUsersByQuerier(uq, "querier-2"), userID)
   305  		assert.NotContains(t, getUsersByQuerier(uq, "querier-3"), userID)
   306  	}
   307  
   308  	// Try to forget disconnected queriers. This time querier-1 forget delay has passed.
   309  	uq.forgetDisconnectedQueriers(now.Add(105 * time.Second))
   310  
   311  	assert.NotContains(t, uq.queriers, "querier-1")
   312  	assert.NoError(t, isConsistent(uq))
   313  
   314  	// We expect querier-1 users have been shuffled to other queriers.
   315  	for _, userID := range querier1Users {
   316  		assert.Contains(t, append(getUsersByQuerier(uq, "querier-2"), getUsersByQuerier(uq, "querier-3")...), userID)
   317  	}
   318  }
   319  
   320  func TestQueues_ForgetDelay_ShouldCorrectlyHandleQuerierReconnectingBeforeForgetDelayIsPassed(t *testing.T) {
   321  	const (
   322  		forgetDelay        = time.Minute
   323  		maxQueriersPerUser = 1
   324  		numUsers           = 100
   325  	)
   326  
   327  	now := time.Now()
   328  	uq := newUserQueues(0, forgetDelay)
   329  	assert.NotNil(t, uq)
   330  	assert.NoError(t, isConsistent(uq))
   331  
   332  	// 3 queriers open 2 connections each.
   333  	for i := 1; i <= 3; i++ {
   334  		uq.addQuerierConnection(fmt.Sprintf("querier-%d", i))
   335  		uq.addQuerierConnection(fmt.Sprintf("querier-%d", i))
   336  	}
   337  
   338  	// Add user queues.
   339  	for i := 0; i < numUsers; i++ {
   340  		userID := fmt.Sprintf("user-%d", i)
   341  		getOrAdd(t, uq, userID, maxQueriersPerUser)
   342  	}
   343  
   344  	// We expect querier-1 to have some users.
   345  	querier1Users := getUsersByQuerier(uq, "querier-1")
   346  	require.NotEmpty(t, querier1Users)
   347  
   348  	// Querier-1 abruptly terminates (no shutdown notification received).
   349  	uq.removeQuerierConnection("querier-1", now.Add(40*time.Second))
   350  	uq.removeQuerierConnection("querier-1", now.Add(41*time.Second))
   351  
   352  	// We expect querier-1 has NOT been removed.
   353  	assert.Contains(t, uq.queriers, "querier-1")
   354  	assert.NoError(t, isConsistent(uq))
   355  
   356  	// We expect the querier-1 users have not been shuffled to other queriers.
   357  	for _, userID := range querier1Users {
   358  		assert.Contains(t, getUsersByQuerier(uq, "querier-1"), userID)
   359  		assert.NotContains(t, getUsersByQuerier(uq, "querier-2"), userID)
   360  		assert.NotContains(t, getUsersByQuerier(uq, "querier-3"), userID)
   361  	}
   362  
   363  	// Try to forget disconnected queriers, but querier-1 forget delay hasn't passed yet.
   364  	uq.forgetDisconnectedQueriers(now.Add(90 * time.Second))
   365  
   366  	// Querier-1 reconnects.
   367  	uq.addQuerierConnection("querier-1")
   368  	uq.addQuerierConnection("querier-1")
   369  
   370  	assert.Contains(t, uq.queriers, "querier-1")
   371  	assert.NoError(t, isConsistent(uq))
   372  
   373  	// We expect the querier-1 users have not been shuffled to other queriers.
   374  	for _, userID := range querier1Users {
   375  		assert.Contains(t, getUsersByQuerier(uq, "querier-1"), userID)
   376  		assert.NotContains(t, getUsersByQuerier(uq, "querier-2"), userID)
   377  		assert.NotContains(t, getUsersByQuerier(uq, "querier-3"), userID)
   378  	}
   379  
   380  	// Try to forget disconnected queriers far in the future, but there's no disconnected querier.
   381  	uq.forgetDisconnectedQueriers(now.Add(200 * time.Second))
   382  
   383  	assert.Contains(t, uq.queriers, "querier-1")
   384  	assert.NoError(t, isConsistent(uq))
   385  
   386  	for _, userID := range querier1Users {
   387  		assert.Contains(t, getUsersByQuerier(uq, "querier-1"), userID)
   388  		assert.NotContains(t, getUsersByQuerier(uq, "querier-2"), userID)
   389  		assert.NotContains(t, getUsersByQuerier(uq, "querier-3"), userID)
   390  	}
   391  }
   392  
   393  func generateTenant(r *rand.Rand) string {
   394  	return fmt.Sprint("tenant-", r.Int()%5)
   395  }
   396  
   397  func generateQuerier(r *rand.Rand) string {
   398  	return fmt.Sprint("querier-", r.Int()%5)
   399  }
   400  
   401  func getOrAdd(t *testing.T, uq *queues, tenant string, maxQueriers int) chan Request {
   402  	q := uq.getOrAddQueue(tenant, maxQueriers)
   403  	assert.NotNil(t, q)
   404  	assert.NoError(t, isConsistent(uq))
   405  	assert.Equal(t, q, uq.getOrAddQueue(tenant, maxQueriers))
   406  	return q
   407  }
   408  
   409  func confirmOrderForQuerier(t *testing.T, uq *queues, querier string, lastUserIndex int, qs ...chan Request) int {
   410  	var n chan Request
   411  	for _, q := range qs {
   412  		n, _, lastUserIndex = uq.getNextQueueForQuerier(lastUserIndex, querier)
   413  		assert.Equal(t, q, n)
   414  		assert.NoError(t, isConsistent(uq))
   415  	}
   416  	return lastUserIndex
   417  }
   418  
   419  func isConsistent(uq *queues) error {
   420  	if len(uq.sortedQueriers) != len(uq.queriers) {
   421  		return fmt.Errorf("inconsistent number of sorted queriers and querier connections")
   422  	}
   423  
   424  	uc := 0
   425  	for ix, u := range uq.users {
   426  		q := uq.userQueues[u]
   427  		if u != "" && q == nil {
   428  			return fmt.Errorf("user %s doesn't have queue", u)
   429  		}
   430  		if u == "" && q != nil {
   431  			return fmt.Errorf("user %s shouldn't have queue", u)
   432  		}
   433  		if u == "" {
   434  			continue
   435  		}
   436  
   437  		uc++
   438  
   439  		if q.index != ix {
   440  			return fmt.Errorf("invalid user's index, expected=%d, got=%d", ix, q.index)
   441  		}
   442  
   443  		if q.maxQueriers == 0 && q.queriers != nil {
   444  			return fmt.Errorf("user %s has queriers, but maxQueriers=0", u)
   445  		}
   446  
   447  		if q.maxQueriers > 0 && len(uq.sortedQueriers) <= q.maxQueriers && q.queriers != nil {
   448  			return fmt.Errorf("user %s has queriers set despite not enough queriers available", u)
   449  		}
   450  
   451  		if q.maxQueriers > 0 && len(uq.sortedQueriers) > q.maxQueriers && len(q.queriers) != q.maxQueriers {
   452  			return fmt.Errorf("user %s has incorrect number of queriers, expected=%d, got=%d", u, len(q.queriers), q.maxQueriers)
   453  		}
   454  	}
   455  
   456  	if uc != len(uq.userQueues) {
   457  		return fmt.Errorf("inconsistent number of users list and user queues")
   458  	}
   459  
   460  	return nil
   461  }
   462  
   463  // getUsersByQuerier returns the list of users handled by the provided querierID.
   464  func getUsersByQuerier(queues *queues, querierID string) []string {
   465  	var userIDs []string
   466  	for userID, q := range queues.userQueues {
   467  		if q.queriers == nil {
   468  			// If it's nil then all queriers can handle this user.
   469  			userIDs = append(userIDs, userID)
   470  			continue
   471  		}
   472  		if _, ok := q.queriers[querierID]; ok {
   473  			userIDs = append(userIDs, userID)
   474  		}
   475  	}
   476  	return userIDs
   477  }
   478  
   479  func TestShuffleQueriers(t *testing.T) {
   480  	allQueriers := []string{"a", "b", "c", "d", "e"}
   481  
   482  	require.Nil(t, shuffleQueriersForUser(12345, 10, allQueriers, nil))
   483  	require.Nil(t, shuffleQueriersForUser(12345, len(allQueriers), allQueriers, nil))
   484  
   485  	r1 := shuffleQueriersForUser(12345, 3, allQueriers, nil)
   486  	require.Equal(t, 3, len(r1))
   487  
   488  	// Same input produces same output.
   489  	r2 := shuffleQueriersForUser(12345, 3, allQueriers, nil)
   490  	require.Equal(t, 3, len(r2))
   491  	require.Equal(t, r1, r2)
   492  }
   493  
   494  func TestShuffleQueriersCorrectness(t *testing.T) {
   495  	const queriersCount = 100
   496  
   497  	var allSortedQueriers []string
   498  	for i := 0; i < queriersCount; i++ {
   499  		allSortedQueriers = append(allSortedQueriers, fmt.Sprintf("%d", i))
   500  	}
   501  	slices.Sort(allSortedQueriers)
   502  
   503  	r := rand.New(rand.NewSource(time.Now().UnixNano()))
   504  	const tests = 1000
   505  	for i := 0; i < tests; i++ {
   506  		toSelect := r.Intn(queriersCount)
   507  		if toSelect == 0 {
   508  			toSelect = 3
   509  		}
   510  
   511  		selected := shuffleQueriersForUser(r.Int63(), toSelect, allSortedQueriers, nil)
   512  
   513  		require.Equal(t, toSelect, len(selected))
   514  
   515  		slices.Sort(allSortedQueriers)
   516  		prevQuerier := ""
   517  		for _, q := range allSortedQueriers {
   518  			require.True(t, prevQuerier < q, "non-unique querier")
   519  			prevQuerier = q
   520  
   521  			ix := sort.SearchStrings(allSortedQueriers, q)
   522  			require.True(t, ix < len(allSortedQueriers) && allSortedQueriers[ix] == q, "selected querier is not between all queriers")
   523  		}
   524  	}
   525  }