github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/scheduler/queue/user_queues_test.go (about)

     1  package queue
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  	"math/rand"
     7  	"sort"
     8  	"testing"
     9  	"time"
    10  
    11  	"github.com/stretchr/testify/assert"
    12  	"github.com/stretchr/testify/require"
    13  )
    14  
    15  func TestQueues(t *testing.T) {
    16  	uq := newUserQueues(0, 0)
    17  	assert.NotNil(t, uq)
    18  	assert.NoError(t, isConsistent(uq))
    19  
    20  	q, u, lastUserIndex := uq.getNextQueueForQuerier(-1, "querier-1")
    21  	assert.Nil(t, q)
    22  	assert.Equal(t, "", u)
    23  
    24  	// Add queues: [one]
    25  	qOne := getOrAdd(t, uq, "one", 0)
    26  	lastUserIndex = confirmOrderForQuerier(t, uq, "querier-1", lastUserIndex, qOne, qOne)
    27  
    28  	// [one two]
    29  	qTwo := getOrAdd(t, uq, "two", 0)
    30  	assert.NotEqual(t, qOne, qTwo)
    31  
    32  	lastUserIndex = confirmOrderForQuerier(t, uq, "querier-1", lastUserIndex, qTwo, qOne, qTwo, qOne)
    33  	confirmOrderForQuerier(t, uq, "querier-2", -1, qOne, qTwo, qOne)
    34  
    35  	// [one two three]
    36  	// confirm fifo by adding a third queue and iterating to it
    37  	qThree := getOrAdd(t, uq, "three", 0)
    38  
    39  	lastUserIndex = confirmOrderForQuerier(t, uq, "querier-1", lastUserIndex, qTwo, qThree, qOne)
    40  
    41  	// Remove one: ["" two three]
    42  	uq.deleteQueue("one")
    43  	assert.NoError(t, isConsistent(uq))
    44  
    45  	lastUserIndex = confirmOrderForQuerier(t, uq, "querier-1", lastUserIndex, qTwo, qThree, qTwo)
    46  
    47  	// "four" is added at the beginning of the list: [four two three]
    48  	qFour := getOrAdd(t, uq, "four", 0)
    49  
    50  	lastUserIndex = confirmOrderForQuerier(t, uq, "querier-1", lastUserIndex, qThree, qFour, qTwo, qThree)
    51  
    52  	// Remove two: [four "" three]
    53  	uq.deleteQueue("two")
    54  	assert.NoError(t, isConsistent(uq))
    55  
    56  	lastUserIndex = confirmOrderForQuerier(t, uq, "querier-1", lastUserIndex, qFour, qThree, qFour)
    57  
    58  	// Remove three: [four]
    59  	uq.deleteQueue("three")
    60  	assert.NoError(t, isConsistent(uq))
    61  
    62  	// Remove four: []
    63  	uq.deleteQueue("four")
    64  	assert.NoError(t, isConsistent(uq))
    65  
    66  	q, _, _ = uq.getNextQueueForQuerier(lastUserIndex, "querier-1")
    67  	assert.Nil(t, q)
    68  }
    69  
    70  func TestQueuesWithQueriers(t *testing.T) {
    71  	uq := newUserQueues(0, 0)
    72  	assert.NotNil(t, uq)
    73  	assert.NoError(t, isConsistent(uq))
    74  
    75  	queriers := 30
    76  	users := 1000
    77  	maxQueriersPerUser := 5
    78  
    79  	// Add some queriers.
    80  	for ix := 0; ix < queriers; ix++ {
    81  		qid := fmt.Sprintf("querier-%d", ix)
    82  		uq.addQuerierConnection(qid)
    83  
    84  		// No querier has any queues yet.
    85  		q, u, _ := uq.getNextQueueForQuerier(-1, qid)
    86  		assert.Nil(t, q)
    87  		assert.Equal(t, "", u)
    88  	}
    89  
    90  	assert.NoError(t, isConsistent(uq))
    91  
    92  	// Add user queues.
    93  	for u := 0; u < users; u++ {
    94  		uid := fmt.Sprintf("user-%d", u)
    95  		getOrAdd(t, uq, uid, maxQueriersPerUser)
    96  
    97  		// Verify it has maxQueriersPerUser queriers assigned now.
    98  		qs := uq.userQueues[uid].queriers
    99  		assert.Equal(t, maxQueriersPerUser, len(qs))
   100  	}
   101  
   102  	// After adding all users, verify results. For each querier, find out how many different users it handles,
   103  	// and compute mean and stdDev.
   104  	queriersMap := make(map[string]int)
   105  
   106  	for q := 0; q < queriers; q++ {
   107  		qid := fmt.Sprintf("querier-%d", q)
   108  
   109  		lastUserIndex := -1
   110  		for {
   111  			_, _, newIx := uq.getNextQueueForQuerier(lastUserIndex, qid)
   112  			if newIx < lastUserIndex {
   113  				break
   114  			}
   115  			lastUserIndex = newIx
   116  			queriersMap[qid]++
   117  		}
   118  	}
   119  
   120  	mean := float64(0)
   121  	for _, c := range queriersMap {
   122  		mean += float64(c)
   123  	}
   124  	mean = mean / float64(len(queriersMap))
   125  
   126  	stdDev := float64(0)
   127  	for _, c := range queriersMap {
   128  		d := float64(c) - mean
   129  		stdDev += (d * d)
   130  	}
   131  	stdDev = math.Sqrt(stdDev / float64(len(queriersMap)))
   132  	t.Log("mean:", mean, "stddev:", stdDev)
   133  
   134  	assert.InDelta(t, users*maxQueriersPerUser/queriers, mean, 1)
   135  	assert.InDelta(t, stdDev, 0, mean*0.2)
   136  }
   137  
   138  func TestQueuesConsistency(t *testing.T) {
   139  	tests := map[string]struct {
   140  		forgetDelay time.Duration
   141  	}{
   142  		"without forget delay": {},
   143  		"with forget delay":    {forgetDelay: time.Minute},
   144  	}
   145  
   146  	for testName, testData := range tests {
   147  		t.Run(testName, func(t *testing.T) {
   148  			uq := newUserQueues(0, testData.forgetDelay)
   149  			assert.NotNil(t, uq)
   150  			assert.NoError(t, isConsistent(uq))
   151  
   152  			r := rand.New(rand.NewSource(time.Now().Unix()))
   153  
   154  			lastUserIndexes := map[string]int{}
   155  
   156  			conns := map[string]int{}
   157  
   158  			for i := 0; i < 10000; i++ {
   159  				switch r.Int() % 6 {
   160  				case 0:
   161  					assert.NotNil(t, uq.getOrAddQueue(generateTenant(r), 3))
   162  				case 1:
   163  					qid := generateQuerier(r)
   164  					_, _, luid := uq.getNextQueueForQuerier(lastUserIndexes[qid], qid)
   165  					lastUserIndexes[qid] = luid
   166  				case 2:
   167  					uq.deleteQueue(generateTenant(r))
   168  				case 3:
   169  					q := generateQuerier(r)
   170  					uq.addQuerierConnection(q)
   171  					conns[q]++
   172  				case 4:
   173  					q := generateQuerier(r)
   174  					if conns[q] > 0 {
   175  						uq.removeQuerierConnection(q, time.Now())
   176  						conns[q]--
   177  					}
   178  				case 5:
   179  					q := generateQuerier(r)
   180  					uq.notifyQuerierShutdown(q)
   181  				}
   182  
   183  				assert.NoErrorf(t, isConsistent(uq), "last action %d", i)
   184  			}
   185  		})
   186  	}
   187  }
   188  
   189  func TestQueues_ForgetDelay(t *testing.T) {
   190  	const (
   191  		forgetDelay        = time.Minute
   192  		maxQueriersPerUser = 1
   193  		numUsers           = 100
   194  	)
   195  
   196  	now := time.Now()
   197  	uq := newUserQueues(0, forgetDelay)
   198  	assert.NotNil(t, uq)
   199  	assert.NoError(t, isConsistent(uq))
   200  
   201  	// 3 queriers open 2 connections each.
   202  	for i := 1; i <= 3; i++ {
   203  		uq.addQuerierConnection(fmt.Sprintf("querier-%d", i))
   204  		uq.addQuerierConnection(fmt.Sprintf("querier-%d", i))
   205  	}
   206  
   207  	// Add user queues.
   208  	for i := 0; i < numUsers; i++ {
   209  		userID := fmt.Sprintf("user-%d", i)
   210  		getOrAdd(t, uq, userID, maxQueriersPerUser)
   211  	}
   212  
   213  	// We expect querier-1 to have some users.
   214  	querier1Users := getUsersByQuerier(uq, "querier-1")
   215  	require.NotEmpty(t, querier1Users)
   216  
   217  	// Gracefully shutdown querier-1.
   218  	uq.removeQuerierConnection("querier-1", now.Add(20*time.Second))
   219  	uq.removeQuerierConnection("querier-1", now.Add(21*time.Second))
   220  	uq.notifyQuerierShutdown("querier-1")
   221  
   222  	// We expect querier-1 has been removed.
   223  	assert.NotContains(t, uq.queriers, "querier-1")
   224  	assert.NoError(t, isConsistent(uq))
   225  
   226  	// We expect querier-1 users have been shuffled to other queriers.
   227  	for _, userID := range querier1Users {
   228  		assert.Contains(t, append(getUsersByQuerier(uq, "querier-2"), getUsersByQuerier(uq, "querier-3")...), userID)
   229  	}
   230  
   231  	// Querier-1 reconnects.
   232  	uq.addQuerierConnection("querier-1")
   233  	uq.addQuerierConnection("querier-1")
   234  
   235  	// We expect the initial querier-1 users have got back to querier-1.
   236  	for _, userID := range querier1Users {
   237  		assert.Contains(t, getUsersByQuerier(uq, "querier-1"), userID)
   238  		assert.NotContains(t, getUsersByQuerier(uq, "querier-2"), userID)
   239  		assert.NotContains(t, getUsersByQuerier(uq, "querier-3"), userID)
   240  	}
   241  
   242  	// Querier-1 abruptly terminates (no shutdown notification received).
   243  	uq.removeQuerierConnection("querier-1", now.Add(40*time.Second))
   244  	uq.removeQuerierConnection("querier-1", now.Add(41*time.Second))
   245  
   246  	// We expect querier-1 has NOT been removed.
   247  	assert.Contains(t, uq.queriers, "querier-1")
   248  	assert.NoError(t, isConsistent(uq))
   249  
   250  	// We expect the querier-1 users have not been shuffled to other queriers.
   251  	for _, userID := range querier1Users {
   252  		assert.Contains(t, getUsersByQuerier(uq, "querier-1"), userID)
   253  		assert.NotContains(t, getUsersByQuerier(uq, "querier-2"), userID)
   254  		assert.NotContains(t, getUsersByQuerier(uq, "querier-3"), userID)
   255  	}
   256  
   257  	// Try to forget disconnected queriers, but querier-1 forget delay hasn't passed yet.
   258  	uq.forgetDisconnectedQueriers(now.Add(90 * time.Second))
   259  
   260  	assert.Contains(t, uq.queriers, "querier-1")
   261  	assert.NoError(t, isConsistent(uq))
   262  
   263  	for _, userID := range querier1Users {
   264  		assert.Contains(t, getUsersByQuerier(uq, "querier-1"), userID)
   265  		assert.NotContains(t, getUsersByQuerier(uq, "querier-2"), userID)
   266  		assert.NotContains(t, getUsersByQuerier(uq, "querier-3"), userID)
   267  	}
   268  
   269  	// Try to forget disconnected queriers. This time querier-1 forget delay has passed.
   270  	uq.forgetDisconnectedQueriers(now.Add(105 * time.Second))
   271  
   272  	assert.NotContains(t, uq.queriers, "querier-1")
   273  	assert.NoError(t, isConsistent(uq))
   274  
   275  	// We expect querier-1 users have been shuffled to other queriers.
   276  	for _, userID := range querier1Users {
   277  		assert.Contains(t, append(getUsersByQuerier(uq, "querier-2"), getUsersByQuerier(uq, "querier-3")...), userID)
   278  	}
   279  }
   280  
   281  func TestQueues_ForgetDelay_ShouldCorrectlyHandleQuerierReconnectingBeforeForgetDelayIsPassed(t *testing.T) {
   282  	const (
   283  		forgetDelay        = time.Minute
   284  		maxQueriersPerUser = 1
   285  		numUsers           = 100
   286  	)
   287  
   288  	now := time.Now()
   289  	uq := newUserQueues(0, forgetDelay)
   290  	assert.NotNil(t, uq)
   291  	assert.NoError(t, isConsistent(uq))
   292  
   293  	// 3 queriers open 2 connections each.
   294  	for i := 1; i <= 3; i++ {
   295  		uq.addQuerierConnection(fmt.Sprintf("querier-%d", i))
   296  		uq.addQuerierConnection(fmt.Sprintf("querier-%d", i))
   297  	}
   298  
   299  	// Add user queues.
   300  	for i := 0; i < numUsers; i++ {
   301  		userID := fmt.Sprintf("user-%d", i)
   302  		getOrAdd(t, uq, userID, maxQueriersPerUser)
   303  	}
   304  
   305  	// We expect querier-1 to have some users.
   306  	querier1Users := getUsersByQuerier(uq, "querier-1")
   307  	require.NotEmpty(t, querier1Users)
   308  
   309  	// Querier-1 abruptly terminates (no shutdown notification received).
   310  	uq.removeQuerierConnection("querier-1", now.Add(40*time.Second))
   311  	uq.removeQuerierConnection("querier-1", now.Add(41*time.Second))
   312  
   313  	// We expect querier-1 has NOT been removed.
   314  	assert.Contains(t, uq.queriers, "querier-1")
   315  	assert.NoError(t, isConsistent(uq))
   316  
   317  	// We expect the querier-1 users have not been shuffled to other queriers.
   318  	for _, userID := range querier1Users {
   319  		assert.Contains(t, getUsersByQuerier(uq, "querier-1"), userID)
   320  		assert.NotContains(t, getUsersByQuerier(uq, "querier-2"), userID)
   321  		assert.NotContains(t, getUsersByQuerier(uq, "querier-3"), userID)
   322  	}
   323  
   324  	// Try to forget disconnected queriers, but querier-1 forget delay hasn't passed yet.
   325  	uq.forgetDisconnectedQueriers(now.Add(90 * time.Second))
   326  
   327  	// Querier-1 reconnects.
   328  	uq.addQuerierConnection("querier-1")
   329  	uq.addQuerierConnection("querier-1")
   330  
   331  	assert.Contains(t, uq.queriers, "querier-1")
   332  	assert.NoError(t, isConsistent(uq))
   333  
   334  	// We expect the querier-1 users have not been shuffled to other queriers.
   335  	for _, userID := range querier1Users {
   336  		assert.Contains(t, getUsersByQuerier(uq, "querier-1"), userID)
   337  		assert.NotContains(t, getUsersByQuerier(uq, "querier-2"), userID)
   338  		assert.NotContains(t, getUsersByQuerier(uq, "querier-3"), userID)
   339  	}
   340  
   341  	// Try to forget disconnected queriers far in the future, but there's no disconnected querier.
   342  	uq.forgetDisconnectedQueriers(now.Add(200 * time.Second))
   343  
   344  	assert.Contains(t, uq.queriers, "querier-1")
   345  	assert.NoError(t, isConsistent(uq))
   346  
   347  	for _, userID := range querier1Users {
   348  		assert.Contains(t, getUsersByQuerier(uq, "querier-1"), userID)
   349  		assert.NotContains(t, getUsersByQuerier(uq, "querier-2"), userID)
   350  		assert.NotContains(t, getUsersByQuerier(uq, "querier-3"), userID)
   351  	}
   352  }
   353  
   354  func generateTenant(r *rand.Rand) string {
   355  	return fmt.Sprint("tenant-", r.Int()%5)
   356  }
   357  
   358  func generateQuerier(r *rand.Rand) string {
   359  	return fmt.Sprint("querier-", r.Int()%5)
   360  }
   361  
   362  func getOrAdd(t *testing.T, uq *queues, tenant string, maxQueriers int) chan Request {
   363  	q := uq.getOrAddQueue(tenant, maxQueriers)
   364  	assert.NotNil(t, q)
   365  	assert.NoError(t, isConsistent(uq))
   366  	assert.Equal(t, q, uq.getOrAddQueue(tenant, maxQueriers))
   367  	return q
   368  }
   369  
   370  func confirmOrderForQuerier(t *testing.T, uq *queues, querier string, lastUserIndex int, qs ...chan Request) int {
   371  	var n chan Request
   372  	for _, q := range qs {
   373  		n, _, lastUserIndex = uq.getNextQueueForQuerier(lastUserIndex, querier)
   374  		assert.Equal(t, q, n)
   375  		assert.NoError(t, isConsistent(uq))
   376  	}
   377  	return lastUserIndex
   378  }
   379  
   380  func isConsistent(uq *queues) error {
   381  	if len(uq.sortedQueriers) != len(uq.queriers) {
   382  		return fmt.Errorf("inconsistent number of sorted queriers and querier connections")
   383  	}
   384  
   385  	uc := 0
   386  	for ix, u := range uq.users {
   387  		q := uq.userQueues[u]
   388  		if u != "" && q == nil {
   389  			return fmt.Errorf("user %s doesn't have queue", u)
   390  		}
   391  		if u == "" && q != nil {
   392  			return fmt.Errorf("user %s shouldn't have queue", u)
   393  		}
   394  		if u == "" {
   395  			continue
   396  		}
   397  
   398  		uc++
   399  
   400  		if q.index != ix {
   401  			return fmt.Errorf("invalid user's index, expected=%d, got=%d", ix, q.index)
   402  		}
   403  
   404  		if q.maxQueriers == 0 && q.queriers != nil {
   405  			return fmt.Errorf("user %s has queriers, but maxQueriers=0", u)
   406  		}
   407  
   408  		if q.maxQueriers > 0 && len(uq.sortedQueriers) <= q.maxQueriers && q.queriers != nil {
   409  			return fmt.Errorf("user %s has queriers set despite not enough queriers available", u)
   410  		}
   411  
   412  		if q.maxQueriers > 0 && len(uq.sortedQueriers) > q.maxQueriers && len(q.queriers) != q.maxQueriers {
   413  			return fmt.Errorf("user %s has incorrect number of queriers, expected=%d, got=%d", u, len(q.queriers), q.maxQueriers)
   414  		}
   415  	}
   416  
   417  	if uc != len(uq.userQueues) {
   418  		return fmt.Errorf("inconsistent number of users list and user queues")
   419  	}
   420  
   421  	return nil
   422  }
   423  
   424  // getUsersByQuerier returns the list of users handled by the provided querierID.
   425  func getUsersByQuerier(queues *queues, querierID string) []string {
   426  	var userIDs []string
   427  	for userID, q := range queues.userQueues {
   428  		if q.queriers == nil {
   429  			// If it's nil then all queriers can handle this user.
   430  			userIDs = append(userIDs, userID)
   431  			continue
   432  		}
   433  		if _, ok := q.queriers[querierID]; ok {
   434  			userIDs = append(userIDs, userID)
   435  		}
   436  	}
   437  	return userIDs
   438  }
   439  
   440  func TestShuffleQueriers(t *testing.T) {
   441  	allQueriers := []string{"a", "b", "c", "d", "e"}
   442  
   443  	require.Nil(t, shuffleQueriersForUser(12345, 10, allQueriers, nil))
   444  	require.Nil(t, shuffleQueriersForUser(12345, len(allQueriers), allQueriers, nil))
   445  
   446  	r1 := shuffleQueriersForUser(12345, 3, allQueriers, nil)
   447  	require.Equal(t, 3, len(r1))
   448  
   449  	// Same input produces same output.
   450  	r2 := shuffleQueriersForUser(12345, 3, allQueriers, nil)
   451  	require.Equal(t, 3, len(r2))
   452  	require.Equal(t, r1, r2)
   453  }
   454  
   455  func TestShuffleQueriersCorrectness(t *testing.T) {
   456  	const queriersCount = 100
   457  
   458  	var allSortedQueriers []string
   459  	for i := 0; i < queriersCount; i++ {
   460  		allSortedQueriers = append(allSortedQueriers, fmt.Sprintf("%d", i))
   461  	}
   462  	sort.Strings(allSortedQueriers)
   463  
   464  	r := rand.New(rand.NewSource(time.Now().UnixNano()))
   465  	const tests = 1000
   466  	for i := 0; i < tests; i++ {
   467  		toSelect := r.Intn(queriersCount)
   468  		if toSelect == 0 {
   469  			toSelect = 3
   470  		}
   471  
   472  		selected := shuffleQueriersForUser(r.Int63(), toSelect, allSortedQueriers, nil)
   473  
   474  		require.Equal(t, toSelect, len(selected))
   475  
   476  		sort.Strings(allSortedQueriers)
   477  		prevQuerier := ""
   478  		for _, q := range allSortedQueriers {
   479  			require.True(t, prevQuerier < q, "non-unique querier")
   480  			prevQuerier = q
   481  
   482  			ix := sort.SearchStrings(allSortedQueriers, q)
   483  			require.True(t, ix < len(allSortedQueriers) && allSortedQueriers[ix] == q, "selected querier is not between all queriers")
   484  		}
   485  	}
   486  }