github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/rebalance_load.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"context"
	gosql "database/sql"
	"fmt"
	"sort"
	"strings"
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"golang.org/x/sync/errgroup"
)

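// registerRebalanceLoad registers the rebalance/by-load roachtests, which
// verify that load-based rebalancing evenly spreads leases (and, in the
// replicas variant, replicas) across the nodes of a cluster.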
func registerRebalanceLoad(r *testRegistry) {
	// This test creates a single table for kv to use and splits the table to
	// have one range for every node in the cluster. Because even brand new
	// clusters start with 20+ ranges in them, the number of new ranges in kv's
	// table is small enough that it typically won't trigger rebalancing of
	// leases in the cluster based on lease count alone. We let kv generate a
	// lot of load against the ranges such that, when
	// kv.allocator.load_based_rebalancing is enabled, we'd expect load-based
	// rebalancing to distribute the load evenly across the nodes in the
	// cluster. Without that setting, the fact that the kv table has so few
	// ranges means that they probabilistically won't have their leases evenly
	// spread across all the nodes (they'll often just end up staying on n1).
	//
	// In other words, this test should always pass with
	// kv.allocator.load_based_rebalancing enabled, while it should usually
	// (but not always) fail with it disabled.
	rebalanceLoadRun := func(
		ctx context.Context,
		t *test,
		c *cluster,
		rebalanceMode string,
		maxDuration time.Duration,
		concurrency int,
	) {
		roachNodes := c.Range(1, c.spec.NodeCount-1)
		appNode := c.Node(c.spec.NodeCount)
		splits := len(roachNodes) - 1 // n-1 splits => n ranges => 1 lease per node

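		// Stage the cockroach binary on the CockroachDB nodes and start them
		// with verbose logging enabled for the rebalancing-related components.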
		c.Put(ctx, cockroach, "./cockroach", roachNodes)
		args := startArgs(
			"--args=--vmodule=store_rebalancer=5,allocator=5,allocator_scorer=5,replicate_queue=5")
		c.Start(ctx, t, roachNodes, args)

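		// Stage the workload binary on the dedicated load-generator node and
		// create the kv table with enough splits for one range per node.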
		c.Put(ctx, workload, "./workload", appNode)
		c.Run(ctx, appNode, fmt.Sprintf("./workload init kv --drop --splits=%d {pgurl:1}", splits))

		var m *errgroup.Group // see comment in version.go
		m, ctx = errgroup.WithContext(ctx)

		// Enable us to exit the workload early once we achieve the desired
		// lease balance. This drastically shortens the duration of the test in
		// the common case.
		ctx, cancel := context.WithCancel(ctx)

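		// Worker 1: drive a read-heavy kv workload against all CockroachDB
		// nodes for up to maxDuration.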
		m.Go(func() error {
			t.l.Printf("starting load generator\n")

			err := c.RunE(ctx, appNode, fmt.Sprintf(
				"./workload run kv --read-percent=95 --tolerate-errors --concurrency=%d "+
					"--duration=%v {pgurl:1-%d}",
				concurrency, maxDuration, len(roachNodes)))
			if errors.Is(ctx.Err(), context.Canceled) {
				// We got canceled either because lease balance was achieved or the
				// other worker hit an error. In either case, it's not this worker's
				// fault.
				return nil
			}
			return err
		})

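		// Worker 2: watch the lease distribution and cancel the workload early
		// once the leases are evenly spread across the nodes.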
		m.Go(func() error {
			t.Status("checking for lease balance")

			db := c.Conn(ctx, 1)
			defer db.Close()

			t.Status("disable load based splitting")
			if err := disableLoadBasedSplitting(ctx, db); err != nil {
				return err
			}

			if _, err := db.ExecContext(
				ctx, `SET CLUSTER SETTING kv.allocator.load_based_rebalancing=$1::string`, rebalanceMode,
			); err != nil {
				return err
			}

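			// Poll the lease distribution every five seconds until it is even
			// or maxDuration elapses.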
			for tBegin := timeutil.Now(); timeutil.Since(tBegin) <= maxDuration; {
				if done, err := isLoadEvenlyDistributed(t.l, db, len(roachNodes)); err != nil {
					return err
				} else if done {
					t.Status("successfully achieved lease balance; waiting for kv to finish running")
					cancel()
					return nil
				}

				select {
				case <-ctx.Done():
					return ctx.Err()
				case <-time.After(5 * time.Second):
				}
			}

			return fmt.Errorf("timed out before leases were evenly spread")
		})
		if err := m.Wait(); err != nil {
			t.Fatal(err)
		}
	}


	concurrency := 128

	r.Add(testSpec{
		Name:       `rebalance/by-load/leases`,
		Owner:      OwnerKV,
		Cluster:    makeClusterSpec(4), // the last node is just used to generate load
		MinVersion: "v2.1.0",
		Run: func(ctx context.Context, t *test, c *cluster) {
			if local {
				concurrency = 32
				fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
			}
			rebalanceLoadRun(ctx, t, c, "leases", 3*time.Minute, concurrency)
		},
	})
	r.Add(testSpec{
		Name:       `rebalance/by-load/replicas`,
		Owner:      OwnerKV,
		Cluster:    makeClusterSpec(7), // the last node is just used to generate load
		MinVersion: "v2.1.0",
		Run: func(ctx context.Context, t *test, c *cluster) {
			if local {
				concurrency = 32
				fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
			}
			rebalanceLoadRun(ctx, t, c, "leases and replicas", 5*time.Minute, concurrency)
		},
	})
}

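// isLoadEvenlyDistributed returns true once every node holds a lease for the
// kv table and the per-store lease counts differ by at most one, logging the
// current lease distribution along the way.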
func isLoadEvenlyDistributed(l *logger, db *gosql.DB, numNodes int) (bool, error) {
	rows, err := db.Query(
		`select lease_holder, count(*) ` +
			`from [show ranges from table kv.kv] ` +
			`group by lease_holder;`)
	if err != nil {
		// TODO(rafi): Remove experimental_ranges query once we stop testing 19.1 or
		// earlier.
		if strings.Contains(err.Error(), "syntax error at or near \"ranges\"") {
			rows, err = db.Query(
				`select lease_holder, count(*) ` +
					`from [show experimental_ranges from table kv.kv] ` +
					`group by lease_holder;`)
		}
	}
	if err != nil {
		return false, err
	}
	defer rows.Close()
	leaseCounts := make(map[int]int)
	var rangeCount int
	for rows.Next() {
		var storeID, leaseCount int
		if err := rows.Scan(&storeID, &leaseCount); err != nil {
			return false, err
		}
		leaseCounts[storeID] = leaseCount
		rangeCount += leaseCount
	}

	if len(leaseCounts) < numNodes {
		l.Printf("not all nodes have a lease yet: %v\n", formatLeaseCounts(leaseCounts))
		return false, nil
	}

	// The simple case is when ranges haven't split. We can require that every
	// store has one lease.
	if rangeCount == numNodes {
		for _, leaseCount := range leaseCounts {
			if leaseCount != 1 {
				l.Printf("uneven lease distribution: %s\n", formatLeaseCounts(leaseCounts))
				return false, nil
			}
		}
		l.Printf("leases successfully distributed: %s\n", formatLeaseCounts(leaseCounts))
		return true, nil
	}

	// For completeness, if the ranges have split, verify that the lease counts
	// per store don't differ by more than 1.
	leases := make([]int, 0, numNodes)
	for _, leaseCount := range leaseCounts {
		leases = append(leases, leaseCount)
	}
	sort.Ints(leases)
	if leases[0]+1 < leases[len(leases)-1] {
		l.Printf("leases per store differ by more than one: %s\n", formatLeaseCounts(leaseCounts))
		return false, nil
	}

	l.Printf("leases successfully distributed: %s\n", formatLeaseCounts(leaseCounts))
	return true, nil
}

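// formatLeaseCounts renders a store-to-lease-count map as a deterministic,
// human-readable string, e.g. "[s1: 2, s2: 1, s3: 1]".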
func formatLeaseCounts(counts map[int]int) string {
	storeIDs := make([]int, 0, len(counts))
	for storeID := range counts {
		storeIDs = append(storeIDs, storeID)
	}
	sort.Ints(storeIDs)
	strs := make([]string, 0, len(counts))
	for _, storeID := range storeIDs {
		strs = append(strs, fmt.Sprintf("s%d: %d", storeID, counts[storeID]))
	}
	return fmt.Sprintf("[%s]", strings.Join(strs, ", "))
}