vitess.io/vitess@v0.16.2/go/test/endtoend/tabletgateway/buffer/buffer_test_helpers.go (about)

     1  /*
     2  Copyright 2020 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  /*
    18  Test the vtgate buffering.
    19  
    20  During a failover, vtgate should automatically buffer (stall) requests
    21  for a configured time and retry them after the failover is over.
    22  
    23  The test reproduces such a scenario as follows:
    24  - run two threads, the first thread continuously executes a critical read and the second executes a write (UPDATE)
    25  - vtctl PlannedReparentShard runs a failover
    26  - both threads should not see any error during the failover
    27  */
    28  
    29  package buffer
    30  
    31  import (
    32  	"context"
    33  	"encoding/json"
    34  	"fmt"
    35  	"io"
    36  	"math/rand"
    37  	"net/http"
    38  	"sync"
    39  	"testing"
    40  	"time"
    41  
    42  	"vitess.io/vitess/go/test/endtoend/utils"
    43  
    44  	"vitess.io/vitess/go/vt/log"
    45  
    46  	"github.com/stretchr/testify/assert"
    47  	"github.com/stretchr/testify/require"
    48  
    49  	"vitess.io/vitess/go/mysql"
    50  	"vitess.io/vitess/go/test/endtoend/cluster"
    51  )
    52  
    53  const (
    54  	keyspaceUnshardedName = "ks1"
    55  	cell                  = "zone1"
    56  	hostname              = "localhost"
    57  	sqlSchema             = `
    58  	create table buffer(
    59  		id BIGINT NOT NULL,
    60  		msg VARCHAR(64) NOT NULL,
    61  		PRIMARY KEY (id)
    62  	) Engine=InnoDB;`
    63  )
    64  
    65  const (
    66  	criticalReadRowID = 1
    67  	updateRowID       = 2
    68  )
    69  
    70  // threadParams is set of params passed into read and write threads
    71  type threadParams struct {
    72  	quit                       bool
    73  	rpcs                       int        // Number of queries successfully executed.
    74  	errors                     int        // Number of failed queries.
    75  	waitForNotification        chan bool  // Channel used to notify the main thread that this thread executed
    76  	notifyLock                 sync.Mutex // notifyLock guards the two fields notifyAfterNSuccessfulRpcs/rpcsSoFar.
    77  	notifyAfterNSuccessfulRpcs int        // If 0, notifications are disabled
    78  	rpcsSoFar                  int        // Number of RPCs at the time a notification was requested
    79  	index                      int        //
    80  	internalErrs               int
    81  	executeFunction            func(c *threadParams, conn *mysql.Conn) error // Implement the method for read/update.
    82  	typ                        string
    83  	reservedConn               bool
    84  	slowQueries                bool
    85  }
    86  
    87  // Thread which constantly executes a query on vtgate.
    88  func (c *threadParams) threadRun(wg *sync.WaitGroup, vtParams *mysql.ConnParams) {
    89  	defer wg.Done()
    90  
    91  	conn, err := mysql.Connect(context.Background(), vtParams)
    92  	if err != nil {
    93  		log.Errorf("error connecting to mysql with params %v: %v", vtParams, err)
    94  	}
    95  	defer conn.Close()
    96  	if c.reservedConn {
    97  		_, err = conn.ExecuteFetch("set default_week_format = 1", 1000, true)
    98  		if err != nil {
    99  			c.errors++
   100  			log.Errorf("error setting default_week_format: %v", err)
   101  		}
   102  	}
   103  	for !c.quit {
   104  		err = c.executeFunction(c, conn)
   105  		if err != nil {
   106  			c.errors++
   107  			log.Errorf("error executing function %s: %v", c.typ, err)
   108  		}
   109  		c.rpcs++
   110  		// If notifications are requested, check if we already executed the
   111  		// required number of successful RPCs.
   112  		// Use >= instead of == because we can miss the exact point due to
   113  		// slow thread scheduling.
   114  		c.notifyLock.Lock()
   115  		if c.notifyAfterNSuccessfulRpcs != 0 && c.rpcs >= (c.notifyAfterNSuccessfulRpcs+c.rpcsSoFar) {
   116  			c.waitForNotification <- true
   117  			c.notifyAfterNSuccessfulRpcs = 0
   118  		}
   119  		c.notifyLock.Unlock()
   120  		// Wait 10ms seconds between two attempts.
   121  		time.Sleep(10 * time.Millisecond)
   122  	}
   123  }
   124  
   125  func (c *threadParams) ExpectQueries(n int) {
   126  	c.notifyLock.Lock()
   127  	c.notifyAfterNSuccessfulRpcs = n
   128  	c.rpcsSoFar = c.rpcs
   129  	c.notifyLock.Unlock()
   130  }
   131  
   132  func (c *threadParams) stop() {
   133  	c.quit = true
   134  }
   135  
   136  func readExecute(c *threadParams, conn *mysql.Conn) error {
   137  	attempt := c.index
   138  	c.index++
   139  
   140  	sel := "*"
   141  	if c.slowQueries {
   142  		sel = "*, SLEEP(1)"
   143  	}
   144  	qr, err := conn.ExecuteFetch(fmt.Sprintf("SELECT %s FROM buffer WHERE id = %d", sel, criticalReadRowID), 1000, true)
   145  
   146  	if err != nil {
   147  		log.Errorf("select attempt #%d, failed with err: %v", attempt, err)
   148  		// For a reserved connection, read query can fail as it does not go through the gateway and
   149  		// goes to tablet directly and later is directed to use Gateway if the error is caused due to cluster failover operation.
   150  		if c.reservedConn {
   151  			c.internalErrs++
   152  			if c.internalErrs > 1 {
   153  				log.Errorf("More Read Errors: %d", c.internalErrs)
   154  				return err
   155  			}
   156  			log.Error("This is okay once because we do not support buffering it.")
   157  			return nil
   158  		}
   159  		return err
   160  	}
   161  
   162  	log.Infof("select attempt #%d, rows: %d", attempt, len(qr.Rows))
   163  	return nil
   164  }
   165  
   166  func updateExecute(c *threadParams, conn *mysql.Conn) error {
   167  	attempt := c.index
   168  	// Value used in next UPDATE query. Increased after every query.
   169  	c.index++
   170  	conn.ExecuteFetch("begin", 1000, true)
   171  
   172  	result, err := conn.ExecuteFetch(fmt.Sprintf("UPDATE buffer SET msg='update %d' WHERE id = %d", attempt, updateRowID), 1000, true)
   173  
   174  	// Sleep between [0, 1] seconds to prolong the time the transaction is in
   175  	// flight. This is more realistic because applications are going to keep
   176  	// their transactions open for longer as well.
   177  	dur := time.Duration(rand.Int31n(1000)) * time.Millisecond
   178  	if c.slowQueries {
   179  		dur = dur + 1*time.Second
   180  	}
   181  	time.Sleep(dur)
   182  
   183  	if err == nil {
   184  		log.Infof("update attempt #%d affected %v rows", attempt, result.RowsAffected)
   185  		_, err = conn.ExecuteFetch("commit", 1000, true)
   186  		if err != nil {
   187  			log.Errorf("UPDATE #%d failed during COMMIT, err: %v", attempt, err)
   188  			_, errRollback := conn.ExecuteFetch("rollback", 1000, true)
   189  			if errRollback != nil {
   190  				log.Errorf("Error in rollback #%d: %v", attempt, errRollback)
   191  			}
   192  			c.internalErrs++
   193  			if c.internalErrs > 1 {
   194  				log.Errorf("More Commit Errors: %d", c.internalErrs)
   195  				return err
   196  			}
   197  			log.Error("This is okay once because we do not support buffering it.")
   198  		}
   199  		return nil
   200  	}
   201  	log.Errorf("UPDATE #%d failed with err: %v", attempt, err)
   202  	_, errRollback := conn.ExecuteFetch("rollback", 1000, true)
   203  	if errRollback != nil {
   204  		log.Errorf("Error in rollback #%d: %v", attempt, errRollback)
   205  	}
   206  	c.internalErrs++
   207  	if c.internalErrs > 1 {
   208  		log.Errorf("More Rollback Errors: %d", c.internalErrs)
   209  		return err
   210  	}
   211  	log.Error("This is okay once because we do not support buffering it.")
   212  	return nil
   213  }
   214  
   215  func (bt *BufferingTest) createCluster() (*cluster.LocalProcessCluster, int) {
   216  	clusterInstance := cluster.NewCluster(cell, hostname)
   217  
   218  	// Start topo server
   219  	clusterInstance.VtctldExtraArgs = []string{"--remote_operation_timeout", "30s", "--topo_etcd_lease_ttl", "40"}
   220  	if err := clusterInstance.StartTopo(); err != nil {
   221  		return nil, 1
   222  	}
   223  
   224  	// Start keyspace
   225  	keyspace := &cluster.Keyspace{
   226  		Name:      keyspaceUnshardedName,
   227  		SchemaSQL: sqlSchema,
   228  		VSchema:   bt.VSchema,
   229  	}
   230  	clusterInstance.VtTabletExtraArgs = []string{
   231  		"--health_check_interval", "1s",
   232  		"--queryserver-config-transaction-timeout", "20",
   233  	}
   234  	if err := clusterInstance.StartUnshardedKeyspace(*keyspace, 1, false); err != nil {
   235  		return nil, 1
   236  	}
   237  
   238  	clusterInstance.VtGateExtraArgs = []string{
   239  		"--enable_buffer",
   240  		// Long timeout in case failover is slow.
   241  		"--buffer_window", "10m",
   242  		"--buffer_max_failover_duration", "10m",
   243  		"--buffer_min_time_between_failovers", "20m",
   244  		"--buffer_implementation", "keyspace_events",
   245  		"--tablet_refresh_interval", "1s",
   246  	}
   247  	clusterInstance.VtGateExtraArgs = append(clusterInstance.VtGateExtraArgs, bt.VtGateExtraArgs...)
   248  
   249  	// Start vtgate
   250  	clusterInstance.VtGatePlannerVersion = 0
   251  	if err := clusterInstance.StartVtgate(); err != nil {
   252  		return nil, 1
   253  	}
   254  	rand.Seed(time.Now().UnixNano())
   255  	return clusterInstance, 0
   256  }
   257  
   258  type QueryEngine interface {
   259  	ExpectQueries(count int)
   260  }
   261  
   262  type BufferingTest struct {
   263  	Assert   func(t *testing.T, shard string, stats *VTGateBufferingStats)
   264  	Failover func(t *testing.T, cluster *cluster.LocalProcessCluster, keyspace string, reads, writes QueryEngine)
   265  
   266  	ReserveConn bool
   267  	SlowQueries bool
   268  
   269  	VSchema         string
   270  	VtGateExtraArgs []string
   271  
   272  	wg sync.WaitGroup
   273  }
   274  
   275  func (bt *BufferingTest) Test(t *testing.T) {
   276  	defer cluster.PanicHandler(t)
   277  	clusterInstance, exitCode := bt.createCluster()
   278  	if exitCode != 0 {
   279  		t.Fatal("failed to start cluster")
   280  	}
   281  	defer clusterInstance.Teardown()
   282  
   283  	vtParams := mysql.ConnParams{
   284  		Host: clusterInstance.Hostname,
   285  		Port: clusterInstance.VtgateMySQLPort,
   286  	}
   287  
   288  	// Healthcheck interval on tablet is set to 1s, so sleep for 2s
   289  	time.Sleep(2 * time.Second)
   290  	conn, err := mysql.Connect(context.Background(), &vtParams)
   291  	require.NoError(t, err)
   292  	defer conn.Close()
   293  
   294  	// Insert two rows for the later threads (critical read, update).
   295  	utils.Exec(t, conn, fmt.Sprintf("INSERT INTO buffer (id, msg) VALUES (%d, %s)", criticalReadRowID, "'critical read'"))
   296  	utils.Exec(t, conn, fmt.Sprintf("INSERT INTO buffer (id, msg) VALUES (%d, %s)", updateRowID, "'update'"))
   297  
   298  	// Start both threads.
   299  	readThreadInstance := &threadParams{
   300  		index:               1,
   301  		typ:                 "read",
   302  		executeFunction:     readExecute,
   303  		waitForNotification: make(chan bool),
   304  		reservedConn:        bt.ReserveConn,
   305  		slowQueries:         bt.SlowQueries,
   306  	}
   307  	bt.wg.Add(1)
   308  	go readThreadInstance.threadRun(&bt.wg, &vtParams)
   309  	updateThreadInstance := &threadParams{
   310  		index:               1,
   311  		typ:                 "write",
   312  		executeFunction:     updateExecute,
   313  		waitForNotification: make(chan bool),
   314  		reservedConn:        bt.ReserveConn,
   315  		slowQueries:         bt.SlowQueries,
   316  	}
   317  	bt.wg.Add(1)
   318  	go updateThreadInstance.threadRun(&bt.wg, &vtParams)
   319  
   320  	// Verify they got at least 2 RPCs through.
   321  	readThreadInstance.ExpectQueries(2)
   322  	updateThreadInstance.ExpectQueries(2)
   323  
   324  	<-readThreadInstance.waitForNotification
   325  	<-updateThreadInstance.waitForNotification
   326  
   327  	bt.Failover(t, clusterInstance, keyspaceUnshardedName, readThreadInstance, updateThreadInstance)
   328  
   329  	timeout := time.After(120 * time.Second)
   330  	select {
   331  	case <-readThreadInstance.waitForNotification:
   332  	case <-timeout:
   333  		timeout = time.After(100 * time.Millisecond)
   334  		log.Error("failed to get read thread notification")
   335  	}
   336  	select {
   337  	case <-updateThreadInstance.waitForNotification:
   338  	case <-timeout:
   339  		log.Error("failed to get update thread notification")
   340  	}
   341  
   342  	// Stop threads
   343  	readThreadInstance.stop()
   344  	updateThreadInstance.stop()
   345  
   346  	// Both threads must not see any error
   347  	assert.Zero(t, readThreadInstance.errors, "found errors in read queries")
   348  	assert.Zero(t, updateThreadInstance.errors, "found errors in tx queries")
   349  
   350  	//At least one thread should have been buffered.
   351  	//This may fail if a failover is too fast. Add retries then.
   352  	resp, err := http.Get(clusterInstance.VtgateProcess.VerifyURL)
   353  	require.NoError(t, err)
   354  	defer resp.Body.Close()
   355  
   356  	require.Equal(t, 200, resp.StatusCode)
   357  
   358  	var metadata VTGateBufferingStats
   359  	respByte, err := io.ReadAll(resp.Body)
   360  	require.NoError(t, err)
   361  	err = json.Unmarshal(respByte, &metadata)
   362  	require.NoError(t, err)
   363  
   364  	label := fmt.Sprintf("%s.%s", keyspaceUnshardedName, "0")
   365  	if metadata.BufferLastRequestsInFlightMax[label] == 0 {
   366  		// Missed buffering is okay when we observed the failover during the
   367  		// COMMIT (which cannot trigger the buffering).
   368  		assert.Greater(t, updateThreadInstance.internalErrs, 0, "No buffering took place and the update thread saw no error during COMMIT. But one of it must happen.")
   369  	} else {
   370  		bt.Assert(t, label, &metadata)
   371  	}
   372  
   373  	bt.wg.Wait()
   374  }
   375  
   376  type VTGateBufferingStats struct {
   377  	BufferLastRequestsInFlightMax map[string]int
   378  	HealthcheckPrimaryPromoted    map[string]int
   379  	BufferFailoverDurationSumMs   map[string]int
   380  	BufferRequestsBuffered        map[string]int
   381  	BufferStops                   map[string]int
   382  }