vitess.io/vitess@v0.16.2/go/vt/throttler/demo/throttler_demo.go

vitess.io/vitess@v0.16.2/go/vt/throttler/demo/throttler_demo.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package main
    18  
    19  import (
    20  	"context"
    21  	"math/rand"
    22  	"net/http"
    23  	"sync"
    24  	"testing"
    25  	"time"
    26  
    27  	"github.com/spf13/pflag"
    28  
    29  	"vitess.io/vitess/go/vt/discovery"
    30  	"vitess.io/vitess/go/vt/log"
    31  	"vitess.io/vitess/go/vt/logutil"
    32  	querypb "vitess.io/vitess/go/vt/proto/query"
    33  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    34  	"vitess.io/vitess/go/vt/servenv"
    35  	"vitess.io/vitess/go/vt/throttler"
    36  	"vitess.io/vitess/go/vt/topo"
    37  	"vitess.io/vitess/go/vt/topo/memorytopo"
    38  	"vitess.io/vitess/go/vt/vttablet/grpcqueryservice"
    39  	"vitess.io/vitess/go/vt/vttablet/queryservice/fakes"
    40  	"vitess.io/vitess/go/vt/vttablet/tmclient"
    41  	"vitess.io/vitess/go/vt/wrangler"
    42  	"vitess.io/vitess/go/vt/wrangler/testlib"
    43  )
    44  
    45  // This file contains a demo binary that demonstrates how the resharding
    46  // throttler adapts its throttling rate to the replication lag.
    47  //
    48  // The throttler is necessary because replicas apply transactions at a slower
    49  // rate than primaries and fall behind at high write throughput.
    50  // (Mostly they fall behind because MySQL replication is single-threaded,
    51  //  whereas writes on the primary do not have to be.)
    52  //
    53  // This demo simulates a client (writer), a primary and a replica.
    54  // The client writes to the primary, which in turn replicates everything to
    55  // the replica.
    56  // The replica measures its replication lag via the timestamp which is part of
    57  // each message.
    58  // While the primary has no rate limit, the replica is limited to
    59  // --rate (see below) transactions/second. The client runs the resharding
    60  // throttler which tries to throttle the client based on the observed
    61  // replication lag.
    62  
    63  var (
    64  	rate                     = int64(1000)
    65  	duration                 = 600 * time.Second
    66  	lagUpdateInterval        = 5 * time.Second
    67  	replicaDegrationDuration = 10 * time.Second
    68  	replicaDegrationInterval time.Duration
    69  )
    70  
// flagSetName is the name passed to servenv both when the demo flags are
// registered (servenv.OnParseFor in init) and when they are parsed
// (servenv.ParseFlags in main).
const flagSetName = "throttler_demo"
    72  
    73  func registerDemoFlags(fs *pflag.FlagSet) {
    74  	fs.Int64Var(&rate, "rate", rate, "maximum rate of the throttled demo server at the start")
    75  	fs.DurationVar(&duration, "duration", duration, "total duration the demo runs")
    76  	fs.DurationVar(&lagUpdateInterval, "lag_update_interval", lagUpdateInterval, "interval at which the current replication lag will be broadcast to the throttler")
    77  	fs.DurationVar(&replicaDegrationInterval, "replica_degration_interval", replicaDegrationInterval, "simulate a throughput degration of the replica every X interval (i.e. the replica applies transactions at a slower rate for -reparent_duration and the replication lag might go up)")
    78  	fs.DurationVar(&replicaDegrationDuration, "replica_degration_duration", replicaDegrationDuration, "duration a simulated degration should take")
    79  }
    80  
// primary simulates an *unthrottled* MySQL primary which replicates every
// received "execute" call to a known "replica".
type primary struct {
	// replica is the single replica every executed message is forwarded to.
	replica *replica
}
    86  
    87  // execute is the simulated RPC which is called by the client.
    88  func (m *primary) execute(msg time.Time) {
    89  	m.replica.replicate(msg)
    90  }
    91  
// replica simulates a *throttled* MySQL replica.
// If it cannot keep up with applying the primary writes, it will report a
// replication lag > 0 seconds.
type replica struct {
	// fakeTablet is the fake REPLICA tablet created in newReplica. Its RPC
	// server hosts qs and its Tablet record is what the client health-checks.
	fakeTablet *testlib.FakeTablet
	// qs receives the replication lag values broadcast by
	// processReplicationStream.
	qs *fakes.StreamHealthQueryService

	// replicationStream is the incoming stream of messages from the primary.
	replicationStream chan time.Time

	// throttler is used to enforce the maximum rate at which replica applies
	// transactions. It must not be confused with the client's throttler.
	throttler *throttler.Throttler
	// lastHealthUpdate is the time of the most recent lag broadcast.
	lastHealthUpdate time.Time
	// lagUpdateInterval is the minimum time between two lag broadcasts.
	lagUpdateInterval time.Duration

	// degrationInterval is the pause between two simulated degradations.
	degrationInterval time.Duration
	// degrationDuration is how long one simulated degradation lasts.
	degrationDuration time.Duration
	// nextDegration is when the next degradation may start. The zero value
	// means degradations are disabled.
	nextDegration time.Time
	// currentDegrationEnd is when the running degradation ends. The zero
	// value means no degradation is in progress.
	currentDegrationEnd time.Time

	// stopChan is closed by stop to make processReplicationStream return.
	stopChan chan struct{}
	// wg tracks the processReplicationStream goroutine.
	wg sync.WaitGroup
}
   116  
   117  func newReplica(lagUpdateInterval, degrationInterval, degrationDuration time.Duration, ts *topo.Server) *replica {
   118  	t := &testing.T{}
   119  	wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient())
   120  	fakeTablet := testlib.NewFakeTablet(t, wr, "cell1", 0,
   121  		topodatapb.TabletType_REPLICA, nil, testlib.TabletKeyspaceShard(t, "ks", "-80"))
   122  	fakeTablet.StartActionLoop(t, wr)
   123  
   124  	target := &querypb.Target{
   125  		Keyspace:   "ks",
   126  		Shard:      "-80",
   127  		TabletType: topodatapb.TabletType_REPLICA,
   128  	}
   129  	qs := fakes.NewStreamHealthQueryService(target)
   130  	grpcqueryservice.Register(fakeTablet.RPCServer, qs)
   131  
   132  	throttler, err := throttler.NewThrottler("replica", "TPS", 1, rate, throttler.ReplicationLagModuleDisabled)
   133  	if err != nil {
   134  		log.Fatal(err)
   135  	}
   136  
   137  	var nextDegration time.Time
   138  	if degrationInterval != time.Duration(0) {
   139  		nextDegration = time.Now().Add(degrationInterval)
   140  	}
   141  	r := &replica{
   142  		fakeTablet:        fakeTablet,
   143  		qs:                qs,
   144  		throttler:         throttler,
   145  		replicationStream: make(chan time.Time, 1*1024*1024),
   146  		lagUpdateInterval: lagUpdateInterval,
   147  		degrationInterval: degrationInterval,
   148  		degrationDuration: degrationDuration,
   149  		nextDegration:     nextDegration,
   150  		stopChan:          make(chan struct{}),
   151  	}
   152  	r.wg.Add(1)
   153  	go r.processReplicationStream()
   154  	return r
   155  }
   156  
// replicate enqueues msg on the replica's replication stream. The channel is
// buffered; the send blocks once the buffer is full.
func (r *replica) replicate(msg time.Time) {
	r.replicationStream <- msg
}
   160  
   161  func (r *replica) processReplicationStream() {
   162  	defer r.wg.Done()
   163  
   164  	// actualRate counts the number of requests per r.lagUpdateInterval.
   165  	actualRate := 0
   166  	for msg := range r.replicationStream {
   167  		select {
   168  		case <-r.stopChan:
   169  			return
   170  		default:
   171  		}
   172  
   173  		now := time.Now()
   174  		if now.Sub(r.lastHealthUpdate) > r.lagUpdateInterval {
   175  			// Broadcast current lag every "lagUpdateInterval".
   176  			//
   177  			// Use integer values to calculate the lag. In consequence, the reported
   178  			// lag will constantly vary between the floor and ceil value e.g.
   179  			// an actual lag of 0.5s could be reported as 0s or 1s based on the
   180  			// truncation of the two times.
   181  			lagTruncated := uint32(now.Unix() - msg.Unix())
   182  			// Display lag with a higher precision as well.
   183  			lag := now.Sub(msg).Seconds()
   184  			log.Infof("current lag: %1ds (%1.1fs) replica rate: % 7.1f chan len: % 6d", lagTruncated, lag, float64(actualRate)/r.lagUpdateInterval.Seconds(), len(r.replicationStream))
   185  			r.qs.AddHealthResponseWithReplicationLag(lagTruncated)
   186  			r.lastHealthUpdate = now
   187  			actualRate = 0
   188  		}
   189  		if !r.nextDegration.IsZero() && time.Now().After(r.nextDegration) && r.currentDegrationEnd.IsZero() {
   190  			degradedRate := rand.Int63n(rate)
   191  			log.Infof("degrading the replica for %.f seconds from %v TPS to %v", r.degrationDuration.Seconds(), rate, degradedRate)
   192  			r.throttler.SetMaxRate(degradedRate)
   193  			r.currentDegrationEnd = time.Now().Add(r.degrationDuration)
   194  		}
   195  		if !r.currentDegrationEnd.IsZero() && time.Now().After(r.currentDegrationEnd) {
   196  			log.Infof("degrading the replica stopped. Restoring TPS to: %v", rate)
   197  			r.throttler.SetMaxRate(rate)
   198  			r.currentDegrationEnd = time.Time{}
   199  			r.nextDegration = time.Now().Add(r.degrationInterval)
   200  		}
   201  
   202  		for {
   203  			backoff := r.throttler.Throttle(0 /* threadID */)
   204  			if backoff == throttler.NotThrottled {
   205  				break
   206  			}
   207  			time.Sleep(backoff)
   208  		}
   209  		actualRate++
   210  	}
   211  }
   212  
   213  func (r *replica) stop() {
   214  	close(r.replicationStream)
   215  	close(r.stopChan)
   216  	log.Info("Triggered replica shutdown. Waiting for it to stop.")
   217  	r.wg.Wait()
   218  	r.fakeTablet.StopActionLoop(&testing.T{})
   219  }
   220  
// client simulates a client which should throttle itself based on the
// replication lag of all replicas.
type client struct {
	// primary is the target of every simulated write.
	primary *primary

	// healthCheck watches the replica's tablet; its updates arrive on
	// healthcheckCh.
	healthCheck discovery.HealthCheck
	// throttler limits the client's write rate and is fed the replica's
	// health updates via StatsUpdate.
	throttler *throttler.Throttler

	// stopChan is closed by stop to make loop return.
	stopChan chan struct{}
	// wg tracks the loop goroutine started by run.
	wg sync.WaitGroup
	// healthcheckCh is the subscription channel obtained from
	// healthCheck.Subscribe in newClient.
	healthcheckCh chan *discovery.TabletHealth
}
   233  
   234  func newClient(primary *primary, replica *replica, ts *topo.Server) *client {
   235  	t, err := throttler.NewThrottler("client", "TPS", 1, throttler.MaxRateModuleDisabled, 5 /* seconds */)
   236  	if err != nil {
   237  		log.Fatal(err)
   238  	}
   239  
   240  	healthCheck := discovery.NewHealthCheck(context.Background(), 5*time.Second, 1*time.Minute, ts, "cell1", "")
   241  	c := &client{
   242  		primary:     primary,
   243  		healthCheck: healthCheck,
   244  		throttler:   t,
   245  		stopChan:    make(chan struct{}),
   246  	}
   247  	healthcheckCh := c.healthCheck.Subscribe()
   248  	c.healthcheckCh = healthcheckCh
   249  	c.healthCheck.AddTablet(replica.fakeTablet.Tablet)
   250  	return c
   251  }
   252  
// run starts the client's write loop in a background goroutine and returns
// immediately. The goroutine is registered with c.wg so that stop can wait
// for it to finish.
func (c *client) run() {
	c.wg.Add(1)
	go c.loop()
}
   257  
   258  func (c *client) loop() {
   259  	defer c.wg.Done()
   260  
   261  	for {
   262  		select {
   263  		case <-c.stopChan:
   264  			return
   265  		case th := <-c.healthcheckCh:
   266  			c.StatsUpdate(th)
   267  		default:
   268  		}
   269  
   270  		for {
   271  			backoff := c.throttler.Throttle(0 /* threadID */)
   272  			if backoff == throttler.NotThrottled {
   273  				break
   274  			}
   275  			time.Sleep(backoff)
   276  		}
   277  
   278  		c.primary.execute(time.Now())
   279  	}
   280  }
   281  
// stop signals the write loop to exit and waits for it to finish. Only then
// are the health check and the throttler closed, so the loop never touches
// either of them after it has been closed.
func (c *client) stop() {
	close(c.stopChan)
	c.wg.Wait()

	c.healthCheck.Close()
	c.throttler.Close()
}
   289  
   290  // StatsUpdate gets called by the healthCheck instance every time a tablet broadcasts
   291  // a health update.
   292  func (c *client) StatsUpdate(ts *discovery.TabletHealth) {
   293  	// Ignore unless REPLICA or RDONLY.
   294  	if ts.Target.TabletType != topodatapb.TabletType_REPLICA && ts.Target.TabletType != topodatapb.TabletType_RDONLY {
   295  		return
   296  	}
   297  
   298  	c.throttler.RecordReplicationLag(time.Now(), ts)
   299  }
   300  
   301  func main() {
   302  	servenv.ParseFlags(flagSetName)
   303  
   304  	go servenv.RunDefault()
   305  	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
   306  		http.Redirect(w, r, "/throttlerz", http.StatusTemporaryRedirect)
   307  	})
   308  
   309  	log.Infof("start rate set to: %v", rate)
   310  	ts := memorytopo.NewServer("cell1")
   311  	replica := newReplica(lagUpdateInterval, replicaDegrationInterval, replicaDegrationDuration, ts)
   312  	primary := &primary{replica: replica}
   313  	client := newClient(primary, replica, ts)
   314  	client.run()
   315  
   316  	time.Sleep(duration)
   317  	client.stop()
   318  	replica.stop()
   319  }
   320  
// init calls the servenv flag registration functions this binary relies on
// and arranges for registerDemoFlags to be applied to the flag set named
// flagSetName.
func init() {
	servenv.RegisterDefaultFlags()
	servenv.RegisterFlags()
	servenv.RegisterGRPCServerFlags()
	servenv.RegisterGRPCServerAuthFlags()
	servenv.OnParseFor(flagSetName, registerDemoFlags)
}